diff options
-rw-r--r-- | arch/x86/xen/mmu.c | 3 | ||||
-rw-r--r-- | drivers/tty/hvc/hvc_xen.c | 2 | ||||
-rw-r--r-- | drivers/xen/Kconfig | 10 | ||||
-rw-r--r-- | drivers/xen/Makefile | 6 | ||||
-rw-r--r-- | drivers/xen/balloon.c | 359 | ||||
-rw-r--r-- | drivers/xen/gntalloc.c | 545 | ||||
-rw-r--r-- | drivers/xen/gntdev.c | 382 | ||||
-rw-r--r-- | drivers/xen/grant-table.c | 10 | ||||
-rw-r--r-- | drivers/xen/xen-balloon.c | 256 | ||||
-rw-r--r-- | include/xen/balloon.h | 25 | ||||
-rw-r--r-- | include/xen/gntalloc.h | 82 | ||||
-rw-r--r-- | include/xen/gntdev.h | 31 |
12 files changed, 1325 insertions, 386 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f6f334..5695fa6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -79,8 +79,7 @@ /* * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. + * Also protects non-atomic updates of current_pages and balloon lists. */ DEFINE_SPINLOCK(xen_reservation_lock); diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index 3740e32..c35f1a7 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -177,6 +177,8 @@ static int __init xen_hvc_init(void) } if (xencons_irq < 0) xencons_irq = 0; /* NO_IRQ */ + else + set_irq_noprobe(xencons_irq); hp = hvc_alloc(HVC_COOKIE, xencons_irq, ops, 256); if (IS_ERR(hp)) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 07bec09..a59638b 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -76,10 +76,20 @@ config XEN_XENBUS_FRONTEND config XEN_GNTDEV tristate "userspace grant access device driver" depends on XEN + default m select MMU_NOTIFIER help Allows userspace processes to use grants. +config XEN_GRANT_DEV_ALLOC + tristate "User-space grant reference allocator driver" + depends on XEN + default m + help + Allows userspace processes to create pages with access granted + to other domains. This can be used to implement frontend drivers + or as part of an inter-domain shared memory channel. + config XEN_PLATFORM_PCI tristate "xen platform pci device driver" depends on XEN_PVHVM && PCI diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 5088cc2..f420f1f 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -1,4 +1,4 @@ -obj-y += grant-table.o features.o events.o manage.o +obj-y += grant-table.o features.o events.o manage.o balloon.o obj-y += xenbus/ nostackp := $(call cc-option, -fno-stack-protector) @@ -7,9 +7,10 @@ CFLAGS_features.o := $(nostackp) obj-$(CONFIG_BLOCK) += biomerge.o obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o -obj-$(CONFIG_XEN_BALLOON) += balloon.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o +obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o @@ -18,5 +19,6 @@ obj-$(CONFIG_XEN_DOM0) += pci.o xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o +xen-gntalloc-y := gntalloc.o xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 718050a..043af8a 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -1,6 +1,4 @@ /****************************************************************************** - * balloon.c - * * Xen balloon driver - enables returning/claiming memory to/from Xen. * * Copyright (c) 2003, B Dragovic @@ -33,7 +31,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> @@ -42,13 +39,11 @@ #include <linux/highmem.h> #include <linux/mutex.h> #include <linux/list.h> -#include <linux/sysdev.h> #include <linux/gfp.h> #include <asm/page.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> -#include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/e820.h> @@ -58,35 +53,29 @@ #include <xen/xen.h> #include <xen/interface/xen.h> #include <xen/interface/memory.h> -#include <xen/xenbus.h> +#include <xen/balloon.h> #include <xen/features.h> #include <xen/page.h> -#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) - -#define BALLOON_CLASS_NAME "xen_memory" +/* + * balloon_process() state: + * + * BP_DONE: done or nothing to do, + * BP_EAGAIN: error, go to sleep, + * BP_ECANCELED: error, balloon operation canceled. + */ -struct balloon_stats { - /* We aim for 'current allocation' == 'target allocation'. */ - unsigned long current_pages; - unsigned long target_pages; - /* - * Drivers may alter the memory reservation independently, but they - * must inform the balloon driver so we avoid hitting the hard limit. - */ - unsigned long driver_pages; - /* Number of pages in high- and low-memory balloons. */ - unsigned long balloon_low; - unsigned long balloon_high; +enum bp_state { + BP_DONE, + BP_EAGAIN, + BP_ECANCELED }; -static DEFINE_MUTEX(balloon_mutex); - -static struct sys_device balloon_sysdev; -static int register_balloon(struct sys_device *sysdev); +static DEFINE_MUTEX(balloon_mutex); -static struct balloon_stats balloon_stats; +struct balloon_stats balloon_stats; +EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; @@ -104,8 +93,7 @@ static LIST_HEAD(ballooned_pages); /* Main work function, always executed in process context. */ static void balloon_process(struct work_struct *work); -static DECLARE_WORK(balloon_worker, balloon_process); -static struct timer_list balloon_timer; +static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); /* When ballooning out (allocating memory to return to Xen) we don't really want the kernel to try too hard since that can trigger the oom killer. */ @@ -140,14 +128,17 @@ static void balloon_append(struct page *page) } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ -static struct page *balloon_retrieve(void) +static struct page *balloon_retrieve(bool prefer_highmem) { struct page *page; if (list_empty(&ballooned_pages)) return NULL; - page = list_entry(ballooned_pages.next, struct page, lru); + if (prefer_highmem) + page = list_entry(ballooned_pages.prev, struct page, lru); + else + page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); if (PageHighMem(page)) { @@ -177,9 +168,29 @@ static struct page *balloon_next_page(struct page *page) return list_entry(next, struct page, lru); } -static void balloon_alarm(unsigned long unused) +static enum bp_state update_schedule(enum bp_state state) { - schedule_work(&balloon_worker); + if (state == BP_DONE) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_DONE; + } + + ++balloon_stats.retry_count; + + if (balloon_stats.max_retry_count != RETRY_UNLIMITED && + balloon_stats.retry_count > balloon_stats.max_retry_count) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_ECANCELED; + } + + balloon_stats.schedule_delay <<= 1; + + if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) + balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; + + return BP_EAGAIN; } static unsigned long current_target(void) @@ -194,11 +205,11 @@ static unsigned long current_target(void) return target; } -static int increase_reservation(unsigned long nr_pages) +static enum bp_state increase_reservation(unsigned long nr_pages) { + int rc; unsigned long pfn, i; struct page *page; - long rc; struct xen_memory_reservation reservation = { .address_bits = 0, .extent_order = 0, @@ -210,7 +221,10 @@ static int increase_reservation(unsigned long nr_pages) page = balloon_first_page(); for (i = 0; i < nr_pages; i++) { - BUG_ON(page == NULL); + if (!page) { + nr_pages = i; + break; + } frame_list[i] = page_to_pfn(page); page = balloon_next_page(page); } @@ -218,11 +232,11 @@ static int increase_reservation(unsigned long nr_pages) set_xen_guest_handle(reservation.extent_start, frame_list); reservation.nr_extents = nr_pages; rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc < 0) - goto out; + if (rc <= 0) + return BP_EAGAIN; for (i = 0; i < rc; i++) { - page = balloon_retrieve(); + page = balloon_retrieve(false); BUG_ON(page == NULL); pfn = page_to_pfn(page); @@ -249,15 +263,14 @@ static int increase_reservation(unsigned long nr_pages) balloon_stats.current_pages += rc; - out: - return rc < 0 ? rc : rc != nr_pages; + return BP_DONE; } -static int decrease_reservation(unsigned long nr_pages) +static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) { + enum bp_state state = BP_DONE; unsigned long pfn, i; struct page *page; - int need_sleep = 0; int ret; struct xen_memory_reservation reservation = { .address_bits = 0, @@ -269,9 +282,9 @@ static int decrease_reservation(unsigned long nr_pages) nr_pages = ARRAY_SIZE(frame_list); for (i = 0; i < nr_pages; i++) { - if ((page = alloc_page(GFP_BALLOON)) == NULL) { + if ((page = alloc_page(gfp)) == NULL) { nr_pages = i; - need_sleep = 1; + state = BP_EAGAIN; break; } @@ -307,7 +320,7 @@ static int decrease_reservation(unsigned long nr_pages) balloon_stats.current_pages -= nr_pages; - return need_sleep; + return state; } /* @@ -318,77 +331,101 @@ static int decrease_reservation(unsigned long nr_pages) */ static void balloon_process(struct work_struct *work) { - int need_sleep = 0; + enum bp_state state = BP_DONE; long credit; mutex_lock(&balloon_mutex); do { credit = current_target() - balloon_stats.current_pages; + if (credit > 0) - need_sleep = (increase_reservation(credit) != 0); + state = increase_reservation(credit); + if (credit < 0) - need_sleep = (decrease_reservation(-credit) != 0); + state = decrease_reservation(-credit, GFP_BALLOON); + + state = update_schedule(state); #ifndef CONFIG_PREEMPT if (need_resched()) schedule(); #endif - } while ((credit != 0) && !need_sleep); + } while (credit && state == BP_DONE); /* Schedule more work if there is some still to be done. */ - if (current_target() != balloon_stats.current_pages) - mod_timer(&balloon_timer, jiffies + HZ); + if (state == BP_EAGAIN) + schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); mutex_unlock(&balloon_mutex); } /* Resets the Xen limit, sets new target, and kicks off processing. */ -static void balloon_set_new_target(unsigned long target) +void balloon_set_new_target(unsigned long target) { /* No need for lock. Not read-modify-write updates. */ balloon_stats.target_pages = target; - schedule_work(&balloon_worker); + schedule_delayed_work(&balloon_worker, 0); } +EXPORT_SYMBOL_GPL(balloon_set_new_target); -static struct xenbus_watch target_watch = -{ - .node = "memory/target" -}; - -/* React to a change in the target key */ -static void watch_target(struct xenbus_watch *watch, - const char **vec, unsigned int len) +/** + * alloc_xenballooned_pages - get pages that have been ballooned out + * @nr_pages: Number of pages to get + * @pages: pages returned + * @return 0 on success, error otherwise + */ +int alloc_xenballooned_pages(int nr_pages, struct page** pages) { - unsigned long long new_target; - int err; - - err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); - if (err != 1) { - /* This is ok (for domain0 at least) - so just return */ - return; + int pgno = 0; + struct page* page; + mutex_lock(&balloon_mutex); + while (pgno < nr_pages) { + page = balloon_retrieve(true); + if (page) { + pages[pgno++] = page; + } else { + enum bp_state st; + st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (st != BP_DONE) + goto out_undo; + } } - - /* The given memory/target value is in KiB, so it needs converting to - * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. - */ - balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); + mutex_unlock(&balloon_mutex); + return 0; + out_undo: + while (pgno) + balloon_append(pages[--pgno]); + /* Free the memory back to the kernel soon */ + schedule_delayed_work(&balloon_worker, 0); + mutex_unlock(&balloon_mutex); + return -ENOMEM; } +EXPORT_SYMBOL(alloc_xenballooned_pages); -static int balloon_init_watcher(struct notifier_block *notifier, - unsigned long event, - void *data) +/** + * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void free_xenballooned_pages(int nr_pages, struct page** pages) { - int err; + int i; - err = register_xenbus_watch(&target_watch); - if (err) - printk(KERN_ERR "Failed to set balloon watcher\n"); + mutex_lock(&balloon_mutex); - return NOTIFY_DONE; -} + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + balloon_append(pages[i]); + } + + /* The balloon may be too large now. Shrink it if needed. */ + if (current_target() != balloon_stats.current_pages) + schedule_delayed_work(&balloon_worker, 0); -static struct notifier_block xenstore_notifier; + mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL(free_xenballooned_pages); static int __init balloon_init(void) { @@ -398,7 +435,7 @@ static int __init balloon_init(void) if (!xen_domain()) return -ENODEV; - pr_info("xen_balloon: Initialising balloon driver.\n"); + pr_info("xen/balloon: Initialising balloon driver.\n"); if (xen_pv_domain()) nr_pages = xen_start_info->nr_pages; @@ -408,13 +445,11 @@ static int __init balloon_init(void) balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; - balloon_stats.driver_pages = 0UL; - - init_timer(&balloon_timer); - balloon_timer.data = 0; - balloon_timer.function = balloon_alarm; - register_balloon(&balloon_sysdev); + balloon_stats.schedule_delay = 1; + balloon_stats.max_schedule_delay = 32; + balloon_stats.retry_count = 1; + balloon_stats.max_retry_count = RETRY_UNLIMITED; /* * Initialise the balloon with excess memory space. We need @@ -436,153 +471,9 @@ static int __init balloon_init(void) __balloon_append(page); } - target_watch.callback = watch_target; - xenstore_notifier.notifier_call = balloon_init_watcher; - - register_xenstore_notifier(&xenstore_notifier); - return 0; } subsys_initcall(balloon_init); -static void balloon_exit(void) -{ - /* XXX - release balloon here */ - return; -} - -module_exit(balloon_exit); - -#define BALLOON_SHOW(name, format, args...) \ - static ssize_t show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ - { \ - return sprintf(buf, format, ##args); \ - } \ - static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) - -BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); -BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); -BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); -BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(balloon_stats.driver_pages)); - -static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); -} - -static ssize_t store_target_kb(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, - show_target_kb, store_target_kb); - - -static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, - char *buf) -{ - return sprintf(buf, "%llu\n", - (unsigned long long)balloon_stats.target_pages - << PAGE_SHIFT); -} - -static ssize_t store_target(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, - size_t count) -{ - char *endchar; - unsigned long long target_bytes; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - target_bytes = memparse(buf, &endchar); - - balloon_set_new_target(target_bytes >> PAGE_SHIFT); - - return count; -} - -static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, - show_target, store_target); - - -static struct sysdev_attribute *balloon_attrs[] = { - &attr_target_kb, - &attr_target, -}; - -static struct attribute *balloon_info_attrs[] = { - &attr_current_kb.attr, - &attr_low_kb.attr, - &attr_high_kb.attr, - &attr_driver_kb.attr, - NULL -}; - -static struct attribute_group balloon_info_group = { - .name = "info", - .attrs = balloon_info_attrs, -}; - -static struct sysdev_class balloon_sysdev_class = { - .name = BALLOON_CLASS_NAME, -}; - -static int register_balloon(struct sys_device *sysdev) -{ - int i, error; - - error = sysdev_class_register(&balloon_sysdev_class); - if (error) - return error; - - sysdev->id = 0; - sysdev->cls = &balloon_sysdev_class; - - error = sysdev_register(sysdev); - if (error) { - sysdev_class_unregister(&balloon_sysdev_class); - return error; - } - - for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { - error = sysdev_create_file(sysdev, balloon_attrs[i]); - if (error) - goto fail; - } - - error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); - if (error) - goto fail; - - return 0; - - fail: - while (--i >= 0) - sysdev_remove_file(sysdev, balloon_attrs[i]); - sysdev_unregister(sysdev); - sysdev_class_unregister(&balloon_sysdev_class); - return error; -} - MODULE_LICENSE("GPL"); diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c new file mode 100644 index 0000000..a7ffdfe --- /dev/null +++ b/drivers/xen/gntalloc.c @@ -0,0 +1,545 @@ +/****************************************************************************** + * gntalloc.c + * + * Device for creating grant references (in user-space) that may be shared + * with other domains. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * This driver exists to allow userspace programs in Linux to allocate kernel + * memory that will later be shared with another domain. Without this device, + * Linux userspace programs cannot create grant references. + * + * How this stuff works: + * X -> granting a page to Y + * Y -> mapping the grant from X + * + * 1. X uses the gntalloc device to allocate a page of kernel memory, P. + * 2. X creates an entry in the grant table that says domid(Y) can access P. + * This is done without a hypercall unless the grant table needs expansion. + * 3. X gives the grant reference identifier, GREF, to Y. + * 4. Y maps the page, either directly into kernel memory for use in a backend + * driver, or via a the gntdev device to map into the address space of an + * application running in Y. This is the first point at which Xen does any + * tracking of the page. + * 5. A program in X mmap()s a segment of the gntalloc device that corresponds + * to the shared page, and can now communicate with Y over the shared page. + * + * + * NOTE TO USERSPACE LIBRARIES: + * The grant allocation and mmap()ing are, naturally, two separate operations. + * You set up the sharing by calling the create ioctl() and then the mmap(). + * Teardown requires munmap() and either close() or ioctl(). + * + * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant + * reference, this device can be used to consume kernel memory by leaving grant + * references mapped by another domain when an application exits. Therefore, + * there is a global limit on the number of pages that can be allocated. When + * all references to the page are unmapped, it will be freed during the next + * grant operation. + */ + +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/gntalloc.h> +#include <xen/events.h> + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by " + "the gntalloc device"); + +static LIST_HEAD(gref_list); +static DEFINE_SPINLOCK(gref_lock); +static int gref_size; + +struct notify_info { + uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */ + uint16_t flags:2; /* Bits 12-13: Unmap notification flags */ + int event; /* Port (event channel) to notify */ +}; + +/* Metadata on a grant reference. */ +struct gntalloc_gref { + struct list_head next_gref; /* list entry gref_list */ + struct list_head next_file; /* list entry file->list, if open */ + struct page *page; /* The shared page */ + uint64_t file_index; /* File offset for mmap() */ + unsigned int users; /* Use count - when zero, waiting on Xen */ + grant_ref_t gref_id; /* The grant reference number */ + struct notify_info notify; /* Unmap notification */ +}; + +struct gntalloc_file_private_data { + struct list_head list; + uint64_t index; +}; + +static void __del_gref(struct gntalloc_gref *gref); + +static void do_cleanup(void) +{ + struct gntalloc_gref *gref, *n; + list_for_each_entry_safe(gref, n, &gref_list, next_gref) { + if (!gref->users) + __del_gref(gref); + } +} + +static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, + uint32_t *gref_ids, struct gntalloc_file_private_data *priv) +{ + int i, rc, readonly; + LIST_HEAD(queue_gref); + LIST_HEAD(queue_file); + struct gntalloc_gref *gref; + + readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE); + rc = -ENOMEM; + for (i = 0; i < op->count; i++) { + gref = kzalloc(sizeof(*gref), GFP_KERNEL); + if (!gref) + goto undo; + list_add_tail(&gref->next_gref, &queue_gref); + list_add_tail(&gref->next_file, &queue_file); + gref->users = 1; + gref->file_index = op->index + i * PAGE_SIZE; + gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!gref->page) + goto undo; + + /* Grant foreign access to the page. */ + gref->gref_id = gnttab_grant_foreign_access(op->domid, + pfn_to_mfn(page_to_pfn(gref->page)), readonly); + if (gref->gref_id < 0) { + rc = gref->gref_id; + goto undo; + } + gref_ids[i] = gref->gref_id; + } + + /* Add to gref lists. */ + spin_lock(&gref_lock); + list_splice_tail(&queue_gref, &gref_list); + list_splice_tail(&queue_file, &priv->list); + spin_unlock(&gref_lock); + + return 0; + +undo: + spin_lock(&gref_lock); + gref_size -= (op->count - i); + + list_for_each_entry(gref, &queue_file, next_file) { + /* __del_gref does not remove from queue_file */ + __del_gref(gref); + } + + /* It's possible for the target domain to map the just-allocated grant + * references by blindly guessing their IDs; if this is done, then + * __del_gref will leave them in the queue_gref list. They need to be + * added to the global list so that we can free them when they are no + * longer referenced. + */ + if (unlikely(!list_empty(&queue_gref))) + list_splice_tail(&queue_gref, &gref_list); + spin_unlock(&gref_lock); + return rc; +} + +static void __del_gref(struct gntalloc_gref *gref) +{ + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + uint8_t *tmp = kmap(gref->page); + tmp[gref->notify.pgoff] = 0; + kunmap(gref->page); + } + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + notify_remote_via_evtchn(gref->notify.event); + + gref->notify.flags = 0; + + if (gref->gref_id > 0) { + if (gnttab_query_foreign_access(gref->gref_id)) + return; + + if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) + return; + } + + gref_size--; + list_del(&gref->next_gref); + + if (gref->page) + __free_page(gref->page); + + kfree(gref); +} + +/* finds contiguous grant references in a file, returns the first */ +static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv, + uint64_t index, uint32_t count) +{ + struct gntalloc_gref *rv = NULL, *gref; + list_for_each_entry(gref, &priv->list, next_file) { + if (gref->file_index == index && !rv) + rv = gref; + if (rv) { + if (gref->file_index != index) + return NULL; + index += PAGE_SIZE; + count--; + if (count == 0) + return rv; + } + } + return NULL; +} + +/* + * ------------------------------------- + * File operations. + * ------------------------------------- + */ +static int gntalloc_open(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_nomem; + INIT_LIST_HEAD(&priv->list); + + filp->private_data = priv; + + pr_debug("%s: priv %p\n", __func__, priv); + + return 0; + +out_nomem: + return -ENOMEM; +} + +static int gntalloc_release(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_gref *gref; + + pr_debug("%s: priv %p\n", __func__, priv); + + spin_lock(&gref_lock); + while (!list_empty(&priv->list)) { + gref = list_entry(priv->list.next, + struct gntalloc_gref, next_file); + list_del(&gref->next_file); + gref->users--; + if (gref->users == 0) + __del_gref(gref); + } + kfree(priv); + spin_unlock(&gref_lock); + + return 0; +} + +static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, + struct ioctl_gntalloc_alloc_gref __user *arg) +{ + int rc = 0; + struct ioctl_gntalloc_alloc_gref op; + uint32_t *gref_ids; + + pr_debug("%s: priv %p\n", __func__, priv); + + if (copy_from_user(&op, arg, sizeof(op))) { + rc = -EFAULT; + goto out; + } + + gref_ids = kzalloc(sizeof(gref_ids[0]) * op.count, GFP_TEMPORARY); + if (!gref_ids) { + rc = -ENOMEM; + goto out; + } + + spin_lock(&gref_lock); + /* Clean up pages that were at zero (local) users but were still mapped + * by remote domains. Since those pages count towards the limit that we + * are about to enforce, removing them here is a good idea. + */ + do_cleanup(); + if (gref_size + op.count > limit) { + spin_unlock(&gref_lock); + rc = -ENOSPC; + goto out_free; + } + gref_size += op.count; + op.index = priv->index; + priv->index += op.count * PAGE_SIZE; + spin_unlock(&gref_lock); + + rc = add_grefs(&op, gref_ids, priv); + if (rc < 0) + goto out_free; + + /* Once we finish add_grefs, it is unsafe to touch the new reference, + * since it is possible for a concurrent ioctl to remove it (by guessing + * its index). If the userspace application doesn't provide valid memory + * to write the IDs to, then it will need to close the file in order to + * release - which it will do by segfaulting when it tries to access the + * IDs to close them. + */ + if (copy_to_user(arg, &op, sizeof(op))) { + rc = -EFAULT; + goto out_free; + } + if (copy_to_user(arg->gref_ids, gref_ids, + sizeof(gref_ids[0]) * op.count)) { + rc = -EFAULT; + goto out_free; + } + +out_free: + kfree(gref_ids); +out: + return rc; +} + +static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv, + void __user *arg) +{ + int i, rc = 0; + struct ioctl_gntalloc_dealloc_gref op; + struct gntalloc_gref *gref, *n; + + pr_debug("%s: priv %p\n", __func__, priv); + + if (copy_from_user(&op, arg, sizeof(op))) { + rc = -EFAULT; + goto dealloc_grant_out; + } + + spin_lock(&gref_lock); + gref = find_grefs(priv, op.index, op.count); + if (gref) { + /* Remove from the file list only, and decrease reference count. + * The later call to do_cleanup() will remove from gref_list and + * free the memory if the pages aren't mapped anywhere. + */ + for (i = 0; i < op.count; i++) { + n = list_entry(gref->next_file.next, + struct gntalloc_gref, next_file); + list_del(&gref->next_file); + gref->users--; + gref = n; + } + } else { + rc = -EINVAL; + } + + do_cleanup(); + + spin_unlock(&gref_lock); +dealloc_grant_out: + return rc; +} + +static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv, + void __user *arg) +{ + struct ioctl_gntalloc_unmap_notify op; + struct gntalloc_gref *gref; + uint64_t index; + int pgoff; + int rc; + + if (copy_from_user(&op, arg, sizeof(op))) + return -EFAULT; + + index = op.index & ~(PAGE_SIZE - 1); + pgoff = op.index & (PAGE_SIZE - 1); + + spin_lock(&gref_lock); + + gref = find_grefs(priv, index, 1); + if (!gref) { + rc = -ENOENT; + goto unlock_out; + } + + if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) { + rc = -EINVAL; + goto unlock_out; + } + + gref->notify.flags = op.action; + gref->notify.pgoff = pgoff; + gref->notify.event = op.event_channel_port; + rc = 0; + unlock_out: + spin_unlock(&gref_lock); + return rc; +} + +static long gntalloc_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + + switch (cmd) { + case IOCTL_GNTALLOC_ALLOC_GREF: + return gntalloc_ioctl_alloc(priv, (void __user *)arg); + + case IOCTL_GNTALLOC_DEALLOC_GREF: + return gntalloc_ioctl_dealloc(priv, (void __user *)arg); + + case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY: + return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg); + + default: + return -ENOIOCTLCMD; + } + + return 0; +} + +static void gntalloc_vma_close(struct vm_area_struct *vma) +{ + struct gntalloc_gref *gref = vma->vm_private_data; + if (!gref) + return; + + spin_lock(&gref_lock); + gref->users--; + if (gref->users == 0) + __del_gref(gref); + spin_unlock(&gref_lock); +} + +static struct vm_operations_struct gntalloc_vmops = { + .close = gntalloc_vma_close, +}; + +static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_gref *gref; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rv, i; + + pr_debug("%s: priv %p, page %lu+%d\n", __func__, + priv, vma->vm_pgoff, count); + + if (!(vma->vm_flags & VM_SHARED)) { + printk(KERN_ERR "%s: Mapping must be shared.\n", __func__); + return -EINVAL; + } + + spin_lock(&gref_lock); + gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count); + if (gref == NULL) { + rv = -ENOENT; + pr_debug("%s: Could not find grant reference", + __func__); + goto out_unlock; + } + + vma->vm_private_data = gref; + + vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_DONTCOPY; + vma->vm_flags |= VM_PFNMAP | VM_PFN_AT_MMAP; + + vma->vm_ops = &gntalloc_vmops; + + for (i = 0; i < count; i++) { + gref->users++; + rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, + gref->page); + if (rv) + goto out_unlock; + + gref = list_entry(gref->next_file.next, + struct gntalloc_gref, next_file); + } + rv = 0; + +out_unlock: + spin_unlock(&gref_lock); + return rv; +} + +static const struct file_operations gntalloc_fops = { + .owner = THIS_MODULE, + .open = gntalloc_open, + .release = gntalloc_release, + .unlocked_ioctl = gntalloc_ioctl, + .mmap = gntalloc_mmap +}; + +/* + * ------------------------------------- + * Module creation/destruction. + * ------------------------------------- + */ +static struct miscdevice gntalloc_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntalloc", + .fops = &gntalloc_fops, +}; + +static int __init gntalloc_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntalloc_miscdev); + if (err != 0) { + printk(KERN_ERR "Could not register misc gntalloc device\n"); + return err; + } + + pr_debug("Created grant allocation device at %d,%d\n", + MISC_MAJOR, gntalloc_miscdev.minor); + + return 0; +} + +static void __exit gntalloc_exit(void) +{ + misc_deregister(&gntalloc_miscdev); +} + +module_init(gntalloc_init); +module_exit(gntalloc_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, " + "Daniel De Graaf <dgdegra@tycho.nsa.gov>"); +MODULE_DESCRIPTION("User-space grant reference allocator driver"); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1e31cdc..017ce60 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -32,10 +32,13 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> +#include <linux/highmem.h> #include <xen/xen.h> #include <xen/grant_table.h> +#include <xen/balloon.h> #include <xen/gntdev.h> +#include <xen/events.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> #include <asm/xen/page.h> @@ -45,35 +48,46 @@ MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " "Gerd Hoffmann <kraxel@redhat.com>"); MODULE_DESCRIPTION("User-space granted page access driver"); -static int limit = 1024; +static int limit = 1024*1024; module_param(limit, int, 0644); -MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped at " - "once by a gntdev instance"); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " + "the gntdev device"); + +static atomic_t pages_mapped = ATOMIC_INIT(0); + +static int use_ptemod; struct gntdev_priv { struct list_head maps; - uint32_t used; - uint32_t limit; /* lock protects maps from concurrent changes */ spinlock_t lock; struct mm_struct *mm; struct mmu_notifier mn; }; +struct unmap_notify { + int flags; + /* Address relative to the start of the grant_map */ + int addr; + int event; +}; + struct grant_map { struct list_head next; - struct gntdev_priv *priv; struct vm_area_struct *vma; int index; int count; int flags; - int is_mapped; + atomic_t users; + struct unmap_notify notify; struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; struct page **pages; }; +static int unmap_grant_pages(struct grant_map *map, int offset, int pages); + /* ------------------------------------------------------------------ */ static void gntdev_print_maps(struct gntdev_priv *priv, @@ -82,9 +96,7 @@ static void gntdev_print_maps(struct gntdev_priv *priv, #ifdef DEBUG struct grant_map *map; - pr_debug("maps list (priv %p, usage %d/%d)\n", - priv, priv->used, priv->limit); - + pr_debug("%s: maps list (priv %p)\n", __func__, priv); list_for_each_entry(map, &priv->maps, next) pr_debug(" index %2d, count %2d %s\n", map->index, map->count, @@ -111,27 +123,21 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) NULL == add->pages) goto err; + if (alloc_xenballooned_pages(count, add->pages)) + goto err; + for (i = 0; i < count; i++) { - add->pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (add->pages[i] == NULL) - goto err; + add->map_ops[i].handle = -1; + add->unmap_ops[i].handle = -1; } add->index = 0; add->count = count; - add->priv = priv; - - if (add->count + priv->used > priv->limit) - goto err; + atomic_set(&add->users, 1); return add; err: - if (add->pages) - for (i = 0; i < count; i++) { - if (add->pages[i]) - __free_page(add->pages[i]); - } kfree(add->pages); kfree(add->grants); kfree(add->map_ops); @@ -154,7 +160,6 @@ static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) list_add_tail(&add->next, &priv->maps); done: - priv->used += add->count; gntdev_print_maps(priv, "[new]", add->index); } @@ -166,57 +171,33 @@ static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, list_for_each_entry(map, &priv->maps, next) { if (map->index != index) continue; - if (map->count != count) + if (count && map->count != count) continue; return map; } return NULL; } -static struct grant_map *gntdev_find_map_vaddr(struct gntdev_priv *priv, - unsigned long vaddr) +static void gntdev_put_map(struct grant_map *map) { - struct grant_map *map; - - list_for_each_entry(map, &priv->maps, next) { - if (!map->vma) - continue; - if (vaddr < map->vma->vm_start) - continue; - if (vaddr >= map->vma->vm_end) - continue; - return map; - } - return NULL; -} - -static int gntdev_del_map(struct grant_map *map) -{ - int i; + if (!map) + return; - if (map->vma) - return -EBUSY; - for (i = 0; i < map->count; i++) - if (map->unmap_ops[i].handle) - return -EBUSY; + if (!atomic_dec_and_test(&map->users)) + return; - map->priv->used -= map->count; - list_del(&map->next); - return 0; -} + atomic_sub(map->count, &pages_mapped); -static void gntdev_free_map(struct grant_map *map) -{ - int i; + if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { + notify_remote_via_evtchn(map->notify.event); + } - if (!map) - return; + if (map->pages) { + if (!use_ptemod) + unmap_grant_pages(map, 0, map->count); - if (map->pages) - for (i = 0; i < map->count; i++) { - if (map->pages[i]) - __free_page(map->pages[i]); - } + free_xenballooned_pages(map->count, map->pages); + } kfree(map->pages); kfree(map->grants); kfree(map->map_ops); @@ -231,18 +212,17 @@ static int find_grant_ptes(pte_t *pte, pgtable_t token, { struct grant_map *map = data; unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte; u64 pte_maddr; BUG_ON(pgnr >= map->count); pte_maddr = arbitrary_virt_to_machine(pte).maddr; - gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, - GNTMAP_contains_pte | map->flags, + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, map->grants[pgnr].ref, map->grants[pgnr].domid); - gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, - GNTMAP_contains_pte | map->flags, - 0 /* handle */); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags, + -1 /* handle */); return 0; } @@ -250,6 +230,21 @@ static int map_grant_pages(struct grant_map *map) { int i, err = 0; + if (!use_ptemod) { + /* Note: it could already be mapped */ + if (map->map_ops[0].handle != -1) + return 0; + for (i = 0; i < map->count; i++) { + unsigned long addr = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + gnttab_set_map_op(&map->map_ops[i], addr, map->flags, + map->grants[i].ref, + map->grants[i].domid); + gnttab_set_unmap_op(&map->unmap_ops[i], addr, + map->flags, -1 /* handle */); + } + } + pr_debug("map %d+%d\n", map->index, map->count); err = gnttab_map_refs(map->map_ops, map->pages, map->count); if (err) @@ -258,28 +253,81 @@ static int map_grant_pages(struct grant_map *map) for (i = 0; i < map->count; i++) { if (map->map_ops[i].status) err = -EINVAL; - map->unmap_ops[i].handle = map->map_ops[i].handle; + else { + BUG_ON(map->map_ops[i].handle == -1); + map->unmap_ops[i].handle = map->map_ops[i].handle; + pr_debug("map handle=%d\n", map->map_ops[i].handle); + } } return err; } -static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) { int i, err = 0; - pr_debug("map %d+%d [%d+%d]\n", map->index, map->count, offset, pages); - err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages, pages); + if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + int pgno = (map->notify.addr >> PAGE_SHIFT); + if (pgno >= offset && pgno < offset + pages && use_ptemod) { + void __user *tmp = (void __user *) + map->vma->vm_start + map->notify.addr; + err = copy_to_user(tmp, &err, 1); + if (err) + return err; + map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; + } else if (pgno >= offset && pgno < offset + pages) { + uint8_t *tmp = kmap(map->pages[pgno]); + tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; + kunmap(map->pages[pgno]); + map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; + } + } + + err = gnttab_unmap_refs(map->unmap_ops + offset, map->pages + offset, pages); if (err) return err; for (i = 0; i < pages; i++) { if (map->unmap_ops[offset+i].status) err = -EINVAL; - map->unmap_ops[offset+i].handle = 0; + pr_debug("unmap handle=%d st=%d\n", + map->unmap_ops[offset+i].handle, + map->unmap_ops[offset+i].status); + map->unmap_ops[offset+i].handle = -1; } return err; } +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int range, err = 0; + + pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + + /* It is possible the requested range will have a "hole" where we + * already unmapped some of the grants. Only unmap valid ranges. + */ + while (pages && !err) { + while (pages && map->unmap_ops[offset].handle == -1) { + offset++; + pages--; + } + range = 0; + while (range < pages) { + if (map->unmap_ops[offset+range].handle == -1) { + range--; + break; + } + range++; + } + err = __unmap_grant_pages(map, offset, range); + offset += range; + pages -= range; + } + + return err; +} + /* ------------------------------------------------------------------ */ static void gntdev_vma_close(struct vm_area_struct *vma) @@ -287,22 +335,13 @@ static void gntdev_vma_close(struct vm_area_struct *vma) struct grant_map *map = vma->vm_private_data; pr_debug("close %p\n", vma); - map->is_mapped = 0; map->vma = NULL; vma->vm_private_data = NULL; -} - -static int gntdev_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - pr_debug("vaddr %p, pgoff %ld (shouldn't happen)\n", - vmf->virtual_address, vmf->pgoff); - vmf->flags = VM_FAULT_ERROR; - return 0; + gntdev_put_map(map); } static struct vm_operations_struct gntdev_vmops = { .close = gntdev_vma_close, - .fault = gntdev_vma_fault, }; /* ------------------------------------------------------------------ */ @@ -320,8 +359,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn, list_for_each_entry(map, &priv->maps, next) { if (!map->vma) continue; - if (!map->is_mapped) - continue; if (map->vma->vm_start >= end) continue; if (map->vma->vm_end <= start) @@ -386,16 +423,17 @@ static int gntdev_open(struct inode *inode, struct file *flip) INIT_LIST_HEAD(&priv->maps); spin_lock_init(&priv->lock); - priv->limit = limit; - priv->mm = get_task_mm(current); - if (!priv->mm) { - kfree(priv); - return -ENOMEM; + if (use_ptemod) { + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); } - priv->mn.ops = &gntdev_mmu_ops; - ret = mmu_notifier_register(&priv->mn, priv->mm); - mmput(priv->mm); if (ret) { kfree(priv); @@ -412,21 +450,19 @@ static int gntdev_release(struct inode *inode, struct file *flip) { struct gntdev_priv *priv = flip->private_data; struct grant_map *map; - int err; pr_debug("priv %p\n", priv); spin_lock(&priv->lock); while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); - err = gntdev_del_map(map); - if (WARN_ON(err)) - gntdev_free_map(map); - + list_del(&map->next); + gntdev_put_map(map); } spin_unlock(&priv->lock); - mmu_notifier_unregister(&priv->mn, priv->mm); + if (use_ptemod) + mmu_notifier_unregister(&priv->mn, priv->mm); kfree(priv); return 0; } @@ -443,16 +479,21 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, pr_debug("priv %p, add %d\n", priv, op.count); if (unlikely(op.count <= 0)) return -EINVAL; - if (unlikely(op.count > priv->limit)) - return -EINVAL; err = -ENOMEM; map = gntdev_alloc_map(priv, op.count); if (!map) return err; + + if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) { + pr_debug("can't map: over limit\n"); + gntdev_put_map(map); + return err; + } + if (copy_from_user(map->grants, &u->refs, sizeof(map->grants[0]) * op.count) != 0) { - gntdev_free_map(map); + gntdev_put_map(map); return err; } @@ -461,13 +502,9 @@ static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv, op.index = map->index << PAGE_SHIFT; spin_unlock(&priv->lock); - if (copy_to_user(u, &op, sizeof(op)) != 0) { - spin_lock(&priv->lock); - gntdev_del_map(map); - spin_unlock(&priv->lock); - gntdev_free_map(map); - return err; - } + if (copy_to_user(u, &op, sizeof(op)) != 0) + return -EFAULT; + return 0; } @@ -484,11 +521,12 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, spin_lock(&priv->lock); map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); - if (map) - err = gntdev_del_map(map); + if (map) { + list_del(&map->next); + gntdev_put_map(map); + err = 0; + } spin_unlock(&priv->lock); - if (!err) - gntdev_free_map(map); return err; } @@ -496,43 +534,66 @@ static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv, struct ioctl_gntdev_get_offset_for_vaddr __user *u) { struct ioctl_gntdev_get_offset_for_vaddr op; + struct vm_area_struct *vma; struct grant_map *map; if (copy_from_user(&op, u, sizeof(op)) != 0) return -EFAULT; pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr); - spin_lock(&priv->lock); - map = gntdev_find_map_vaddr(priv, op.vaddr); - if (map == NULL || - map->vma->vm_start != op.vaddr) { - spin_unlock(&priv->lock); + vma = find_vma(current->mm, op.vaddr); + if (!vma || vma->vm_ops != &gntdev_vmops) return -EINVAL; - } + + map = vma->vm_private_data; + if (!map) + return -EINVAL; + op.offset = map->index << PAGE_SHIFT; op.count = map->count; - spin_unlock(&priv->lock); if (copy_to_user(u, &op, sizeof(op)) != 0) return -EFAULT; return 0; } -static long gntdev_ioctl_set_max_grants(struct gntdev_priv *priv, - struct ioctl_gntdev_set_max_grants __user *u) +static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u) { - struct ioctl_gntdev_set_max_grants op; + struct ioctl_gntdev_unmap_notify op; + struct grant_map *map; + int rc; - if (copy_from_user(&op, u, sizeof(op)) != 0) + if (copy_from_user(&op, u, sizeof(op))) return -EFAULT; - pr_debug("priv %p, limit %d\n", priv, op.count); - if (op.count > limit) - return -E2BIG; + + if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) + return -EINVAL; spin_lock(&priv->lock); - priv->limit = op.count; + + list_for_each_entry(map, &priv->maps, next) { + uint64_t begin = map->index << PAGE_SHIFT; + uint64_t end = (map->index + map->count) << PAGE_SHIFT; + if (op.index >= begin && op.index < end) + goto found; + } + rc = -ENOENT; + goto unlock_out; + + found: + if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) && + (map->flags & GNTMAP_readonly)) { + rc = -EINVAL; + goto unlock_out; + } + + map->notify.flags = op.action; + map->notify.addr = op.index - (map->index << PAGE_SHIFT); + map->notify.event = op.event_channel_port; + rc = 0; + unlock_out: spin_unlock(&priv->lock); - return 0; + return rc; } static long gntdev_ioctl(struct file *flip, @@ -551,8 +612,8 @@ static long gntdev_ioctl(struct file *flip, case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); - case IOCTL_GNTDEV_SET_MAX_GRANTS: - return gntdev_ioctl_set_max_grants(priv, ptr); + case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: + return gntdev_ioctl_notify(priv, ptr); default: pr_debug("priv %p, unknown cmd %x\n", priv, cmd); @@ -568,7 +629,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) int index = vma->vm_pgoff; int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; struct grant_map *map; - int err = -EINVAL; + int i, err = -EINVAL; if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -580,47 +641,70 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) map = gntdev_find_map_index(priv, index, count); if (!map) goto unlock_out; - if (map->vma) + if (use_ptemod && map->vma) goto unlock_out; - if (priv->mm != vma->vm_mm) { + if (use_ptemod && priv->mm != vma->vm_mm) { printk(KERN_WARNING "Huh? Other mm?\n"); goto unlock_out; } + atomic_inc(&map->users); + vma->vm_ops = &gntdev_vmops; vma->vm_flags |= VM_RESERVED|VM_DONTCOPY|VM_DONTEXPAND|VM_PFNMAP; vma->vm_private_data = map; - map->vma = vma; - map->flags = GNTMAP_host_map | GNTMAP_application_map; - if (!(vma->vm_flags & VM_WRITE)) - map->flags |= GNTMAP_readonly; + if (use_ptemod) + map->vma = vma; + + if (map->flags) { + if ((vma->vm_flags & VM_WRITE) && + (map->flags & GNTMAP_readonly)) + return -EINVAL; + } else { + map->flags = GNTMAP_host_map; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + } spin_unlock(&priv->lock); - err = apply_to_page_range(vma->vm_mm, vma->vm_start, - vma->vm_end - vma->vm_start, - find_grant_ptes, map); - if (err) { - printk(KERN_WARNING "find_grant_ptes() failure.\n"); - return err; + if (use_ptemod) { + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + printk(KERN_WARNING "find_grant_ptes() failure.\n"); + goto out_put_map; + } } err = map_grant_pages(map); - if (err) { - printk(KERN_WARNING "map_grant_pages() failure.\n"); - return err; - } + if (err) + goto out_put_map; - map->is_mapped = 1; + if (!use_ptemod) { + for (i = 0; i < count; i++) { + err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, + map->pages[i]); + if (err) + goto out_put_map; + } + } return 0; unlock_out: spin_unlock(&priv->lock); return err; + +out_put_map: + if (use_ptemod) + map->vma = NULL; + gntdev_put_map(map); + return err; } static const struct file_operations gntdev_fops = { @@ -646,6 +730,8 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; + use_ptemod = xen_pv_domain(); + err = misc_register(&gntdev_miscdev); if (err != 0) { printk(KERN_ERR "Could not register gntdev device\n"); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 9ef54eb..3745a31 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -458,7 +458,14 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (ret) return ret; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return ret; + for (i = 0; i < count; i++) { + /* Do not add to override if the map failed. */ + if (map_ops[i].status) + continue; + /* m2p override only supported for GNTMAP_contains_pte mappings */ if (!(map_ops[i].flags & GNTMAP_contains_pte)) continue; @@ -483,6 +490,9 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, if (ret) return ret; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return ret; + for (i = 0; i < count; i++) { ret = m2p_remove_override(pages[i]); if (ret) diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c new file mode 100644 index 0000000..a4ff225 --- /dev/null +++ b/drivers/xen/xen-balloon.c @@ -0,0 +1,256 @@ +/****************************************************************************** + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sysdev.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/balloon.h> +#include <xen/xenbus.h> +#include <xen/features.h> +#include <xen/page.h> + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +#define BALLOON_CLASS_NAME "xen_memory" + +static struct sys_device balloon_sysdev; + +static int register_balloon(struct sys_device *sysdev); + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} + +static int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printk(KERN_ERR "Failed to set balloon watcher\n"); + + return NOTIFY_DONE; +} + +static struct notifier_block xenstore_notifier; + +static int __init balloon_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("xen-balloon: Initialising balloon driver.\n"); + + register_balloon(&balloon_sysdev); + + target_watch.callback = watch_target; + xenstore_notifier.notifier_call = balloon_init_watcher; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(balloon_init); + +static void balloon_exit(void) +{ + /* XXX - release balloon here */ + return; +} + +module_exit(balloon_exit); + +#define BALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); + +static SYSDEV_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); +static SYSDEV_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +static SYSDEV_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); +static SYSDEV_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); + +static ssize_t show_target_kb(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); +} + +static ssize_t store_target_kb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + + +static ssize_t show_target(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages + << PAGE_SHIFT); +} + +static ssize_t store_target(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = memparse(buf, &endchar); + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static SYSDEV_ATTR(target, S_IRUGO | S_IWUSR, + show_target, store_target); + + +static struct sysdev_attribute *balloon_attrs[] = { + &attr_target_kb, + &attr_target, + &attr_schedule_delay.attr, + &attr_max_schedule_delay.attr, + &attr_retry_count.attr, + &attr_max_retry_count.attr +}; + +static struct attribute *balloon_info_attrs[] = { + &attr_current_kb.attr, + &attr_low_kb.attr, + &attr_high_kb.attr, + NULL +}; + +static struct attribute_group balloon_info_group = { + .name = "info", + .attrs = balloon_info_attrs +}; + +static struct sysdev_class balloon_sysdev_class = { + .name = BALLOON_CLASS_NAME +}; + +static int register_balloon(struct sys_device *sysdev) +{ + int i, error; + + error = sysdev_class_register(&balloon_sysdev_class); + if (error) + return error; + + sysdev->id = 0; + sysdev->cls = &balloon_sysdev_class; + + error = sysdev_register(sysdev); + if (error) { + sysdev_class_unregister(&balloon_sysdev_class); + return error; + } + + for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { + error = sysdev_create_file(sysdev, balloon_attrs[i]); + if (error) + goto fail; + } + + error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); + if (error) + goto fail; + + return 0; + + fail: + while (--i >= 0) + sysdev_remove_file(sysdev, balloon_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&balloon_sysdev_class); + return error; +} + +MODULE_LICENSE("GPL"); diff --git a/include/xen/balloon.h b/include/xen/balloon.h new file mode 100644 index 0000000..a2b22f0 --- /dev/null +++ b/include/xen/balloon.h @@ -0,0 +1,25 @@ +/****************************************************************************** + * Xen balloon functionality + */ + +#define RETRY_UNLIMITED 0 + +struct balloon_stats { + /* We aim for 'current allocation' == 'target allocation'. */ + unsigned long current_pages; + unsigned long target_pages; + /* Number of pages in high- and low-memory balloons. */ + unsigned long balloon_low; + unsigned long balloon_high; + unsigned long schedule_delay; + unsigned long max_schedule_delay; + unsigned long retry_count; + unsigned long max_retry_count; +}; + +extern struct balloon_stats balloon_stats; + +void balloon_set_new_target(unsigned long target); + +int alloc_xenballooned_pages(int nr_pages, struct page** pages); +void free_xenballooned_pages(int nr_pages, struct page** pages); diff --git a/include/xen/gntalloc.h b/include/xen/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/include/xen/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. + */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/include/xen/gntdev.h b/include/xen/gntdev.h index eb23f41..5304bd3 100644 --- a/include/xen/gntdev.h +++ b/include/xen/gntdev.h @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { uint32_t count; }; +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ |