diff options
219 files changed, 7461 insertions, 5640 deletions
diff --git a/Documentation/filesystems/9p.txt b/Documentation/filesystems/9p.txt index b90f537..bf80806 100644 --- a/Documentation/filesystems/9p.txt +++ b/Documentation/filesystems/9p.txt @@ -42,10 +42,12 @@ OPTIONS trans=name select an alternative transport. Valid options are currently: - unix - specifying a named pipe mount point - tcp - specifying a normal TCP/IP connection - fd - used passed file descriptors for connection + unix - specifying a named pipe mount point + tcp - specifying a normal TCP/IP connection + fd - used passed file descriptors for connection (see rfdno and wfdno) + virtio - connect to the next virtio channel available + (from lguest or KVM with trans_virtio module) uname=name user name to attempt mount as on the remote server. The server may override or ignore this value. Certain user diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile index c0b7a45..bac037e 100644 --- a/Documentation/lguest/Makefile +++ b/Documentation/lguest/Makefile @@ -1,28 +1,8 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. - -# For those people that have a separate object dir, look there for .config -KBUILD_OUTPUT := ../.. -ifdef O - ifeq ("$(origin O)", "command line") - KBUILD_OUTPUT := $(O) - endif -endif -# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary. -include $(KBUILD_OUTPUT)/.config -LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) - -CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds +CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include LDLIBS:=-lz -# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and -# not others (eg. FC7). -LDFLAGS+=-static -all: lguest.lds lguest -# The linker script on x86 is so complex the only way of creating one -# which will link our binary in the right place is to mangle the -# default one. -lguest.lds: - $(LD) --verbose | awk '/^==========/ { PRINT=1; next; } /SIZEOF_HEADERS/ { gsub(/0x[0-9A-F]*/, "$(LGUEST_GUEST_TOP)") } { if (PRINT) print $$0; }' > $@ +all: lguest clean: - rm -f lguest.lds lguest + rm -f lguest diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 103e346..5bdc37f 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,10 +1,7 @@ /*P:100 This is the Launcher code, a simple program which lays out the * "physical" memory for the new Guest by mapping the kernel image and the * virtual devices, then reads repeatedly from /dev/lguest to run the Guest. - * - * The only trick: the Makefile links it at a high address so it will be clear - * of the guest memory region. It means that each Guest cannot have more than - * about 2.5G of memory on a normally configured Host. :*/ +:*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include <stdio.h> @@ -15,6 +12,7 @@ #include <stdlib.h> #include <elf.h> #include <sys/mman.h> +#include <sys/param.h> #include <sys/types.h> #include <sys/stat.h> #include <sys/wait.h> @@ -34,7 +32,9 @@ #include <termios.h> #include <getopt.h> #include <zlib.h> -/*L:110 We can ignore the 28 include files we need for this program, but I do +#include <assert.h> +#include <sched.h> +/*L:110 We can ignore the 30 include files we need for this program, but I do * want to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I @@ -45,8 +45,14 @@ typedef unsigned long long u64; typedef uint32_t u32; typedef uint16_t u16; typedef uint8_t u8; -#include "../../include/linux/lguest_launcher.h" -#include "../../include/asm-x86/e820_32.h" +#include "linux/lguest_launcher.h" +#include "linux/pci_ids.h" +#include "linux/virtio_config.h" +#include "linux/virtio_net.h" +#include "linux/virtio_blk.h" +#include "linux/virtio_console.h" +#include "linux/virtio_ring.h" +#include "asm-x86/bootparam.h" /*:*/ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ @@ -55,6 +61,10 @@ typedef uint8_t u8; #ifndef SIOCBRADDIF #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #endif +/* We can have up to 256 pages for devices. */ +#define DEVICE_PAGES 256 +/* This fits nicely in a single 4096-byte page. */ +#define VIRTQUEUE_NUM 127 /*L:120 verbose is both a global flag and a macro. The C preprocessor allows * this, and although I wouldn't recommend it, it works quite nicely here. */ @@ -65,8 +75,10 @@ static bool verbose; /* The pipe to send commands to the waker process */ static int waker_fd; -/* The top of guest physical memory. */ -static u32 top; +/* The pointer to the start of guest memory. */ +static void *guest_base; +/* The maximum guest physical address allowed, and maximum possible. */ +static unsigned long guest_limit, guest_max; /* This is our list of devices. */ struct device_list @@ -76,8 +88,17 @@ struct device_list fd_set infds; int max_infd; + /* Counter to assign interrupt numbers. */ + unsigned int next_irq; + + /* Counter to print out convenient device numbers. */ + unsigned int device_num; + /* The descriptor page for the devices. */ - struct lguest_device_desc *descs; + u8 *descpage; + + /* The tail of the last descriptor. */ + unsigned int desc_used; /* A single linked list of devices. */ struct device *dev; @@ -85,31 +106,111 @@ struct device_list struct device **lastdev; }; +/* The list of Guest devices, based on command line arguments. */ +static struct device_list devices; + /* The device structure describes a single device. */ struct device { /* The linked-list pointer. */ struct device *next; - /* The descriptor for this device, as mapped into the Guest. */ + + /* The this device's descriptor, as mapped into the Guest. */ struct lguest_device_desc *desc; - /* The memory page(s) of this device, if any. Also mapped in Guest. */ - void *mem; + + /* The name of this device, for --verbose. */ + const char *name; /* If handle_input is set, it wants to be called when this file * descriptor is ready. */ int fd; bool (*handle_input)(int fd, struct device *me); - /* If handle_output is set, it wants to be called when the Guest sends - * DMA to this key. */ - unsigned long watch_key; - u32 (*handle_output)(int fd, const struct iovec *iov, - unsigned int num, struct device *me); + /* Any queues attached to this device */ + struct virtqueue *vq; /* Device-specific data. */ void *priv; }; +/* The virtqueue structure describes a queue attached to a device. */ +struct virtqueue +{ + struct virtqueue *next; + + /* Which device owns me. */ + struct device *dev; + + /* The configuration for this queue. */ + struct lguest_vqconfig config; + + /* The actual ring of buffers. */ + struct vring vring; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* The routine to call when the Guest pings us. */ + void (*handle_output)(int fd, struct virtqueue *me); +}; + +/* Since guest is UP and we don't run at the same time, we don't need barriers. + * But I include them in the code in case others copy it. */ +#define wmb() + +/* Convert an iovec element to the given type. + * + * This is a fairly ugly trick: we need to know the size of the type and + * alignment requirement to check the pointer is kosher. It's also nice to + * have the name of the type in case we report failure. + * + * Typing those three things all the time is cumbersome and error prone, so we + * have a macro which sets them all up and passes to the real function. */ +#define convert(iov, type) \ + ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) + +static void *_convert(struct iovec *iov, size_t size, size_t align, + const char *name) +{ + if (iov->iov_len != size) + errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); + if ((unsigned long)iov->iov_base % align != 0) + errx(1, "Bad alignment %p for %s", iov->iov_base, name); + return iov->iov_base; +} + +/* The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. */ +#define cpu_to_le16(v16) (v16) +#define cpu_to_le32(v32) (v32) +#define cpu_to_le64(v64) (v64) +#define le16_to_cpu(v16) (v16) +#define le32_to_cpu(v32) (v32) +#define le64_to_cpu(v32) (v64) + +/*L:100 The Launcher code itself takes us out into userspace, that scary place + * where pointers run wild and free! Unfortunately, like most userspace + * programs, it's quite boring (which is why everyone likes to hack on the + * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it + * will get you through this section. Or, maybe not. + * + * The Launcher sets up a big chunk of memory to be the Guest's "physical" + * memory and stores it in "guest_base". In other words, Guest physical == + * Launcher virtual with an offset. + * + * This can be tough to get your head around, but usually it just means that we + * use these trivial conversion functions when the Guest gives us it's + * "physical" addresses: */ +static void *from_guest_phys(unsigned long addr) +{ + return guest_base + addr; +} + +static unsigned long to_guest_phys(const void *addr) +{ + return (addr - guest_base); +} + /*L:130 * Loading the Kernel. * @@ -123,43 +224,55 @@ static int open_or_die(const char *name, int flags) return fd; } -/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */ -static void *map_zeroed_pages(unsigned long addr, unsigned int num) +/* map_zeroed_pages() takes a number of pages. */ +static void *map_zeroed_pages(unsigned int num) { - /* We cache the /dev/zero file-descriptor so we only open it once. */ - static int fd = -1; - - if (fd == -1) - fd = open_or_die("/dev/zero", O_RDONLY); + int fd = open_or_die("/dev/zero", O_RDONLY); + void *addr; /* We use a private mapping (ie. if we write to the page, it will be - * copied), and obviously we insist that it be mapped where we ask. */ - if (mmap((void *)addr, getpagesize() * num, - PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0) - != (void *)addr) - err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr); - - /* Returning the address is just a courtesy: can simplify callers. */ - return (void *)addr; + * copied). */ + addr = mmap(NULL, getpagesize() * num, + PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + err(1, "Mmaping %u pages of /dev/zero", num); + + return addr; } -/* To find out where to start we look for the magic Guest string, which marks - * the code we see in lguest_asm.S. This is a hack which we are currently - * plotting to replace with the normal Linux entry point. */ -static unsigned long entry_point(void *start, void *end, - unsigned long page_offset) +/* Get some more pages for a device. */ +static void *get_pages(unsigned int num) { - void *p; + void *addr = from_guest_phys(guest_limit); - /* The scan gives us the physical starting address. We want the - * virtual address in this case, and fortunately, we already figured - * out the physical-virtual difference and passed it here in - * "page_offset". */ - for (p = start; p < end; p++) - if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0) - return (long)p + strlen("GenuineLguest") + page_offset; + guest_limit += num * getpagesize(); + if (guest_limit > guest_max) + errx(1, "Not enough memory for devices"); + return addr; +} - err(1, "Is this image a genuine lguest?"); +/* This routine is used to load the kernel or initrd. It tries mmap, but if + * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), + * it falls back to reading the memory in. */ +static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) +{ + ssize_t r; + + /* We map writable even though for some segments are marked read-only. + * The kernel really wants to be writable: it patches its own + * instructions. + * + * MAP_PRIVATE means that the page won't be copied until a write is + * done to it. This allows us to share untouched memory between + * Guests. */ + if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) + return; + + /* pread does a seek and a read in one shot: saves a few lines. */ + r = pread(fd, addr, len, offset); + if (r != len) + err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); } /* This routine takes an open vmlinux image, which is in ELF, and maps it into @@ -167,19 +280,14 @@ static unsigned long entry_point(void *start, void *end, * by all modern binaries on Linux including the kernel. * * The ELF headers give *two* addresses: a physical address, and a virtual - * address. The Guest kernel expects to be placed in memory at the physical - * address, and the page tables set up so it will correspond to that virtual - * address. We return the difference between the virtual and physical - * addresses in the "page_offset" pointer. + * address. We use the physical address; the Guest will map itself to the + * virtual address. * * We return the starting address. */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, - unsigned long *page_offset) +static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) { - void *addr; Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; - unsigned long start = -1UL, end = 0; /* Sanity checks on the main ELF header: an x86 executable with a * reasonable number of correctly-sized program headers. */ @@ -199,9 +307,6 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); - /* We don't know page_offset yet. */ - *page_offset = 0; - /* Try all the headers: there are usually only three. A read-only one, * a read-write one, and a "note" section which isn't loadable. */ for (i = 0; i < ehdr->e_phnum; i++) { @@ -212,158 +317,53 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, verbose("Section %i: size %i addr %p\n", i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); - /* We expect a simple linear address space: every segment must - * have the same difference between virtual (p_vaddr) and - * physical (p_paddr) address. */ - if (!*page_offset) - *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; - else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) - errx(1, "Page offset of section %i different", i); - - /* We track the first and last address we mapped, so we can - * tell entry_point() where to scan. */ - if (phdr[i].p_paddr < start) - start = phdr[i].p_paddr; - if (phdr[i].p_paddr + phdr[i].p_filesz > end) - end = phdr[i].p_paddr + phdr[i].p_filesz; - - /* We map this section of the file at its physical address. We - * map it read & write even if the header says this segment is - * read-only. The kernel really wants to be writable: it - * patches its own instructions which would normally be - * read-only. - * - * MAP_PRIVATE means that the page won't be copied until a - * write is done to it. This allows us to share much of the - * kernel memory between Guests. */ - addr = mmap((void *)phdr[i].p_paddr, - phdr[i].p_filesz, - PROT_READ|PROT_WRITE|PROT_EXEC, - MAP_FIXED|MAP_PRIVATE, - elf_fd, phdr[i].p_offset); - if (addr != (void *)phdr[i].p_paddr) - err(1, "Mmaping vmlinux seg %i gave %p not %p", - i, addr, (void *)phdr[i].p_paddr); + /* We map this section of the file at its physical address. */ + map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), + phdr[i].p_offset, phdr[i].p_filesz); } - return entry_point((void *)start, (void *)end, *page_offset); + /* The entry point is given in the ELF header. */ + return ehdr->e_entry; } -/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated. - * - * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects - * to be. We don't know what that option was, but we can figure it out - * approximately by looking at the addresses in the code. I chose the common - * case of reading a memory location into the %eax register: - * - * movl <some-address>, %eax - * - * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example, - * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax. - * - * In this example can guess that the kernel was compiled with - * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the - * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our - * kernel isn't that bloated yet. - * - * Unfortunately, x86 has variable-length instructions, so finding this - * particular instruction properly involves writing a disassembler. Instead, - * we rely on statistics. We look for "0xA1" and tally the different bytes - * which occur 4 bytes later (the "0xC0" in our example above). When one of - * those bytes appears three times, we can be reasonably confident that it - * forms the start of CONFIG_PAGE_OFFSET. +/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're + * supposed to jump into it and it will unpack itself. We used to have to + * perform some hairy magic because the unpacking code scared me. * - * This is amazingly reliable. */ -static unsigned long intuit_page_offset(unsigned char *img, unsigned long len) + * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote + * a small patch to jump over the tricky bits in the Guest, so now we just read + * the funky header so we know where in the file to load, and away we go! */ +static unsigned long load_bzimage(int fd) { - unsigned int i, possibilities[256] = { 0 }; + struct boot_params boot; + int r; + /* Modern bzImages get loaded at 1M. */ + void *p = from_guest_phys(0x100000); - for (i = 0; i + 4 < len; i++) { - /* mov 0xXXXXXXXX,%eax */ - if (img[i] == 0xA1 && ++possibilities[img[i+4]] > 3) - return (unsigned long)img[i+4] << 24; - } - errx(1, "could not determine page offset"); -} + /* Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/i386/boot.txt) */ + lseek(fd, 0, SEEK_SET); + read(fd, &boot, sizeof(boot)); -/*L:160 Unfortunately the entire ELF image isn't compressed: the segments - * which need loading are extracted and compressed raw. This denies us the - * information we need to make a fully-general loader. */ -static unsigned long unpack_bzimage(int fd, unsigned long *page_offset) -{ - gzFile f; - int ret, len = 0; - /* A bzImage always gets loaded at physical address 1M. This is - * actually configurable as CONFIG_PHYSICAL_START, but as the comment - * there says, "Don't change this unless you know what you are doing". - * Indeed. */ - void *img = (void *)0x100000; - - /* gzdopen takes our file descriptor (carefully placed at the start of - * the GZIP header we found) and returns a gzFile. */ - f = gzdopen(fd, "rb"); - /* We read it into memory in 64k chunks until we hit the end. */ - while ((ret = gzread(f, img + len, 65536)) > 0) - len += ret; - if (ret < 0) - err(1, "reading image from bzImage"); - - verbose("Unpacked size %i addr %p\n", len, img); - - /* Without the ELF header, we can't tell virtual-physical gap. This is - * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately, - * I have a clever way of figuring it out from the code itself. */ - *page_offset = intuit_page_offset(img, len); - - return entry_point(img, img + len, *page_offset); -} + /* Inside the setup_hdr, we expect the magic "HdrS" */ + if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) + errx(1, "This doesn't look like a bzImage to me"); -/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're - * supposed to jump into it and it will unpack itself. We can't do that - * because the Guest can't run the unpacking code, and adding features to - * lguest kills puppies, so we don't want to. - * - * The bzImage is formed by putting the decompressing code in front of the - * compressed kernel code. So we can simple scan through it looking for the - * first "gzip" header, and start decompressing from there. */ -static unsigned long load_bzimage(int fd, unsigned long *page_offset) -{ - unsigned char c; - int state = 0; - - /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */ - while (read(fd, &c, 1) == 1) { - switch (state) { - case 0: - if (c == 0x1F) - state++; - break; - case 1: - if (c == 0x8B) - state++; - else - state = 0; - break; - case 2 ... 8: - state++; - break; - case 9: - /* Seek back to the start of the gzip header. */ - lseek(fd, -10, SEEK_CUR); - /* One final check: "compressed under UNIX". */ - if (c != 0x03) - state = -1; - else - return unpack_bzimage(fd, page_offset); - } - } - errx(1, "Could not find kernel in bzImage"); + /* Skip over the extra sectors of the header. */ + lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); + + /* Now read everything into memory. in nice big chunks. */ + while ((r = read(fd, p, 65536)) > 0) + p += r; + + /* Finally, code32_start tells us where to enter the kernel. */ + return boot.hdr.code32_start; } /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With some funky * coding, we can load those, too. */ -static unsigned long load_kernel(int fd, unsigned long *page_offset) +static unsigned long load_kernel(int fd) { Elf32_Ehdr hdr; @@ -373,10 +373,10 @@ static unsigned long load_kernel(int fd, unsigned long *page_offset) /* If it's an ELF file, it starts with "\177ELF" */ if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr, page_offset); + return map_elf(fd, &hdr); /* Otherwise we assume it's a bzImage, and try to unpack it */ - return load_bzimage(fd, page_offset); + return load_bzimage(fd); } /* This is a trivial little helper to align pages. Andi Kleen hated it because @@ -402,59 +402,45 @@ static unsigned long load_initrd(const char *name, unsigned long mem) int ifd; struct stat st; unsigned long len; - void *iaddr; ifd = open_or_die(name, O_RDONLY); /* fstat() is needed to get the file size. */ if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); - /* The length needs to be rounded up to a page size: mmap needs the - * address to be page aligned. */ + /* We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. */ len = page_align(st.st_size); - /* We map the initrd at the top of memory. */ - iaddr = mmap((void *)mem - len, st.st_size, - PROT_READ|PROT_EXEC|PROT_WRITE, - MAP_FIXED|MAP_PRIVATE, ifd, 0); - if (iaddr != (void *)mem - len) - err(1, "Mmaping initrd '%s' returned %p not %p", - name, iaddr, (void *)mem - len); + map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); /* Once a file is mapped, you can close the file descriptor. It's a * little odd, but quite useful. */ close(ifd); - verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr); + verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); /* We return the initrd size. */ return len; } -/* Once we know how much memory we have, and the address the Guest kernel - * expects, we can construct simple linear page tables which will get the Guest - * far enough into the boot to create its own. +/* Once we know how much memory we have, we can construct simple linear page + * tables which set virtual == physical which will get the Guest far enough + * into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to * know its size). */ static unsigned long setup_pagetables(unsigned long mem, - unsigned long initrd_size, - unsigned long page_offset) + unsigned long initrd_size) { - u32 *pgdir, *linear; + unsigned long *pgdir, *linear; unsigned int mapped_pages, i, linear_pages; - unsigned int ptes_per_page = getpagesize()/sizeof(u32); + unsigned int ptes_per_page = getpagesize()/sizeof(void *); - /* Ideally we map all physical memory starting at page_offset. - * However, if page_offset is 0xC0000000 we can only map 1G of physical - * (0xC0000000 + 1G overflows). */ - if (mem <= -page_offset) - mapped_pages = mem/getpagesize(); - else - mapped_pages = -page_offset/getpagesize(); + mapped_pages = mem/getpagesize(); /* Each PTE page can map ptes_per_page pages: how many do we need? */ linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page; /* We put the toplevel page directory page at the top of memory. */ - pgdir = (void *)mem - initrd_size - getpagesize(); + pgdir = from_guest_phys(mem) - initrd_size - getpagesize(); /* Now we use the next linear_pages pages as pte pages */ linear = (void *)pgdir - linear_pages*getpagesize(); @@ -465,20 +451,19 @@ static unsigned long setup_pagetables(unsigned long mem, for (i = 0; i < mapped_pages; i++) linear[i] = ((i * getpagesize()) | PAGE_PRESENT); - /* The top level points to the linear page table pages above. The - * entry representing page_offset points to the first one, and they - * continue from there. */ + /* The top level points to the linear page table pages above. */ for (i = 0; i < mapped_pages; i += ptes_per_page) { - pgdir[(i + page_offset/getpagesize())/ptes_per_page] - = (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT); + pgdir[i/ptes_per_page] + = ((to_guest_phys(linear) + i*sizeof(void *)) + | PAGE_PRESENT); } - verbose("Linear mapping of %u pages in %u pte pages at %p\n", - mapped_pages, linear_pages, linear); + verbose("Linear mapping of %u pages in %u pte pages at %#lx\n", + mapped_pages, linear_pages, to_guest_phys(linear)); /* We return the top level (guest-physical) address: the kernel needs * to know where it is. */ - return (unsigned long)pgdir; + return to_guest_phys(pgdir); } /* Simple routine to roll all the commandline arguments together with spaces @@ -498,14 +483,17 @@ static void concat(char *dst, char *args[]) /* This is where we actually tell the kernel to initialize the Guest. We saw * the arguments it expects when we looked at initialize() in lguest_user.c: - * the top physical page to allow, the top level pagetable, the entry point and - * the page_offset constant for the Guest. */ -static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) + * the base of guest "physical" memory, the top physical page to allow, the + * top level pagetable and the entry point for the Guest. */ +static int tell_kernel(unsigned long pgdir, unsigned long start) { - u32 args[] = { LHREQ_INITIALIZE, - top/getpagesize(), pgdir, start, page_offset }; + unsigned long args[] = { LHREQ_INITIALIZE, + (unsigned long)guest_base, + guest_limit / getpagesize(), pgdir, start }; int fd; + verbose("Guest: %p - %p (%#lx)\n", + guest_base, guest_base + guest_limit, guest_limit); fd = open_or_die("/dev/lguest", O_RDWR); if (write(fd, args, sizeof(args)) < 0) err(1, "Writing to /dev/lguest"); @@ -515,11 +503,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) } /*:*/ -static void set_fd(int fd, struct device_list *devices) +static void add_device_fd(int fd) { - FD_SET(fd, &devices->infds); - if (fd > devices->max_infd) - devices->max_infd = fd; + FD_SET(fd, &devices.infds); + if (fd > devices.max_infd) + devices.max_infd = fd; } /*L:200 @@ -537,36 +525,38 @@ static void set_fd(int fd, struct device_list *devices) * * This, of course, is merely a different *kind* of icky. */ -static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices) +static void wake_parent(int pipefd, int lguest_fd) { /* Add the pipe from the Launcher to the fdset in the device_list, so * we watch it, too. */ - set_fd(pipefd, devices); + add_device_fd(pipefd); for (;;) { - fd_set rfds = devices->infds; - u32 args[] = { LHREQ_BREAK, 1 }; + fd_set rfds = devices.infds; + unsigned long args[] = { LHREQ_BREAK, 1 }; /* Wait until input is ready from one of the devices. */ - select(devices->max_infd+1, &rfds, NULL, NULL, NULL); + select(devices.max_infd+1, &rfds, NULL, NULL, NULL); /* Is it a message from the Launcher? */ if (FD_ISSET(pipefd, &rfds)) { - int ignorefd; + int fd; /* If read() returns 0, it means the Launcher has * exited. We silently follow. */ - if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0) + if (read(pipefd, &fd, sizeof(fd)) == 0) exit(0); - /* Otherwise it's telling us there's a problem with one - * of the devices, and we should ignore that file - * descriptor from now on. */ - FD_CLR(ignorefd, &devices->infds); + /* Otherwise it's telling us to change what file + * descriptors we're to listen to. */ + if (fd >= 0) + FD_SET(fd, &devices.infds); + else + FD_CLR(-fd - 1, &devices.infds); } else /* Send LHREQ_BREAK command. */ write(lguest_fd, args, sizeof(args)); } } /* This routine just sets up a pipe to the Waker process. */ -static int setup_waker(int lguest_fd, struct device_list *device_list) +static int setup_waker(int lguest_fd) { int pipefd[2], child; @@ -580,7 +570,7 @@ static int setup_waker(int lguest_fd, struct device_list *device_list) if (child == 0) { /* Close the "writing" end of our copy of the pipe */ close(pipefd[1]); - wake_parent(pipefd[0], lguest_fd, device_list); + wake_parent(pipefd[0], lguest_fd); } /* Close the reading end of our copy of the pipe. */ close(pipefd[0]); @@ -602,83 +592,128 @@ static void *_check_pointer(unsigned long addr, unsigned int size, { /* We have to separately check addr and addr+size, because size could * be huge and addr + size might wrap around. */ - if (addr >= top || addr + size >= top) - errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr); + if (addr >= guest_limit || addr + size >= guest_limit) + errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); /* We return a pointer for the caller's convenience, now we know it's * safe to use. */ - return (void *)addr; + return from_guest_phys(addr); } /* A macro which transparently hands the line number to the real function. */ #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) -/* The Guest has given us the address of a "struct lguest_dma". We check it's - * OK and convert it to an iovec (which is a simple array of ptr/size - * pairs). */ -static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) +/* This function returns the next descriptor in the chain, or vq->vring.num. */ +static unsigned next_desc(struct virtqueue *vq, unsigned int i) { - unsigned int i; - struct lguest_dma *udma; - - /* First we make sure that the array memory itself is valid. */ - udma = check_pointer(dma, sizeof(*udma)); - /* Now we check each element */ - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { - /* A zero length ends the array. */ - if (!udma->len[i]) - break; + unsigned int next; - iov[i].iov_base = check_pointer(udma->addr[i], udma->len[i]); - iov[i].iov_len = udma->len[i]; - } - *num = i; + /* If this descriptor says it doesn't chain, we're done. */ + if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) + return vq->vring.num; + + /* Check they're not leading us off end of descriptors. */ + next = vq->vring.desc[i].next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + wmb(); - /* We return the pointer to where the caller should write the amount of - * the buffer used. */ - return &udma->used_len; + if (next >= vq->vring.num) + errx(1, "Desc next is %u", next); + + return next; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->vring.num (which + * is never a valid descriptor number) if none was found. */ +static unsigned get_vq_desc(struct virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + unsigned int i, head; + + /* Check it isn't doing very strange things with descriptor numbers. */ + if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) + errx(1, "Guest moved used index from %u to %u", + vq->last_avail_idx, vq->vring.avail->idx); + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->vring.avail->idx == vq->last_avail_idx) + return vq->vring.num; + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->vring.num) + errx(1, "Guest says index %u is available", head); + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + i = head; + do { + /* Grab the first descriptor, and check it's OK. */ + iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; + iov[*out_num + *in_num].iov_base + = check_pointer(vq->vring.desc[i].addr, + vq->vring.desc[i].len); + /* If this is an input descriptor, increment that count. */ + if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) + (*in_num)++; + else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) + errx(1, "Descriptor has out after in"); + (*out_num)++; + } + + /* If we've got too many, that implies a descriptor loop. */ + if (*out_num + *in_num > vq->vring.num) + errx(1, "Looped descriptor"); + } while ((i = next_desc(vq, i)) != vq->vring.num); + + return head; } -/* This routine gets a DMA buffer from the Guest for a given key, and converts - * it to an iovec array. It returns the interrupt the Guest wants when we're - * finished, and a pointer to the "used_len" field to fill in. */ -static u32 *get_dma_buffer(int fd, void *key, - struct iovec iov[], unsigned int *num, u32 *irq) +/* Once we've used one of their buffers, we tell them about it. We'll then + * want to send them an interrupt, using trigger_irq(). */ +static void add_used(struct virtqueue *vq, unsigned int head, int len) { - u32 buf[] = { LHREQ_GETDMA, (u32)key }; - unsigned long udma; - u32 *res; - - /* Ask the kernel for a DMA buffer corresponding to this key. */ - udma = write(fd, buf, sizeof(buf)); - /* They haven't registered any, or they're all used? */ - if (udma == (unsigned long)-1) - return NULL; - - /* Convert it into our iovec array */ - res = dma2iov(udma, iov, num); - /* The kernel stashes irq in ->used_len to get it out to us. */ - *irq = *res; - /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */ - return res; + struct vring_used_elem *used; + + /* Get a pointer to the next entry in the used ring. */ + used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; + used->id = head; + used->len = len; + /* Make sure buffer is written before we update index. */ + wmb(); + vq->vring.used->idx++; } -/* This is a convenient routine to send the Guest an interrupt. */ -static void trigger_irq(int fd, u32 irq) +/* This actually sends the interrupt for this virtqueue */ +static void trigger_irq(int fd, struct virtqueue *vq) { - u32 buf[] = { LHREQ_IRQ, irq }; + unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; + + if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) + return; + + /* Send the Guest an interrupt tell them we used something up. */ if (write(fd, buf, sizeof(buf)) != 0) - err(1, "Triggering irq %i", irq); + err(1, "Triggering irq %i", vq->config.irq); } -/* This simply sets up an iovec array where we can put data to be discarded. - * This happens when the Guest doesn't want or can't handle the input: we have - * to get rid of it somewhere, and if we bury it in the ceiling space it will - * start to smell after a week. */ -static void discard_iovec(struct iovec *iov, unsigned int *num) +/* And here's the combo meal deal. Supersize me! */ +static void add_used_and_trigger(int fd, struct virtqueue *vq, + unsigned int head, int len) { - static char discard_buf[1024]; - *num = 1; - iov->iov_base = discard_buf; - iov->iov_len = sizeof(discard_buf); + add_used(vq, head, len); + trigger_irq(fd, vq); } /* Here is the input terminal setting we save, and the routine to restore them @@ -701,38 +736,39 @@ struct console_abort /* This is the routine which handles console input (ie. stdin). */ static bool handle_console_input(int fd, struct device *dev) { - u32 irq = 0, *lenp; int len; - unsigned int num; - struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + unsigned int head, in_num, out_num; + struct iovec iov[dev->vq->vring.num]; struct console_abort *abort = dev->priv; - /* First we get the console buffer from the Guest. The key is dev->mem - * which was set to 0 in setup_console(). */ - lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq); - if (!lenp) { - /* If it's not ready for input, warn and set up to discard. */ - warn("console: no dma buffer!"); - discard_iovec(iov, &num); - } + /* First we need a console buffer from the Guests's input virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + + /* If they're not ready for input, stop listening to this file + * descriptor. We'll start again once they add an input buffer. */ + if (head == dev->vq->vring.num) + return false; + + if (out_num) + errx(1, "Output buffers in console in queue?"); /* This is why we convert to iovecs: the readv() call uses them, and so * it reads straight into the Guest's buffer. */ - len = readv(dev->fd, iov, num); + len = readv(dev->fd, iov, in_num); if (len <= 0) { /* This implies that the console is closed, is /dev/null, or - * something went terribly wrong. We still go through the rest - * of the logic, though, especially the exit handling below. */ + * something went terribly wrong. */ warnx("Failed to get console input, ignoring console."); - len = 0; + /* Put the input terminal back. */ + restore_term(); + /* Remove callback from input vq, so it doesn't restart us. */ + dev->vq->handle_output = NULL; + /* Stop listening to this fd: don't call us again. */ + return false; } - /* If we read the data into the Guest, fill in the length and send the - * interrupt. */ - if (lenp) { - *lenp = len; - trigger_irq(fd, irq); - } + /* Tell the Guest about the new input. */ + add_used_and_trigger(fd, dev->vq, head, len); /* Three ^C within one second? Exit. * @@ -746,7 +782,7 @@ static bool handle_console_input(int fd, struct device *dev) struct timeval now; gettimeofday(&now, NULL); if (now.tv_sec <= abort->start.tv_sec+1) { - u32 args[] = { LHREQ_BREAK, 0 }; + unsigned long args[] = { LHREQ_BREAK, 0 }; /* Close the fd so Waker will know it has to * exit. */ close(waker_fd); @@ -761,214 +797,163 @@ static bool handle_console_input(int fd, struct device *dev) /* Any other key resets the abort counter. */ abort->count = 0; - /* Now, if we didn't read anything, put the input terminal back and - * return failure (meaning, don't call us again). */ - if (!len) { - restore_term(); - return false; - } /* Everything went OK! */ return true; } -/* Handling console output is much simpler than input. */ -static u32 handle_console_output(int fd, const struct iovec *iov, - unsigned num, struct device*dev) +/* Handling output for console is simple: we just get all the output buffers + * and write them to stdout. */ +static void handle_console_output(int fd, struct virtqueue *vq) { - /* Whatever the Guest sends, write it to standard output. Return the - * number of bytes written. */ - return writev(STDOUT_FILENO, iov, num); -} - -/* Guest->Host network output is also pretty easy. */ -static u32 handle_tun_output(int fd, const struct iovec *iov, - unsigned num, struct device *dev) -{ - /* We put a flag in the "priv" pointer of the network device, and set - * it as soon as we see output. We'll see why in handle_tun_input() */ - *(bool *)dev->priv = true; - /* Whatever packet the Guest sent us, write it out to the tun - * device. */ - return writev(dev->fd, iov, num); + unsigned int head, out, in; + int len; + struct iovec iov[vq->vring.num]; + + /* Keep getting output buffers from the Guest until we run out. */ + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { + if (in) + errx(1, "Input buffers in output queue?"); + len = writev(STDOUT_FILENO, iov, out); + add_used_and_trigger(fd, vq, head, len); + } } -/* This matches the peer_key() in lguest_net.c. The key for any given slot - * is the address of the network device's page plus 4 * the slot number. */ -static unsigned long peer_offset(unsigned int peernum) +/* Handling output for network is also simple: we get all the output buffers + * and write them (ignoring the first element) to this device's file descriptor + * (stdout). */ +static void handle_net_output(int fd, struct virtqueue *vq) { - return 4 * peernum; + unsigned int head, out, in; + int len; + struct iovec iov[vq->vring.num]; + + /* Keep getting output buffers from the Guest until we run out. */ + while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { + if (in) + errx(1, "Input buffers in output queue?"); + /* Check header, but otherwise ignore it (we said we supported + * no features). */ + (void)convert(&iov[0], struct virtio_net_hdr); + len = writev(vq->dev->fd, iov+1, out-1); + add_used_and_trigger(fd, vq, head, len); + } } -/* This is where we handle a packet coming in from the tun device */ +/* This is where we handle a packet coming in from the tun device to our + * Guest. */ static bool handle_tun_input(int fd, struct device *dev) { - u32 irq = 0, *lenp; + unsigned int head, in_num, out_num; int len; - unsigned num; - struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; + struct iovec iov[dev->vq->vring.num]; + struct virtio_net_hdr *hdr; - /* First we get a buffer the Guest has bound to its key. */ - lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num, - &irq); - if (!lenp) { + /* First we need a network buffer from the Guests's recv virtqueue. */ + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + if (head == dev->vq->vring.num) { /* Now, it's expected that if we try to send a packet too - * early, the Guest won't be ready yet. This is why we set a - * flag when the Guest sends its first packet. If it's sent a - * packet we assume it should be ready to receive them. - * - * Actually, this is what the status bits in the descriptor are - * for: we should *use* them. FIXME! */ - if (*(bool *)dev->priv) + * early, the Guest won't be ready yet. Wait until the device + * status says it's ready. */ + /* FIXME: Actually want DRIVER_ACTIVE here. */ + if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) warn("network: no dma buffer!"); - discard_iovec(iov, &num); - } + /* We'll turn this back on if input buffers are registered. */ + return false; + } else if (out_num) + errx(1, "Output buffers in network recv queue?"); + + /* First element is the header: we set it to 0 (no features). */ + hdr = convert(&iov[0], struct virtio_net_hdr); + hdr->flags = 0; + hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; /* Read the packet from the device directly into the Guest's buffer. */ - len = readv(dev->fd, iov, num); + len = readv(dev->fd, iov+1, in_num-1); if (len <= 0) err(1, "reading network"); - /* Write the used_len, and trigger the interrupt for the Guest */ - if (lenp) { - *lenp = len; - trigger_irq(fd, irq); - } + /* Tell the Guest about the new packet. */ + add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); + verbose("tun input packet len %i [%02x %02x] (%s)\n", len, - ((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1], - lenp ? "sent" : "discarded"); + ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], + head != dev->vq->vring.num ? "sent" : "discarded"); + /* All good. */ return true; } -/* The last device handling routine is block output: the Guest has sent a DMA - * to the block device. It will have placed the command it wants in the - * "struct lguest_block_page". */ -static u32 handle_block_output(int fd, const struct iovec *iov, - unsigned num, struct device *dev) +/* This callback ensures we try again, in case we stopped console or net + * delivery because Guest didn't have any buffers. */ +static void enable_fd(int fd, struct virtqueue *vq) { - struct lguest_block_page *p = dev->mem; - u32 irq, *lenp; - unsigned int len, reply_num; - struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; - off64_t device_len, off = (off64_t)p->sector * 512; - - /* First we extract the device length from the dev->priv pointer. */ - device_len = *(off64_t *)dev->priv; - - /* We first check that the read or write is within the length of the - * block file. */ - if (off >= device_len) - err(1, "Bad offset %llu vs %llu", off, device_len); - /* Move to the right location in the block file. This shouldn't fail, - * but best to check. */ - if (lseek64(dev->fd, off, SEEK_SET) != off) - err(1, "Bad seek to sector %i", p->sector); - - verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); - - /* They were supposed to bind a reply buffer at key equal to the start - * of the block device memory. We need this to tell them when the - * request is finished. */ - lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); - if (!lenp) - err(1, "Block request didn't give us a dma buffer"); - - if (p->type) { - /* A write request. The DMA they sent contained the data, so - * write it out. */ - len = writev(dev->fd, iov, num); - /* Grr... Now we know how long the "struct lguest_dma" they - * sent was, we make sure they didn't try to write over the end - * of the block file (possibly extending it). */ - if (off + len > device_len) { - /* Trim it back to the correct length */ - ftruncate64(dev->fd, device_len); - /* Die, bad Guest, die. */ - errx(1, "Write past end %llu+%u", off, len); - } - /* The reply length is 0: we just send back an empty DMA to - * interrupt them and tell them the write is finished. */ - *lenp = 0; - } else { - /* A read request. They sent an empty DMA to start the - * request, and we put the read contents into the reply - * buffer. */ - len = readv(dev->fd, reply, reply_num); - *lenp = len; - } - - /* The result is 1 (done), 2 if there was an error (short read or - * write). */ - p->result = 1 + (p->bytes != len); - /* Now tell them we've used their reply buffer. */ - trigger_irq(fd, irq); - - /* We're supposed to return the number of bytes of the output buffer we - * used. But the block device uses the "result" field instead, so we - * don't bother. */ - return 0; + add_device_fd(vq->dev->fd); + /* Tell waker to listen to it again */ + write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); } -/* This is the generic routine we call when the Guest sends some DMA out. */ -static void handle_output(int fd, unsigned long dma, unsigned long key, - struct device_list *devices) +/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ +static void handle_output(int fd, unsigned long addr) { struct device *i; - u32 *lenp; - struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; - unsigned num = 0; - - /* Convert the "struct lguest_dma" they're sending to a "struct - * iovec". */ - lenp = dma2iov(dma, iov, &num); - - /* Check each device: if they expect output to this key, tell them to - * handle it. */ - for (i = devices->dev; i; i = i->next) { - if (i->handle_output && key == i->watch_key) { - /* We write the result straight into the used_len field - * for them. */ - *lenp = i->handle_output(fd, iov, num, i); - return; + struct virtqueue *vq; + + /* Check each virtqueue. */ + for (i = devices.dev; i; i = i->next) { + for (vq = i->vq; vq; vq = vq->next) { + if (vq->config.pfn == addr/getpagesize() + && vq->handle_output) { + verbose("Output to %s\n", vq->dev->name); + vq->handle_output(fd, vq); + return; + } } } - /* This can happen: the kernel sends any SEND_DMA which doesn't match - * another Guest to us. It could be that another Guest just left a - * network, for example. But it's unusual. */ - warnx("Pending dma %p, key %p", (void *)dma, (void *)key); + /* Early console write is done using notify on a nul-terminated string + * in Guest memory. */ + if (addr >= guest_limit) + errx(1, "Bad NOTIFY %#lx", addr); + + write(STDOUT_FILENO, from_guest_phys(addr), + strnlen(from_guest_phys(addr), guest_limit - addr)); } /* This is called when the waker wakes us up: check for incoming file * descriptors. */ -static void handle_input(int fd, struct device_list *devices) +static void handle_input(int fd) { /* select() wants a zeroed timeval to mean "don't wait". */ struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; for (;;) { struct device *i; - fd_set fds = devices->infds; + fd_set fds = devices.infds; /* If nothing is ready, we're done. */ - if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0) + if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) break; /* Otherwise, call the device(s) which have readable * file descriptors and a method of handling them. */ - for (i = devices->dev; i; i = i->next) { + for (i = devices.dev; i; i = i->next) { if (i->handle_input && FD_ISSET(i->fd, &fds)) { + int dev_fd; + if (i->handle_input(fd, i)) + continue; + /* If handle_input() returns false, it means we - * should no longer service it. - * handle_console_input() does this. */ - if (!i->handle_input(fd, i)) { - /* Clear it from the set of input file - * descriptors kept at the head of the - * device list. */ - FD_CLR(i->fd, &devices->infds); - /* Tell waker to ignore it too... */ - write(waker_fd, &i->fd, sizeof(i->fd)); - } + * should no longer service it. Networking and + * console do this when there's no input + * buffers to deliver into. Console also uses + * it when it discovers that stdin is + * closed. */ + FD_CLR(i->fd, &devices.infds); + /* Tell waker to ignore it too, by sending a + * negative fd number (-1, since 0 is a valid + * FD number). */ + dev_fd = -i->fd - 1; + write(waker_fd, &dev_fd, sizeof(dev_fd)); } } } @@ -982,43 +967,93 @@ static void handle_input(int fd, struct device_list *devices) * routines to allocate them. * * This routine allocates a new "struct lguest_device_desc" from descriptor - * table in the devices array just above the Guest's normal memory. */ -static struct lguest_device_desc * -new_dev_desc(struct lguest_device_desc *descs, - u16 type, u16 features, u16 num_pages) + * table just above the Guest's normal memory. It returns a pointer to that + * descriptor. */ +static struct lguest_device_desc *new_dev_desc(u16 type) { - unsigned int i; + struct lguest_device_desc *d; - for (i = 0; i < LGUEST_MAX_DEVICES; i++) { - if (!descs[i].type) { - descs[i].type = type; - descs[i].features = features; - descs[i].num_pages = num_pages; - /* If they said the device needs memory, we allocate - * that now, bumping up the top of Guest memory. */ - if (num_pages) { - map_zeroed_pages(top, num_pages); - descs[i].pfn = top/getpagesize(); - top += num_pages*getpagesize(); - } - return &descs[i]; - } - } - errx(1, "too many devices"); + /* We only have one page for all the descriptors. */ + if (devices.desc_used + sizeof(*d) > getpagesize()) + errx(1, "Too many devices"); + + /* We don't need to set config_len or status: page is 0 already. */ + d = (void *)devices.descpage + devices.desc_used; + d->type = type; + devices.desc_used += sizeof(*d); + + return d; } -/* This monster routine does all the creation and setup of a new device, - * including caling new_dev_desc() to allocate the descriptor and device - * memory. */ -static struct device *new_device(struct device_list *devices, - u16 type, u16 num_pages, u16 features, - int fd, - bool (*handle_input)(int, struct device *), - unsigned long watch_off, - u32 (*handle_output)(int, - const struct iovec *, - unsigned, - struct device *)) +/* Each device descriptor is followed by some configuration information. + * The first byte is a "status" byte for the Guest to report what's happening. + * After that are fields: u8 type, u8 len, [... len bytes...]. + * + * This routine adds a new field to an existing device's descriptor. It only + * works for the last device, but that's OK because that's how we use it. */ +static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) +{ + /* This is the last descriptor, right? */ + assert(devices.descpage + devices.desc_used + == (u8 *)(dev->desc + 1) + dev->desc->config_len); + + /* We only have one page of device descriptions. */ + if (devices.desc_used + 2 + len > getpagesize()) + errx(1, "Too many devices"); + + /* Copy in the new config header: type then length. */ + devices.descpage[devices.desc_used++] = type; + devices.descpage[devices.desc_used++] = len; + memcpy(devices.descpage + devices.desc_used, c, len); + devices.desc_used += len; + + /* Update the device descriptor length: two byte head then data. */ + dev->desc->config_len += 2 + len; +} + +/* This routine adds a virtqueue to a device. We specify how many descriptors + * the virtqueue is to have. */ +static void add_virtqueue(struct device *dev, unsigned int num_descs, + void (*handle_output)(int fd, struct virtqueue *me)) +{ + unsigned int pages; + struct virtqueue **i, *vq = malloc(sizeof(*vq)); + void *p; + + /* First we need some pages for this virtqueue. */ + pages = (vring_size(num_descs) + getpagesize() - 1) / getpagesize(); + p = get_pages(pages); + + /* Initialize the configuration. */ + vq->config.num = num_descs; + vq->config.irq = devices.next_irq++; + vq->config.pfn = to_guest_phys(p) / getpagesize(); + + /* Initialize the vring. */ + vring_init(&vq->vring, num_descs, p); + + /* Add the configuration information to this device's descriptor. */ + add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, + sizeof(vq->config), &vq->config); + + /* Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. */ + for (i = &dev->vq; *i; i = &(*i)->next); + *i = vq; + + /* Link virtqueue back to device. */ + vq->dev = dev; + + /* Set up handler. */ + vq->handle_output = handle_output; + if (!handle_output) + vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; +} + +/* This routine does all the creation and setup of a new device, including + * caling new_dev_desc() to allocate the descriptor and device memory. */ +static struct device *new_device(const char *name, u16 type, int fd, + bool (*handle_input)(int, struct device *)) { struct device *dev = malloc(sizeof(*dev)); @@ -1026,27 +1061,25 @@ static struct device *new_device(struct device_list *devices, * easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line * is eth0, the first block device /dev/lgba, etc. */ - *devices->lastdev = dev; + *devices.lastdev = dev; dev->next = NULL; - devices->lastdev = &dev->next; + devices.lastdev = &dev->next; /* Now we populate the fields one at a time. */ dev->fd = fd; /* If we have an input handler for this file descriptor, then we add it * to the device_list's fdset and maxfd. */ if (handle_input) - set_fd(dev->fd, devices); - dev->desc = new_dev_desc(devices->descs, type, features, num_pages); - dev->mem = (void *)(dev->desc->pfn * getpagesize()); + add_device_fd(dev->fd); + dev->desc = new_dev_desc(type); dev->handle_input = handle_input; - dev->watch_key = (unsigned long)dev->mem + watch_off; - dev->handle_output = handle_output; + dev->name = name; return dev; } /* Our first setup routine is the console. It's a fairly simple device, but * UNIX tty handling makes it uglier than it could be. */ -static void setup_console(struct device_list *devices) +static void setup_console(void) { struct device *dev; @@ -1062,127 +1095,38 @@ static void setup_console(struct device_list *devices) atexit(restore_term); } - /* We don't currently require any memory for the console, so we ask for - * 0 pages. */ - dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0, - STDIN_FILENO, handle_console_input, - LGUEST_CONSOLE_DMA_KEY, handle_console_output); + dev = new_device("console", VIRTIO_ID_CONSOLE, + STDIN_FILENO, handle_console_input); /* We store the console state in dev->priv, and initialize it. */ dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; - verbose("device %p: console\n", - (void *)(dev->desc->pfn * getpagesize())); -} -/* Setting up a block file is also fairly straightforward. */ -static void setup_block_file(const char *filename, struct device_list *devices) -{ - int fd; - struct device *dev; - off64_t *device_len; - struct lguest_block_page *p; - - /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We - * open with O_DIRECT because otherwise our benchmarks go much too - * fast. */ - fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT); - - /* We want one page, and have no input handler (the block file never - * has anything interesting to say to us). Our timing will be quite - * random, so it should be a reasonable randomness source. */ - dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1, - LGUEST_DEVICE_F_RANDOMNESS, - fd, NULL, 0, handle_block_output); - - /* We store the device size in the private area */ - device_len = dev->priv = malloc(sizeof(*device_len)); - /* This is the safe way of establishing the size of our device: it - * might be a normal file or an actual block device like /dev/hdb. */ - *device_len = lseek64(fd, 0, SEEK_END); - - /* The device memory is a "struct lguest_block_page". It's zeroed - * already, we just need to put in the device size. Block devices - * think in sectors (ie. 512 byte chunks), so we translate here. */ - p = dev->mem; - p->num_sectors = *device_len/512; - verbose("device %p: block %i sectors\n", - (void *)(dev->desc->pfn * getpagesize()), p->num_sectors); + /* The console needs two virtqueues: the input then the output. When + * they put something the input queue, we make sure we're listening to + * stdin. When they put something in the output queue, we write it to + * stdout. */ + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); + + verbose("device %u: console\n", devices.device_num++); } +/*:*/ -/* - * Network Devices. +/*M:010 Inter-guest networking is an interesting area. Simplest is to have a + * --sharenet=<name> option which opens or creates a named pipe. This can be + * used to send packets to another guest in a 1:1 manner. * - * Setting up network devices is quite a pain, because we have three types. - * First, we have the inter-Guest network. This is a file which is mapped into - * the address space of the Guests who are on the network. Because it is a - * shared mapping, the same page underlies all the devices, and they can send - * DMA to each other. + * More sopisticated is to use one of the tools developed for project like UML + * to do networking. * - * Remember from our network driver, the Guest is told what slot in the page it - * is to use. We use exclusive fnctl locks to reserve a slot. If another - * Guest is using a slot, the lock will fail and we try another. Because fnctl - * locks are cleaned up automatically when we die, this cleverly means that our - * reservation on the slot will vanish if we crash. */ -static unsigned int find_slot(int netfd, const char *filename) -{ - struct flock fl; - - fl.l_type = F_WRLCK; - fl.l_whence = SEEK_SET; - fl.l_len = 1; - /* Try a 1 byte lock in each possible position number */ - for (fl.l_start = 0; - fl.l_start < getpagesize()/sizeof(struct lguest_net); - fl.l_start++) { - /* If we succeed, return the slot number. */ - if (fcntl(netfd, F_SETLK, &fl) == 0) - return fl.l_start; - } - errx(1, "No free slots in network file %s", filename); -} - -/* This function sets up the network file */ -static void setup_net_file(const char *filename, - struct device_list *devices) -{ - int netfd; - struct device *dev; - - /* We don't use open_or_die() here: for friendliness we create the file - * if it doesn't already exist. */ - netfd = open(filename, O_RDWR, 0); - if (netfd < 0) { - if (errno == ENOENT) { - netfd = open(filename, O_RDWR|O_CREAT, 0600); - if (netfd >= 0) { - /* If we succeeded, initialize the file with a - * blank page. */ - char page[getpagesize()]; - memset(page, 0, sizeof(page)); - write(netfd, page, sizeof(page)); - } - } - if (netfd < 0) - err(1, "cannot open net file '%s'", filename); - } - - /* We need 1 page, and the features indicate the slot to use and that - * no checksum is needed. We never touch this device again; it's - * between the Guests on the network, so we don't register input or - * output handlers. */ - dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, - find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM, - -1, NULL, 0, NULL); - - /* Map the shared file. */ - if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE, - MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem) - err(1, "could not mmap '%s'", filename); - verbose("device %p: shared net %s, peer %i\n", - (void *)(dev->desc->pfn * getpagesize()), filename, - dev->desc->features & ~LGUEST_NET_F_NOCSUM); -} -/*:*/ + * Faster is to do virtio bonding in kernel. Doing this 1:1 would be + * completely generic ("here's my vring, attach to your vring") and would work + * for any traffic. Of course, namespace and permissions issues need to be + * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide + * multiple inter-guest channels behind one interface, although it would + * require some manner of hotplugging new virtio channels. + * + * Finally, we could implement a virtio network switch in the kernel. :*/ static u32 str2ip(const char *ipaddr) { @@ -1217,7 +1161,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) /* This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer (in practice, the Host's slot in the network device's memory). */ + * pointer. */ static void configure_device(int fd, const char *devname, u32 ipaddr, unsigned char hwaddr[6]) { @@ -1243,18 +1187,18 @@ static void configure_device(int fd, const char *devname, u32 ipaddr, memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); } -/*L:195 The other kind of network is a Host<->Guest network. This can either - * use briding or routing, but the principle is the same: it uses the "tun" - * device to inject packets into the Host as if they came in from a normal - * network card. We just shunt packets between the Guest and the tun - * device. */ -static void setup_tun_net(const char *arg, struct device_list *devices) +/*L:195 Our network is a Host<->Guest network. This can either use bridging or + * routing, but the principle is the same: it uses the "tun" device to inject + * packets into the Host as if they came in from a normal network card. We + * just shunt packets between the Guest and the tun device. */ +static void setup_tun_net(const char *arg) { struct device *dev; struct ifreq ifr; int netfd, ipfd; u32 ip; const char *br_name = NULL; + u8 hwaddr[6]; /* We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell @@ -1270,21 +1214,13 @@ static void setup_tun_net(const char *arg, struct device_list *devices) * device: trust us! */ ioctl(netfd, TUNSETNOCSUM, 1); - /* We create the net device with 1 page, using the features field of - * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and - * that the device has fairly random timing. We do *not* specify - * LGUEST_NET_F_NOCSUM: these packets can reach the real world. - * - * We will put our MAC address is slot 0 for the Guest to see, so - * it will send packets to us using the key "peer_offset(0)": */ - dev = new_device(devices, LGUEST_DEVICE_T_NET, 1, - NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd, - handle_tun_input, peer_offset(0), handle_tun_output); + /* First we create a new network device. */ + dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); - /* We keep a flag which says whether we've seen packets come out from - * this network device. */ - dev->priv = malloc(sizeof(bool)); - *(bool *)dev->priv = false; + /* Network devices need a receive and a send queue, just like + * console. */ + add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); + add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); /* We need a socket to perform the magic network ioctls to bring up the * tap interface, connect to the bridge etc. Any socket will do! */ @@ -1300,44 +1236,251 @@ static void setup_tun_net(const char *arg, struct device_list *devices) } else /* It is an IP address to set up the device with */ ip = str2ip(arg); - /* We are peer 0, ie. first slot, so we hand dev->mem to this routine - * to write the MAC address at the start of the device memory. */ - configure_device(ipfd, ifr.ifr_name, ip, dev->mem); + /* Set up the tun device, and get the mac address for the interface. */ + configure_device(ipfd, ifr.ifr_name, ip, hwaddr); - /* Set "promisc" bit: we want every single packet if we're going to - * bridge to other machines (and otherwise it doesn't matter). */ - *((u8 *)dev->mem) |= 0x1; + /* Tell Guest what MAC address to use. */ + add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); + /* We don't seed the socket any more; setup is done. */ close(ipfd); - verbose("device %p: tun net %u.%u.%u.%u\n", - (void *)(dev->desc->pfn * getpagesize()), - (u8)(ip>>24), (u8)(ip>>16), (u8)(ip>>8), (u8)ip); + verbose("device %u: tun net %u.%u.%u.%u\n", + devices.device_num++, + (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); if (br_name) verbose("attached to bridge: %s\n", br_name); } + + +/* + * Block device. + * + * Serving a block device is really easy: the Guest asks for a block number and + * we read or write that position in the file. + * + * Unfortunately, this is amazingly slow: the Guest waits until the read is + * finished before running anything else, even if it could be doing useful + * work. We could use async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. + * + * So we farm the I/O out to thread, and communicate with it via a pipe. */ + +/* This hangs off device->priv, with the data. */ +struct vblk_info +{ + /* The size of the file. */ + off64_t len; + + /* The file descriptor for the file. */ + int fd; + + /* IO thread listens on this file descriptor [0]. */ + int workpipe[2]; + + /* IO thread writes to this file descriptor to mark it done, then + * Launcher triggers interrupt to Guest. */ + int done_fd; +}; + +/* This is the core of the I/O thread. It returns true if it did something. */ +static bool service_io(struct device *dev) +{ + struct vblk_info *vblk = dev->priv; + unsigned int head, out_num, in_num, wlen; + int ret; + struct virtio_blk_inhdr *in; + struct virtio_blk_outhdr *out; + struct iovec iov[dev->vq->vring.num]; + off64_t off; + + head = get_vq_desc(dev->vq, iov, &out_num, &in_num); + if (head == dev->vq->vring.num) + return false; + + if (out_num == 0 || in_num == 0) + errx(1, "Bad virtblk cmd %u out=%u in=%u", + head, out_num, in_num); + + out = convert(&iov[0], struct virtio_blk_outhdr); + in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); + off = out->sector * 512; + + /* This is how we implement barriers. Pretty poor, no? */ + if (out->type & VIRTIO_BLK_T_BARRIER) + fdatasync(vblk->fd); + + if (out->type & VIRTIO_BLK_T_SCSI_CMD) { + fprintf(stderr, "Scsi commands unsupported\n"); + in->status = VIRTIO_BLK_S_UNSUPP; + wlen = sizeof(in); + } else if (out->type & VIRTIO_BLK_T_OUT) { + /* Write */ + + /* Move to the right location in the block file. This can fail + * if they try to write past end. */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = writev(vblk->fd, iov+1, out_num-1); + verbose("WRITE to sector %llu: %i\n", out->sector, ret); + + /* Grr... Now we know how long the descriptor they sent was, we + * make sure they didn't try to write over the end of the block + * file (possibly extending it). */ + if (ret > 0 && off + ret > vblk->len) { + /* Trim it back to the correct length */ + ftruncate64(vblk->fd, vblk->len); + /* Die, bad Guest, die. */ + errx(1, "Write past end %llu+%u", off, ret); + } + wlen = sizeof(in); + in->status = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); + } else { + /* Read */ + + /* Move to the right location in the block file. This can fail + * if they try to read past end. */ + if (lseek64(vblk->fd, off, SEEK_SET) != off) + err(1, "Bad seek to sector %llu", out->sector); + + ret = readv(vblk->fd, iov+1, in_num-1); + verbose("READ from sector %llu: %i\n", out->sector, ret); + if (ret >= 0) { + wlen = sizeof(in) + ret; + in->status = VIRTIO_BLK_S_OK; + } else { + wlen = sizeof(in); + in->status = VIRTIO_BLK_S_IOERR; + } + } + + /* We can't trigger an IRQ, because we're not the Launcher. It does + * that when we tell it we're done. */ + add_used(dev->vq, head, wlen); + return true; +} + +/* This is the thread which actually services the I/O. */ +static int io_thread(void *_dev) +{ + struct device *dev = _dev; + struct vblk_info *vblk = dev->priv; + char c; + + /* Close other side of workpipe so we get 0 read when main dies. */ + close(vblk->workpipe[1]); + /* Close the other side of the done_fd pipe. */ + close(dev->fd); + + /* When this read fails, it means Launcher died, so we follow. */ + while (read(vblk->workpipe[0], &c, 1) == 1) { + /* We acknowledge each request immediately, to reduce latency, + * rather than waiting until we've done them all. I haven't + * measured to see if it makes any difference. */ + while (service_io(dev)) + write(vblk->done_fd, &c, 1); + } + return 0; +} + +/* When the thread says some I/O is done, we interrupt the Guest. */ +static bool handle_io_finish(int fd, struct device *dev) +{ + char c; + + /* If child died, presumably it printed message. */ + if (read(dev->fd, &c, 1) != 1) + exit(1); + + /* It did some work, so trigger the irq. */ + trigger_irq(fd, dev->vq); + return true; +} + +/* When the Guest submits some I/O, we wake the I/O thread. */ +static void handle_virtblk_output(int fd, struct virtqueue *vq) +{ + struct vblk_info *vblk = vq->dev->priv; + char c = 0; + + /* Wake up I/O thread and tell it to go to work! */ + if (write(vblk->workpipe[1], &c, 1) != 1) + /* Presumably it indicated why it died. */ + exit(1); +} + +/* This creates a virtual block device. */ +static void setup_block_file(const char *filename) +{ + int p[2]; + struct device *dev; + struct vblk_info *vblk; + void *stack; + u64 cap; + unsigned int val; + + /* This is the pipe the I/O thread will use to tell us I/O is done. */ + pipe(p); + + /* The device responds to return from I/O thread. */ + dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); + + /* The device has a virtqueue. */ + add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); + + /* Allocate the room for our own bookkeeping */ + vblk = dev->priv = malloc(sizeof(*vblk)); + + /* First we open the file and store the length. */ + vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); + vblk->len = lseek64(vblk->fd, 0, SEEK_END); + + /* Tell Guest how many sectors this device has. */ + cap = cpu_to_le64(vblk->len / 512); + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); + + /* Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. */ + val = cpu_to_le32(VIRTQUEUE_NUM - 2); + add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); + + /* The I/O thread writes to this end of the pipe when done. */ + vblk->done_fd = p[1]; + + /* This is how we tell the I/O thread about more work. */ + pipe(vblk->workpipe); + + /* Create stack for thread and run it */ + stack = malloc(32768); + if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) + err(1, "Creating clone"); + + /* We don't need to keep the I/O thread's end of the pipes open. */ + close(vblk->done_fd); + close(vblk->workpipe[0]); + + verbose("device %u: virtblock %llu sectors\n", + devices.device_num, cap); +} /* That's the end of device setup. */ /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves * its input and output, and finally, lays it to rest. */ -static void __attribute__((noreturn)) -run_guest(int lguest_fd, struct device_list *device_list) +static void __attribute__((noreturn)) run_guest(int lguest_fd) { for (;;) { - u32 args[] = { LHREQ_BREAK, 0 }; - unsigned long arr[2]; + unsigned long args[] = { LHREQ_BREAK, 0 }; + unsigned long notify_addr; int readval; /* We read from the /dev/lguest device to run the Guest. */ - readval = read(lguest_fd, arr, sizeof(arr)); - - /* The read can only really return sizeof(arr) (the Guest did a - * SEND_DMA to us), or an error. */ + readval = read(lguest_fd, ¬ify_addr, sizeof(notify_addr)); - /* For a successful read, arr[0] is the address of the "struct - * lguest_dma", and arr[1] is the key the Guest sent to. */ - if (readval == sizeof(arr)) { - handle_output(lguest_fd, arr[0], arr[1], device_list); + /* One unsigned long means the Guest did HCALL_NOTIFY */ + if (readval == sizeof(notify_addr)) { + verbose("Notify on address %#lx\n", notify_addr); + handle_output(lguest_fd, notify_addr); continue; /* ENOENT means the Guest died. Reading tells us why. */ } else if (errno == ENOENT) { @@ -1351,7 +1494,7 @@ run_guest(int lguest_fd, struct device_list *device_list) /* Service input, then unset the BREAK which releases * the Waker. */ - handle_input(lguest_fd, device_list); + handle_input(lguest_fd); if (write(lguest_fd, args, sizeof(args)) < 0) err(1, "Resetting break"); } @@ -1365,7 +1508,6 @@ run_guest(int lguest_fd, struct device_list *device_list) static struct option opts[] = { { "verbose", 0, NULL, 'v' }, - { "sharenet", 1, NULL, 's' }, { "tunnet", 1, NULL, 't' }, { "block", 1, NULL, 'b' }, { "initrd", 1, NULL, 'i' }, @@ -1374,37 +1516,21 @@ static struct option opts[] = { static void usage(void) { errx(1, "Usage: lguest [--verbose] " - "[--sharenet=<filename>|--tunnet=(<ipaddr>|bridge:<bridgename>)\n" + "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" "|--block=<filename>|--initrd=<filename>]...\n" "<mem-in-mb> vmlinux [args...]"); } -/*L:100 The Launcher code itself takes us out into userspace, that scary place - * where pointers run wild and free! Unfortunately, like most userspace - * programs, it's quite boring (which is why everyone like to hack on the - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it - * will get you through this section. Or, maybe not. - * - * The Launcher binary sits up high, usually starting at address 0xB8000000. - * Everything below this is the "physical" memory for the Guest. For example, - * if the Guest were to write a "1" at physical address 0, we would see a "1" - * in the Launcher at "(int *)0". Guest physical == Launcher virtual. - * - * This can be tough to get your head around, but usually it just means that we - * don't need to do any conversion when the Guest gives us it's "physical" - * addresses. - */ +/*L:105 The main routine is where the real work begins: */ int main(int argc, char *argv[]) { - /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size - * of the (optional) initrd. */ - unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0; + /* Memory, top-level pagetable, code startpoint and size of the + * (optional) initrd. */ + unsigned long mem = 0, pgdir, start, initrd_size = 0; /* A temporary and the /dev/lguest file descriptor. */ int i, c, lguest_fd; - /* The list of Guest devices, based on command line arguments. */ - struct device_list device_list; - /* The boot information for the Guest: at guest-physical address 0. */ - void *boot = (void *)0; + /* The boot information for the Guest. */ + struct boot_params *boot; /* If they specify an initrd file to load. */ const char *initrd_name = NULL; @@ -1412,11 +1538,12 @@ int main(int argc, char *argv[]) * device receive input from a file descriptor, we keep an fdset * (infds) and the maximum fd number (max_infd) with the head of the * list. We also keep a pointer to the last device, for easy appending - * to the list. */ - device_list.max_infd = -1; - device_list.dev = NULL; - device_list.lastdev = &device_list.dev; - FD_ZERO(&device_list.infds); + * to the list. Finally, we keep the next interrupt number to hand out + * (1: remember that 0 is used by the timer). */ + FD_ZERO(&devices.infds); + devices.max_infd = -1; + devices.lastdev = &devices.dev; + devices.next_irq = 1; /* We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command @@ -1424,9 +1551,16 @@ int main(int argc, char *argv[]) * of memory now. */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { - mem = top = atoi(argv[i]) * 1024 * 1024; - device_list.descs = map_zeroed_pages(top, 1); - top += getpagesize(); + mem = atoi(argv[i]) * 1024 * 1024; + /* We start by mapping anonymous pages over all of + * guest-physical memory range. This fills it with 0, + * and ensures that the Guest won't be killed when it + * tries to access it. */ + guest_base = map_zeroed_pages(mem / getpagesize() + + DEVICE_PAGES); + guest_limit = mem; + guest_max = mem + DEVICE_PAGES*getpagesize(); + devices.descpage = get_pages(1); break; } } @@ -1437,14 +1571,11 @@ int main(int argc, char *argv[]) case 'v': verbose = true; break; - case 's': - setup_net_file(optarg, &device_list); - break; case 't': - setup_tun_net(optarg, &device_list); + setup_tun_net(optarg); break; case 'b': - setup_block_file(optarg, &device_list); + setup_block_file(optarg); break; case 'i': initrd_name = optarg; @@ -1459,56 +1590,60 @@ int main(int argc, char *argv[]) if (optind + 2 > argc) usage(); - /* We always have a console device */ - setup_console(&device_list); + verbose("Guest base is at %p\n", guest_base); - /* We start by mapping anonymous pages over all of guest-physical - * memory range. This fills it with 0, and ensures that the Guest - * won't be killed when it tries to access it. */ - map_zeroed_pages(0, mem / getpagesize()); + /* We always have a console device */ + setup_console(); /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), - &page_offset); + start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); + + /* Boot information is stashed at physical address 0 */ + boot = from_guest_phys(0); /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); /* These are the location in the Linux boot header where the * start and size of the initrd are expected to be found. */ - *(unsigned long *)(boot+0x218) = mem - initrd_size; - *(unsigned long *)(boot+0x21c) = initrd_size; + boot->hdr.ramdisk_image = mem - initrd_size; + boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ - *(unsigned char *)(boot+0x210) = 0xFF; + boot->hdr.type_of_loader = 0xFF; } /* Set up the initial linear pagetables, starting below the initrd. */ - pgdir = setup_pagetables(mem, initrd_size, page_offset); + pgdir = setup_pagetables(mem, initrd_size); /* The Linux boot header contains an "E820" memory map: ours is a * simple, single region. */ - *(char*)(boot+E820NR) = 1; - *((struct e820entry *)(boot+E820MAP)) - = ((struct e820entry) { 0, mem, E820_RAM }); + boot->e820_entries = 1; + boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); /* The boot header contains a command line pointer: we put the command - * line after the boot header (at address 4096) */ - *(void **)(boot + 0x228) = boot + 4096; - concat(boot + 4096, argv+optind+2); + * line after the boot header. */ + boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); + concat((char *)(boot + 1), argv+optind+2); + + /* Boot protocol version: 2.07 supports the fields for lguest. */ + boot->hdr.version = 0x207; + + /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ + boot->hdr.hardware_subarch = 1; - /* The guest type value of "1" tells the Guest it's under lguest. */ - *(int *)(boot + 0x23c) = 1; + /* Tell the entry path not to try to reload segment registers. */ + boot->hdr.loadflags |= KEEP_SEGMENTS; /* We tell the kernel to initialize the Guest: this returns the open * /dev/lguest file descriptor. */ - lguest_fd = tell_kernel(pgdir, start, page_offset); + lguest_fd = tell_kernel(pgdir, start); /* We fork off a child process, which wakes the Launcher whenever one * of the input file descriptors needs attention. Otherwise we would * run the Guest until it tries to output something. */ - waker_fd = setup_waker(lguest_fd, &device_list); + waker_fd = setup_waker(lguest_fd); /* Finally, run the Guest. This doesn't return. */ - run_guest(lguest_fd, &device_list); + run_guest(lguest_fd); } /*:*/ diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 821617b..7885ab2 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt @@ -6,7 +6,7 @@ Lguest is designed to be a minimal hypervisor for the Linux kernel, for Linux developers and users to experiment with virtualization with the minimum of complexity. Nonetheless, it should have sufficient features to make it useful for specific tasks, and, of course, you are -encouraged to fork and enhance it. +encouraged to fork and enhance it (see drivers/lguest/README). Features: @@ -23,19 +23,30 @@ Developer features: Running Lguest: -- Lguest runs the same kernel as guest and host. You can configure - them differently, but usually it's easiest not to. +- The easiest way to run lguest is to use same kernel as guest and host. + You can configure them differently, but usually it's easiest not to. You will need to configure your kernel with the following options: - CONFIG_HIGHMEM64G=n ("High Memory Support" "64GB")[1] - CONFIG_TUN=y/m ("Universal TUN/TAP device driver support") - CONFIG_EXPERIMENTAL=y ("Prompt for development and/or incomplete code/drivers") - CONFIG_PARAVIRT=y ("Paravirtualization support (EXPERIMENTAL)") - CONFIG_LGUEST=y/m ("Linux hypervisor example code") - - and I recommend: - CONFIG_HZ=100 ("Timer frequency")[2] + "General setup": + "Prompt for development and/or incomplete code/drivers" = Y + (CONFIG_EXPERIMENTAL=y) + + "Processor type and features": + "Paravirtualized guest support" = Y + "Lguest guest support" = Y + "High Memory Support" = off/4GB + "Alignment value to which kernel should be aligned" = 0x100000 + (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and + CONFIG_PHYSICAL_ALIGN=0x100000) + + "Device Drivers": + "Network device support" + "Universal TUN/TAP device driver support" = M/Y + (CONFIG_TUN=m) + "Virtualization" + "Linux hypervisor example code" = M/Y + (CONFIG_LGUEST=m) - A tool called "lguest" is available in this directory: type "make" to build it. If you didn't build your kernel in-tree, use "make @@ -51,14 +62,17 @@ Running Lguest: dd if=/dev/zero of=rootfile bs=1M count=2048 qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d + Make sure that you install a getty on /dev/hvc0 if you want to log in on the + console! + - "modprobe lg" if you built it as a module. - Run an lguest as root: - Documentation/lguest/lguest 64m vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/lgba + Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda Explanation: - 64m: the amount of memory to use. + 64: the amount of memory to use, in MB. vmlinux: the kernel image found in the top of your build directory. You can also use a standard bzImage. @@ -66,10 +80,10 @@ Running Lguest: --tunnet=192.168.19.1: configures a "tap" device for networking with this IP address. - --block=rootfile: a file or block device which becomes /dev/lgba + --block=rootfile: a file or block device which becomes /dev/vda inside the guest. - root=/dev/lgba: this (and anything else on the command line) are + root=/dev/vda: this (and anything else on the command line) are kernel boot parameters. - Configuring networking. I usually have the host masquerade, using @@ -99,31 +113,7 @@ Running Lguest: "--sharenet=<filename>": any two guests using the same file are on the same network. This file is created if it does not exist. -Lguest I/O model: - -Lguest uses a simplified DMA model plus shared memory for I/O. Guests -can communicate with each other if they share underlying memory -(usually by the lguest program mmaping the same file), but they can -use any non-shared memory to communicate with the lguest process. - -Guests can register DMA buffers at any key (must be a valid physical -address) using the LHCALL_BIND_DMA(key, dmabufs, num<<8|irq) -hypercall. "dmabufs" is the physical address of an array of "num" -"struct lguest_dma": each contains a used_len, and an array of -physical addresses and lengths. When a transfer occurs, the -"used_len" field of one of the buffers which has used_len 0 will be -set to the length transferred and the irq will fire. +There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest -Using an irq value of 0 unbinds the dma buffers. - -To send DMA, the LHCALL_SEND_DMA(key, dma_physaddr) hypercall is used, -and the bytes used is written to the used_len field. This can be 0 if -noone else has bound a DMA buffer to that key or some other error. -DMA buffers bound by the same guest are ignored. - -Cheers! +Good luck! Rusty Russell rusty@rustcorp.com.au. - -[1] These are on various places on the TODO list, waiting for you to - get annoyed enough at the limitation to fix it. -[2] Lguest is not yet tickless when idle. See [1]. diff --git a/MAINTAINERS b/MAINTAINERS index 1fd6d02..40245af 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2338,6 +2338,8 @@ L: linuxppc-dev@ozlabs.org S: Maintained LINUX FOR POWERPC EMBEDDED PPC8XX +P: Vitaly Bordug +M: vitb@kernel.crashing.org P: Marcelo Tosatti M: marcelo@kvack.org W: http://www.penguinppc.org/ diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index ee07dce..2d00a08 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -7,6 +7,7 @@ #include <linux/pci.h> #include <linux/slab.h> #include <linux/bootmem.h> +#include <linux/scatterlist.h> #include <linux/log2.h> #include <asm/io.h> diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c index 9d371e4..52fc6a8 100644 --- a/arch/arm/common/dmabounce.c +++ b/arch/arm/common/dmabounce.c @@ -29,6 +29,7 @@ #include <linux/dma-mapping.h> #include <linux/dmapool.h> #include <linux/list.h> +#include <linux/scatterlist.h> #include <asm/cacheflush.h> diff --git a/arch/arm/mach-pxa/pxa300.c b/arch/arm/mach-pxa/pxa300.c index 5363b13..7a34231 100644 --- a/arch/arm/mach-pxa/pxa300.c +++ b/arch/arm/mach-pxa/pxa300.c @@ -23,8 +23,10 @@ static struct pxa3xx_mfp_addr_map pxa300_mfp_addr_map[] __initdata = { MFP_ADDR_X(GPIO0, GPIO2, 0x00b4), MFP_ADDR_X(GPIO3, GPIO26, 0x027c), - MFP_ADDR_X(GPIO27, GPIO127, 0x0400), - MFP_ADDR_X(GPIO0_2, GPIO6_2, 0x02ec), + MFP_ADDR_X(GPIO27, GPIO98, 0x0400), + MFP_ADDR_X(GPIO99, GPIO127, 0x0600), + MFP_ADDR_X(GPIO0_2, GPIO1_2, 0x0674), + MFP_ADDR_X(GPIO2_2, GPIO6_2, 0x02dc), MFP_ADDR(nBE0, 0x0204), MFP_ADDR(nBE1, 0x0208), diff --git a/arch/arm/vfp/vfpdouble.c b/arch/arm/vfp/vfpdouble.c index 74e89f8..190a09a 100644 --- a/arch/arm/vfp/vfpdouble.c +++ b/arch/arm/vfp/vfpdouble.c @@ -1132,7 +1132,7 @@ u32 vfp_double_cpdo(u32 inst, u32 fpscr) unsigned int vecitr, veclen, vecstride; struct op *fop; - vecstride = (1 + ((fpscr & FPSCR_STRIDE_MASK) == FPSCR_STRIDE_MASK)) * 2; + vecstride = (1 + ((fpscr & FPSCR_STRIDE_MASK) == FPSCR_STRIDE_MASK)); fop = (op == FOP_EXT) ? &fops_ext[FEXT_TO_IDX(inst)] : &fops[FOP_TO_IDX(op)]; @@ -1184,10 +1184,10 @@ u32 vfp_double_cpdo(u32 inst, u32 fpscr) * CHECK: It appears to be undefined whether we stop when * we encounter an exception. We continue. */ - dest = FREG_BANK(dest) + ((FREG_IDX(dest) + vecstride) & 6); - dn = FREG_BANK(dn) + ((FREG_IDX(dn) + vecstride) & 6); + dest = FREG_BANK(dest) + ((FREG_IDX(dest) + vecstride) & 3); + dn = FREG_BANK(dn) + ((FREG_IDX(dn) + vecstride) & 3); if (FREG_BANK(dm) != 0) - dm = FREG_BANK(dm) + ((FREG_IDX(dm) + vecstride) & 6); + dm = FREG_BANK(dm) + ((FREG_IDX(dm) + vecstride) & 3); } return exceptions; diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index eea3f50..b4e210d 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -229,7 +229,7 @@ void VFP9_bounce(u32 trigger, u32 fpexc, struct pt_regs *regs) /* * Enable access to the VFP so we can handle the bounce. */ - fmxr(FPEXC, fpexc & ~(FPEXC_EX|FPEXC_INV|FPEXC_UFC|FPEXC_IOC)); + fmxr(FPEXC, fpexc & ~(FPEXC_EX|FPEXC_FPV2|FPEXC_INV|FPEXC_UFC|FPEXC_OFC|FPEXC_IOC)); orig_fpscr = fpscr = fmrx(FPSCR); diff --git a/arch/avr32/boards/atstk1000/atstk1002.c b/arch/avr32/boards/atstk1000/atstk1002.c index 6b9e466..5be0d13 100644 --- a/arch/avr32/boards/atstk1000/atstk1002.c +++ b/arch/avr32/boards/atstk1000/atstk1002.c @@ -16,6 +16,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/spi/spi.h> +#include <linux/spi/at73c213.h> #include <video/atmel_lcdc.h> @@ -49,7 +50,26 @@ static struct eth_platform_data __initdata eth_data[2] = { }; #ifndef CONFIG_BOARD_ATSTK1002_SW1_CUSTOM +#ifndef CONFIG_BOARD_ATSTK1002_SW3_CUSTOM +static struct at73c213_board_info at73c213_data = { + .ssc_id = 0, + .shortname = "AVR32 STK1000 external DAC", +}; +#endif +#endif + +#ifndef CONFIG_BOARD_ATSTK1002_SW1_CUSTOM static struct spi_board_info spi0_board_info[] __initdata = { +#ifndef CONFIG_BOARD_ATSTK1002_SW3_CUSTOM + { + /* AT73C213 */ + .modalias = "at73c213", + .max_speed_hz = 200000, + .chip_select = 0, + .mode = SPI_MODE_1, + .platform_data = &at73c213_data, + }, +#endif { /* QVGA display */ .modalias = "ltv350qv", @@ -180,6 +200,38 @@ static void setup_j2_leds(void) } #endif +#ifndef CONFIG_BOARD_ATSTK1002_SW1_CUSTOM +#ifndef CONFIG_BOARD_ATSTK1002_SW3_CUSTOM +static void __init at73c213_set_clk(struct at73c213_board_info *info) +{ + struct clk *gclk; + struct clk *pll; + + gclk = clk_get(NULL, "gclk0"); + if (IS_ERR(gclk)) + goto err_gclk; + pll = clk_get(NULL, "pll0"); + if (IS_ERR(pll)) + goto err_pll; + + if (clk_set_parent(gclk, pll)) { + pr_debug("STK1000: failed to set pll0 as parent for DAC clock\n"); + goto err_set_clk; + } + + at32_select_periph(GPIO_PIN_PA(30), GPIO_PERIPH_A, 0); + info->dac_clk = gclk; + +err_set_clk: + clk_put(pll); +err_pll: + clk_put(gclk); +err_gclk: + return; +} +#endif +#endif + void __init setup_board(void) { #ifdef CONFIG_BOARD_ATSTK1002_SW2_CUSTOM @@ -248,6 +300,12 @@ static int __init atstk1002_init(void) setup_j2_leds(); +#ifndef CONFIG_BOARD_ATSTK1002_SW3_CUSTOM +#ifndef CONFIG_BOARD_ATSTK1002_SW1_CUSTOM + at73c213_set_clk(&at73c213_data); +#endif +#endif + return 0; } postcore_initcall(atstk1002_init); diff --git a/arch/avr32/mach-at32ap/at32ap7000.c b/arch/avr32/mach-at32ap/at32ap7000.c index f6d154c..a9d9ec0 100644 --- a/arch/avr32/mach-at32ap/at32ap7000.c +++ b/arch/avr32/mach-at32ap/at32ap7000.c @@ -556,6 +556,17 @@ static struct clk pico_clk = { .users = 1, }; +static struct resource dmaca0_resource[] = { + { + .start = 0xff200000, + .end = 0xff20ffff, + .flags = IORESOURCE_MEM, + }, + IRQ(2), +}; +DEFINE_DEV(dmaca, 0); +DEV_CLK(hclk, dmaca0, hsb, 10); + /* -------------------------------------------------------------------- * HMATRIX * -------------------------------------------------------------------- */ @@ -655,6 +666,7 @@ void __init at32_add_system_devices(void) platform_device_register(&at32_eic0_device); platform_device_register(&smc0_device); platform_device_register(&pdc_device); + platform_device_register(&dmaca0_device); platform_device_register(&at32_systc0_device); @@ -960,6 +972,96 @@ at32_add_device_spi(unsigned int id, struct spi_board_info *b, unsigned int n) } /* -------------------------------------------------------------------- + * TWI + * -------------------------------------------------------------------- */ +static struct resource atmel_twi0_resource[] __initdata = { + PBMEM(0xffe00800), + IRQ(5), +}; +static struct clk atmel_twi0_pclk = { + .name = "twi_pclk", + .parent = &pba_clk, + .mode = pba_clk_mode, + .get_rate = pba_clk_get_rate, + .index = 2, +}; + +struct platform_device *__init at32_add_device_twi(unsigned int id) +{ + struct platform_device *pdev; + + if (id != 0) + return NULL; + + pdev = platform_device_alloc("atmel_twi", id); + if (!pdev) + return NULL; + + if (platform_device_add_resources(pdev, atmel_twi0_resource, + ARRAY_SIZE(atmel_twi0_resource))) + goto err_add_resources; + + select_peripheral(PA(6), PERIPH_A, 0); /* SDA */ + select_peripheral(PA(7), PERIPH_A, 0); /* SDL */ + + atmel_twi0_pclk.dev = &pdev->dev; + + platform_device_add(pdev); + return pdev; + +err_add_resources: + platform_device_put(pdev); + return NULL; +} + +/* -------------------------------------------------------------------- + * MMC + * -------------------------------------------------------------------- */ +static struct resource atmel_mci0_resource[] __initdata = { + PBMEM(0xfff02400), + IRQ(28), +}; +static struct clk atmel_mci0_pclk = { + .name = "mci_clk", + .parent = &pbb_clk, + .mode = pbb_clk_mode, + .get_rate = pbb_clk_get_rate, + .index = 9, +}; + +struct platform_device *__init at32_add_device_mci(unsigned int id) +{ + struct platform_device *pdev; + + if (id != 0) + return NULL; + + pdev = platform_device_alloc("atmel_mci", id); + if (!pdev) + return NULL; + + if (platform_device_add_resources(pdev, atmel_mci0_resource, + ARRAY_SIZE(atmel_mci0_resource))) + goto err_add_resources; + + select_peripheral(PA(10), PERIPH_A, 0); /* CLK */ + select_peripheral(PA(11), PERIPH_A, 0); /* CMD */ + select_peripheral(PA(12), PERIPH_A, 0); /* DATA0 */ + select_peripheral(PA(13), PERIPH_A, 0); /* DATA1 */ + select_peripheral(PA(14), PERIPH_A, 0); /* DATA2 */ + select_peripheral(PA(15), PERIPH_A, 0); /* DATA3 */ + + atmel_mci0_pclk.dev = &pdev->dev; + + platform_device_add(pdev); + return pdev; + +err_add_resources: + platform_device_put(pdev); + return NULL; +} + +/* -------------------------------------------------------------------- * LCDC * -------------------------------------------------------------------- */ static struct atmel_lcdfb_info atmel_lcdfb0_data; @@ -1228,6 +1330,241 @@ out_free_pdev: } /* -------------------------------------------------------------------- + * IDE / CompactFlash + * -------------------------------------------------------------------- */ +static struct resource at32_smc_cs4_resource[] __initdata = { + { + .start = 0x04000000, + .end = 0x07ffffff, + .flags = IORESOURCE_MEM, + }, + IRQ(~0UL), /* Magic IRQ will be overridden */ +}; +static struct resource at32_smc_cs5_resource[] __initdata = { + { + .start = 0x20000000, + .end = 0x23ffffff, + .flags = IORESOURCE_MEM, + }, + IRQ(~0UL), /* Magic IRQ will be overridden */ +}; + +static int __init at32_init_ide_or_cf(struct platform_device *pdev, + unsigned int cs, unsigned int extint) +{ + static unsigned int extint_pin_map[4] __initdata = { + GPIO_PIN_PB(25), + GPIO_PIN_PB(26), + GPIO_PIN_PB(27), + GPIO_PIN_PB(28), + }; + static bool common_pins_initialized __initdata = false; + unsigned int extint_pin; + int ret; + + if (extint >= ARRAY_SIZE(extint_pin_map)) + return -EINVAL; + extint_pin = extint_pin_map[extint]; + + switch (cs) { + case 4: + ret = platform_device_add_resources(pdev, + at32_smc_cs4_resource, + ARRAY_SIZE(at32_smc_cs4_resource)); + if (ret) + return ret; + + select_peripheral(PE(21), PERIPH_A, 0); /* NCS4 -> OE_N */ + set_ebi_sfr_bits(HMATRIX_BIT(CS4A)); + break; + case 5: + ret = platform_device_add_resources(pdev, + at32_smc_cs5_resource, + ARRAY_SIZE(at32_smc_cs5_resource)); + if (ret) + return ret; + + select_peripheral(PE(22), PERIPH_A, 0); /* NCS5 -> OE_N */ + set_ebi_sfr_bits(HMATRIX_BIT(CS5A)); + break; + default: + return -EINVAL; + } + + if (!common_pins_initialized) { + select_peripheral(PE(19), PERIPH_A, 0); /* CFCE1 -> CS0_N */ + select_peripheral(PE(20), PERIPH_A, 0); /* CFCE2 -> CS1_N */ + select_peripheral(PE(23), PERIPH_A, 0); /* CFRNW -> DIR */ + select_peripheral(PE(24), PERIPH_A, 0); /* NWAIT <- IORDY */ + common_pins_initialized = true; + } + + at32_select_periph(extint_pin, GPIO_PERIPH_A, AT32_GPIOF_DEGLITCH); + + pdev->resource[1].start = EIM_IRQ_BASE + extint; + pdev->resource[1].end = pdev->resource[1].start; + + return 0; +} + +struct platform_device *__init +at32_add_device_ide(unsigned int id, unsigned int extint, + struct ide_platform_data *data) +{ + struct platform_device *pdev; + + pdev = platform_device_alloc("at32_ide", id); + if (!pdev) + goto fail; + + if (platform_device_add_data(pdev, data, + sizeof(struct ide_platform_data))) + goto fail; + + if (at32_init_ide_or_cf(pdev, data->cs, extint)) + goto fail; + + platform_device_add(pdev); + return pdev; + +fail: + platform_device_put(pdev); + return NULL; +} + +struct platform_device *__init +at32_add_device_cf(unsigned int id, unsigned int extint, + struct cf_platform_data *data) +{ + struct platform_device *pdev; + + pdev = platform_device_alloc("at32_cf", id); + if (!pdev) + goto fail; + + if (platform_device_add_data(pdev, data, + sizeof(struct cf_platform_data))) + goto fail; + + if (at32_init_ide_or_cf(pdev, data->cs, extint)) + goto fail; + + if (data->detect_pin != GPIO_PIN_NONE) + at32_select_gpio(data->detect_pin, AT32_GPIOF_DEGLITCH); + if (data->reset_pin != GPIO_PIN_NONE) + at32_select_gpio(data->reset_pin, 0); + if (data->vcc_pin != GPIO_PIN_NONE) + at32_select_gpio(data->vcc_pin, 0); + /* READY is used as extint, so we can't select it as gpio */ + + platform_device_add(pdev); + return pdev; + +fail: + platform_device_put(pdev); + return NULL; +} + +/* -------------------------------------------------------------------- + * AC97C + * -------------------------------------------------------------------- */ +static struct resource atmel_ac97c0_resource[] __initdata = { + PBMEM(0xfff02800), + IRQ(29), +}; +static struct clk atmel_ac97c0_pclk = { + .name = "pclk", + .parent = &pbb_clk, + .mode = pbb_clk_mode, + .get_rate = pbb_clk_get_rate, + .index = 10, +}; + +struct platform_device *__init at32_add_device_ac97c(unsigned int id) +{ + struct platform_device *pdev; + + if (id != 0) + return NULL; + + pdev = platform_device_alloc("atmel_ac97c", id); + if (!pdev) + return NULL; + + if (platform_device_add_resources(pdev, atmel_ac97c0_resource, + ARRAY_SIZE(atmel_ac97c0_resource))) + goto err_add_resources; + + select_peripheral(PB(20), PERIPH_B, 0); /* SYNC */ + select_peripheral(PB(21), PERIPH_B, 0); /* SDO */ + select_peripheral(PB(22), PERIPH_B, 0); /* SDI */ + select_peripheral(PB(23), PERIPH_B, 0); /* SCLK */ + + atmel_ac97c0_pclk.dev = &pdev->dev; + + platform_device_add(pdev); + return pdev; + +err_add_resources: + platform_device_put(pdev); + return NULL; +} + +/* -------------------------------------------------------------------- + * ABDAC + * -------------------------------------------------------------------- */ +static struct resource abdac0_resource[] __initdata = { + PBMEM(0xfff02000), + IRQ(27), +}; +static struct clk abdac0_pclk = { + .name = "pclk", + .parent = &pbb_clk, + .mode = pbb_clk_mode, + .get_rate = pbb_clk_get_rate, + .index = 8, +}; +static struct clk abdac0_sample_clk = { + .name = "sample_clk", + .mode = genclk_mode, + .get_rate = genclk_get_rate, + .set_rate = genclk_set_rate, + .set_parent = genclk_set_parent, + .index = 6, +}; + +struct platform_device *__init at32_add_device_abdac(unsigned int id) +{ + struct platform_device *pdev; + + if (id != 0) + return NULL; + + pdev = platform_device_alloc("abdac", id); + if (!pdev) + return NULL; + + if (platform_device_add_resources(pdev, abdac0_resource, + ARRAY_SIZE(abdac0_resource))) + goto err_add_resources; + + select_peripheral(PB(20), PERIPH_A, 0); /* DATA1 */ + select_peripheral(PB(21), PERIPH_A, 0); /* DATA0 */ + select_peripheral(PB(22), PERIPH_A, 0); /* DATAN1 */ + select_peripheral(PB(23), PERIPH_A, 0); /* DATAN0 */ + + abdac0_pclk.dev = &pdev->dev; + abdac0_sample_clk.dev = &pdev->dev; + + platform_device_add(pdev); + return pdev; + +err_add_resources: + platform_device_put(pdev); + return NULL; +} + +/* -------------------------------------------------------------------- * GCLK * -------------------------------------------------------------------- */ static struct clk gclk0 = { @@ -1290,6 +1627,7 @@ struct clk *at32_clock_list[] = { &smc0_mck, &pdc_hclk, &pdc_pclk, + &dmaca0_hclk, &pico_clk, &pio0_mck, &pio1_mck, @@ -1307,6 +1645,8 @@ struct clk *at32_clock_list[] = { &macb1_pclk, &atmel_spi0_spi_clk, &atmel_spi1_spi_clk, + &atmel_twi0_pclk, + &atmel_mci0_pclk, &atmel_lcdfb0_hck1, &atmel_lcdfb0_pixclk, &ssc0_pclk, @@ -1314,6 +1654,9 @@ struct clk *at32_clock_list[] = { &ssc2_pclk, &usba0_hclk, &usba0_pclk, + &atmel_ac97c0_pclk, + &abdac0_pclk, + &abdac0_sample_clk, &gclk0, &gclk1, &gclk2, @@ -1355,6 +1698,7 @@ void __init at32_clock_init(void) genclk_init_parent(&gclk3); genclk_init_parent(&gclk4); genclk_init_parent(&atmel_lcdfb0_pixclk); + genclk_init_parent(&abdac0_sample_clk); /* * Turn on all clocks that have at least one user already, and diff --git a/arch/avr32/mach-at32ap/extint.c b/arch/avr32/mach-at32ap/extint.c index 8acd010..f5bfd4c 100644 --- a/arch/avr32/mach-at32ap/extint.c +++ b/arch/avr32/mach-at32ap/extint.c @@ -142,7 +142,7 @@ static int eic_set_irq_type(unsigned int irq, unsigned int flow_type) return ret; } -struct irq_chip eic_chip = { +static struct irq_chip eic_chip = { .name = "eic", .ack = eic_ack_irq, .mask = eic_mask_irq, diff --git a/arch/avr32/mach-at32ap/pm.h b/arch/avr32/mach-at32ap/pm.h index 47efd0d..694d521 100644 --- a/arch/avr32/mach-at32ap/pm.h +++ b/arch/avr32/mach-at32ap/pm.h @@ -113,8 +113,8 @@ /* Register access macros */ #define pm_readl(reg) \ - __raw_readl((void __iomem *)AT32_PM_BASE + PM_##reg) + __raw_readl((void __iomem __force *)AT32_PM_BASE + PM_##reg) #define pm_writel(reg,value) \ - __raw_writel((value), (void __iomem *)AT32_PM_BASE + PM_##reg) + __raw_writel((value), (void __iomem __force *)AT32_PM_BASE + PM_##reg) #endif /* __ARCH_AVR32_MACH_AT32AP_PM_H__ */ diff --git a/arch/avr32/mach-at32ap/time-tc.c b/arch/avr32/mach-at32ap/time-tc.c index e3070bd..1026586 100644 --- a/arch/avr32/mach-at32ap/time-tc.c +++ b/arch/avr32/mach-at32ap/time-tc.c @@ -79,7 +79,7 @@ static int avr32_timer_calc_div_and_set_jiffies(struct clk *pclk) { unsigned int cycles_max = (clocksource_avr32.mask + 1) / 2; unsigned int divs[] = { 4, 8, 16, 32 }; - int divs_size = sizeof(divs) / sizeof(*divs); + int divs_size = ARRAY_SIZE(divs); int i = 0; unsigned long count_hz; unsigned long shift; diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f6e44fc..5bed8be 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -227,28 +227,40 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER If in doubt, say "Y". config PARAVIRT - bool "Paravirtualization support (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool depends on !(X86_VISWS || X86_VOYAGER) help - Paravirtualization is a way of running multiple instances of - Linux on the same machine, under a hypervisor. This option - changes the kernel so it can modify itself when it is run - under a hypervisor, improving performance significantly. - However, when run without a hypervisor the kernel is - theoretically slower. If in doubt, say N. + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. However, when run without a hypervisor + the kernel is theoretically slower and slightly larger. + +menuconfig PARAVIRT_GUEST + bool "Paravirtualized guest support" + help + Say Y here to get to see options related to running Linux under + various hypervisors. This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and disabled. + +if PARAVIRT_GUEST source "arch/x86/xen/Kconfig" config VMI - bool "VMI Paravirt-ops support" - depends on PARAVIRT + bool "VMI Guest support" + select PARAVIRT + depends on !(X86_VISWS || X86_VOYAGER) help VMI provides a paravirtualized interface to the VMware ESX server (it could be used by other hypervisors in theory too, but is not at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. +source "arch/x86/lguest/Kconfig" + +endif + config ACPI_SRAT bool default y diff --git a/arch/i386/Makefile b/arch/i386/Makefile index b88e47c..b81cb64 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -99,6 +99,9 @@ core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ # Xen paravirtualization support core-$(CONFIG_XEN) += arch/x86/xen/ +# lguest paravirtualization support +core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ + # default subarch .h files mflags-y += -Iinclude/asm-x86/mach-default diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index f52c627..f4b582c 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -451,6 +451,12 @@ config MOD5272 help Support for the Netburner MOD-5272 board. +config SAVANTrosie1 + bool "Savant Rosie1 board support" + depends on M523x + help + Support for the Savant Rosie1 board. + config ROMFS_FROM_ROM bool "ROMFS image not RAM resident" depends on (NETtel || SNAPGEAR) @@ -492,7 +498,12 @@ config SNEHA bool default y depends on CPU16B - + +config SAVANT + bool + default y + depends on SAVANTrosie1 + config AVNET bool default y diff --git a/arch/m68knommu/Makefile b/arch/m68knommu/Makefile index 92227aa..30aa255 100644 --- a/arch/m68knommu/Makefile +++ b/arch/m68knommu/Makefile @@ -48,6 +48,7 @@ board-$(CONFIG_SNEHA) := SNEHA board-$(CONFIG_M5208EVB) := M5208EVB board-$(CONFIG_MOD5272) := MOD5272 board-$(CONFIG_AVNET) := AVNET +board-$(CONFIG_SAVANT) := SAVANT BOARD := $(board-y) model-$(CONFIG_RAMKERNEL) := ram @@ -117,4 +118,4 @@ core-y += arch/m68knommu/kernel/ \ libs-y += arch/m68knommu/lib/ archclean: - $(Q)$(MAKE) $(clean)=arch/m68knommu/boot + diff --git a/arch/m68knommu/defconfig b/arch/m68knommu/defconfig index 3891de0..5a0ecaa 100644 --- a/arch/m68knommu/defconfig +++ b/arch/m68knommu/defconfig @@ -1,41 +1,48 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.17 -# Tue Jun 27 12:57:06 2006 +# Linux kernel version: 2.6.23 +# Thu Oct 18 13:17:38 2007 # CONFIG_M68K=y # CONFIG_MMU is not set # CONFIG_FPU is not set +CONFIG_ZONE_DMA=y CONFIG_RWSEM_GENERIC_SPINLOCK=y # CONFIG_RWSEM_XCHGADD_ALGORITHM is not set +# CONFIG_ARCH_HAS_ILOG2_U32 is not set +# CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_HWEIGHT=y +CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_TIME_LOW_RES=y +CONFIG_NO_IOPORT=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # -# Code maturity level options +# General setup # CONFIG_EXPERIMENTAL=y CONFIG_BROKEN_ON_SMP=y CONFIG_INIT_ENV_ARG_LIMIT=32 - -# -# General setup -# CONFIG_LOCALVERSION="" CONFIG_LOCALVERSION_AUTO=y # CONFIG_SYSVIPC is not set # CONFIG_POSIX_MQUEUE is not set # CONFIG_BSD_PROCESS_ACCT is not set -# CONFIG_SYSCTL is not set +# CONFIG_TASKSTATS is not set +# CONFIG_USER_NS is not set # CONFIG_AUDIT is not set # CONFIG_IKCONFIG is not set +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_SYSFS_DEPRECATED is not set # CONFIG_RELAY is not set -CONFIG_INITRAMFS_SOURCE="" -CONFIG_UID16=y +# CONFIG_BLK_DEV_INITRD is not set # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y CONFIG_EMBEDDED=y +CONFIG_UID16=y +CONFIG_SYSCTL_SYSCALL=y # CONFIG_KALLSYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y @@ -44,20 +51,25 @@ CONFIG_ELF_CORE=y CONFIG_BASE_FULL=y # CONFIG_FUTEX is not set # CONFIG_EPOLL is not set +# CONFIG_SIGNALFD is not set +# CONFIG_EVENTFD is not set +# CONFIG_VM_EVENT_COUNTERS is not set CONFIG_SLAB=y +# CONFIG_SLUB is not set +# CONFIG_SLOB is not set CONFIG_TINY_SHMEM=y CONFIG_BASE_SMALL=0 -# CONFIG_SLOB is not set - -# -# Loadable module support -# -# CONFIG_MODULES is not set - -# -# Block layer -# +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_MODULE_FORCE_UNLOAD is not set +# CONFIG_MODVERSIONS is not set +# CONFIG_MODULE_SRCVERSION_ALL is not set +# CONFIG_KMOD is not set +CONFIG_BLOCK=y +# CONFIG_LBD is not set # CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_LSF is not set +# CONFIG_BLK_DEV_BSG is not set # # IO Schedulers @@ -99,6 +111,7 @@ CONFIG_CLOCK_DIV=1 # # Platform # +# CONFIG_UC5272 is not set CONFIG_M5272C3=y # CONFIG_COBRA5272 is not set # CONFIG_CANCam is not set @@ -107,7 +120,6 @@ CONFIG_M5272C3=y # CONFIG_CPU16B is not set # CONFIG_MOD5272 is not set CONFIG_FREESCALE=y -# CONFIG_LARGE_ALLOCS is not set CONFIG_4KSTACKS=y # @@ -121,6 +133,11 @@ CONFIG_RAMAUTOBIT=y # CONFIG_RAM8BIT is not set # CONFIG_RAM16BIT is not set # CONFIG_RAM32BIT is not set + +# +# ROM configuration +# +# CONFIG_ROM is not set CONFIG_RAMKERNEL=y # CONFIG_ROMKERNEL is not set CONFIG_SELECT_MEMORY_MODEL=y @@ -131,20 +148,19 @@ CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 +# CONFIG_RESOURCES_64BIT is not set +CONFIG_ZONE_DMA_FLAG=1 +CONFIG_VIRT_TO_BUS=y # # Bus options (PCI, PCMCIA, EISA, MCA, ISA) # # CONFIG_PCI is not set +# CONFIG_ARCH_SUPPORTS_MSI is not set # # PCCARD (PCMCIA/CardBus) support # -# CONFIG_PCCARD is not set - -# -# PCI Hotplug Support -# # # Executable file formats @@ -168,7 +184,6 @@ CONFIG_NET=y # # Networking options # -# CONFIG_NETDEBUG is not set CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -187,27 +202,21 @@ CONFIG_IP_FIB_HASH=y # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set # CONFIG_INET_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set # CONFIG_INET_DIAG is not set # CONFIG_TCP_CONG_ADVANCED is not set -CONFIG_TCP_CONG_BIC=y +CONFIG_TCP_CONG_CUBIC=y +CONFIG_DEFAULT_TCP_CONG="cubic" +# CONFIG_TCP_MD5SIG is not set # CONFIG_IPV6 is not set # CONFIG_INET6_XFRM_TUNNEL is not set # CONFIG_INET6_TUNNEL is not set +# CONFIG_NETWORK_SECMARK is not set # CONFIG_NETFILTER is not set - -# -# DCCP Configuration (EXPERIMENTAL) -# # CONFIG_IP_DCCP is not set - -# -# SCTP Configuration (EXPERIMENTAL) -# # CONFIG_IP_SCTP is not set - -# -# TIPC Configuration (EXPERIMENTAL) -# # CONFIG_TIPC is not set # CONFIG_ATM is not set # CONFIG_BRIDGE is not set @@ -218,7 +227,6 @@ CONFIG_TCP_CONG_BIC=y # CONFIG_ATALK is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set -# CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -234,7 +242,17 @@ CONFIG_TCP_CONG_BIC=y # CONFIG_HAMRADIO is not set # CONFIG_IRDA is not set # CONFIG_BT is not set +# CONFIG_AF_RXRPC is not set + +# +# Wireless +# +# CONFIG_CFG80211 is not set +# CONFIG_WIRELESS_EXT is not set +# CONFIG_MAC80211 is not set # CONFIG_IEEE80211 is not set +# CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set # # Device Drivers @@ -245,16 +263,8 @@ CONFIG_TCP_CONG_BIC=y # CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y -# CONFIG_FW_LOADER is not set - -# -# Connector - unified userspace <-> kernelspace linker -# +# CONFIG_SYS_HYPERVISOR is not set # CONFIG_CONNECTOR is not set - -# -# Memory Technology Devices (MTD) -# CONFIG_MTD=y # CONFIG_MTD_DEBUG is not set # CONFIG_MTD_CONCAT is not set @@ -266,11 +276,13 @@ CONFIG_MTD_PARTITIONS=y # User Modules And Translation Layers # CONFIG_MTD_CHAR=y +CONFIG_MTD_BLKDEVS=y CONFIG_MTD_BLOCK=y # CONFIG_FTL is not set # CONFIG_NFTL is not set # CONFIG_INFTL is not set # CONFIG_RFD_FTL is not set +# CONFIG_SSFDC is not set # # RAM/ROM/Flash chip drivers @@ -290,7 +302,6 @@ CONFIG_MTD_CFI_I2=y CONFIG_MTD_RAM=y # CONFIG_MTD_ROM is not set # CONFIG_MTD_ABSENT is not set -# CONFIG_MTD_OBSOLETE_CHIPS is not set # # Mapping drivers for chip access @@ -313,42 +324,25 @@ CONFIG_MTD_UCLINUX=y # CONFIG_MTD_DOC2000 is not set # CONFIG_MTD_DOC2001 is not set # CONFIG_MTD_DOC2001PLUS is not set - -# -# NAND Flash Device Drivers -# # CONFIG_MTD_NAND is not set - -# -# OneNAND Flash Device Drivers -# # CONFIG_MTD_ONENAND is not set # -# Parallel port support +# UBI - Unsorted block images # +# CONFIG_MTD_UBI is not set # CONFIG_PARPORT is not set - -# -# Plug and Play support -# - -# -# Block devices -# +CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_COW_COMMON is not set # CONFIG_BLK_DEV_LOOP is not set # CONFIG_BLK_DEV_NBD is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 -# CONFIG_BLK_DEV_INITRD is not set +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set - -# -# ATA/ATAPI/MFM/RLL support -# +# CONFIG_MISC_DEVICES is not set # CONFIG_IDE is not set # @@ -356,67 +350,29 @@ CONFIG_BLK_DEV_RAM_SIZE=4096 # # CONFIG_RAID_ATTRS is not set # CONFIG_SCSI is not set - -# -# Multi-device support (RAID and LVM) -# +# CONFIG_SCSI_DMA is not set +# CONFIG_SCSI_NETLINK is not set # CONFIG_MD is not set - -# -# Fusion MPT device support -# -# CONFIG_FUSION is not set - -# -# IEEE 1394 (FireWire) support -# - -# -# I2O device support -# - -# -# Network device support -# CONFIG_NETDEVICES=y +# CONFIG_NETDEVICES_MULTIQUEUE is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set - -# -# PHY device support -# # CONFIG_PHYLIB is not set - -# -# Ethernet (10 or 100Mbit) -# CONFIG_NET_ETHERNET=y # CONFIG_MII is not set CONFIG_FEC=y # CONFIG_FEC2 is not set +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set # -# Ethernet (1000 Mbit) -# - -# -# Ethernet (10000 Mbit) -# - -# -# Token Ring devices -# - -# -# Wireless LAN (non-hamradio) -# -# CONFIG_NET_RADIO is not set - -# -# Wan interfaces +# Wireless LAN # +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set # CONFIG_WAN is not set CONFIG_PPP=y # CONFIG_PPP_MULTILINK is not set @@ -427,20 +383,14 @@ CONFIG_PPP=y # CONFIG_PPP_BSDCOMP is not set # CONFIG_PPP_MPPE is not set # CONFIG_PPPOE is not set +# CONFIG_PPPOL2TP is not set # CONFIG_SLIP is not set +CONFIG_SLHC=y # CONFIG_SHAPER is not set # CONFIG_NETCONSOLE is not set # CONFIG_NETPOLL is not set # CONFIG_NET_POLL_CONTROLLER is not set - -# -# ISDN subsystem -# # CONFIG_ISDN is not set - -# -# Telephony Support -# # CONFIG_PHONE is not set # @@ -472,34 +422,13 @@ CONFIG_SERIAL_COLDFIRE=y # CONFIG_UNIX98_PTYS is not set CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 - -# -# IPMI -# # CONFIG_IPMI_HANDLER is not set - -# -# Watchdog Cards -# # CONFIG_WATCHDOG is not set +# CONFIG_HW_RANDOM is not set # CONFIG_GEN_RTC is not set -# CONFIG_DTLK is not set # CONFIG_R3964 is not set - -# -# Ftape, the floppy tape device driver -# # CONFIG_RAW_DRIVER is not set - -# -# TPM devices -# # CONFIG_TCG_TPM is not set -# CONFIG_TELCLOCK is not set - -# -# I2C support -# # CONFIG_I2C is not set # @@ -507,101 +436,74 @@ CONFIG_LEGACY_PTY_COUNT=256 # # CONFIG_SPI is not set # CONFIG_SPI_MASTER is not set - -# -# Dallas's 1-wire bus -# # CONFIG_W1 is not set - -# -# Hardware Monitoring support -# +# CONFIG_POWER_SUPPLY is not set # CONFIG_HWMON is not set -# CONFIG_HWMON_VID is not set # -# Misc devices +# Multifunction device drivers # +# CONFIG_MFD_SM501 is not set # # Multimedia devices # # CONFIG_VIDEO_DEV is not set -CONFIG_VIDEO_V4L2=y +# CONFIG_DVB_CORE is not set +CONFIG_DAB=y # -# Digital Video Broadcasting Devices +# Graphics support # -# CONFIG_DVB is not set +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set # -# Graphics support +# Display device support # +# CONFIG_DISPLAY_SUPPORT is not set +# CONFIG_VGASTATE is not set +CONFIG_VIDEO_OUTPUT_CONTROL=y # CONFIG_FB is not set # # Sound # # CONFIG_SOUND is not set - -# -# USB support -# -# CONFIG_USB_ARCH_HAS_HCD is not set -# CONFIG_USB_ARCH_HAS_OHCI is not set -# CONFIG_USB_ARCH_HAS_EHCI is not set - -# -# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' -# - -# -# USB Gadget Support -# -# CONFIG_USB_GADGET is not set - -# -# MMC/SD Card support -# +# CONFIG_USB_SUPPORT is not set # CONFIG_MMC is not set - -# -# LED devices -# # CONFIG_NEW_LEDS is not set +# CONFIG_RTC_CLASS is not set # -# LED drivers +# DMA Engine support # +# CONFIG_DMA_ENGINE is not set # -# LED Triggers +# DMA Clients # # -# InfiniBand support +# DMA Devices # # -# EDAC - error detection and reporting (RAS) (EXPERIMENTAL) +# Userspace I/O # - -# -# Real Time Clock -# -# CONFIG_RTC_CLASS is not set +# CONFIG_UIO is not set # # File systems # CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set -# CONFIG_EXT2_FS_XIP is not set # CONFIG_EXT3_FS is not set +# CONFIG_EXT4DEV_FS is not set # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set # CONFIG_FS_POSIX_ACL is not set # CONFIG_XFS_FS is not set +# CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set # CONFIG_MINIX_FS is not set CONFIG_ROMFS_FS=y @@ -629,6 +531,7 @@ CONFIG_ROMFS_FS=y # Pseudo filesystems # CONFIG_PROC_FS=y +CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y # CONFIG_TMPFS is not set # CONFIG_HUGETLB_PAGE is not set @@ -645,7 +548,6 @@ CONFIG_RAMFS=y # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set -# CONFIG_JFFS_FS is not set # CONFIG_JFFS2_FS is not set # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set @@ -664,7 +566,6 @@ CONFIG_RAMFS=y # CONFIG_NCP_FS is not set # CONFIG_CODA_FS is not set # CONFIG_AFS_FS is not set -# CONFIG_9P_FS is not set # # Partition Types @@ -678,15 +579,21 @@ CONFIG_MSDOS_PARTITION=y # CONFIG_NLS is not set # +# Distributed Lock Manager +# +# CONFIG_DLM is not set + +# # Kernel hacking # # CONFIG_PRINTK_TIME is not set +# CONFIG_ENABLE_MUST_CHECK is not set # CONFIG_MAGIC_SYSRQ is not set +# CONFIG_UNUSED_SYMBOLS is not set +# CONFIG_DEBUG_FS is not set +# CONFIG_HEADERS_CHECK is not set # CONFIG_DEBUG_KERNEL is not set -CONFIG_LOG_BUF_SHIFT=14 # CONFIG_DEBUG_BUGVERBOSE is not set -# CONFIG_DEBUG_FS is not set -# CONFIG_UNWIND_INFO is not set # CONFIG_FULLDEBUG is not set # CONFIG_HIGHPROFILE is not set # CONFIG_BOOTPARAM is not set @@ -699,20 +606,16 @@ CONFIG_LOG_BUF_SHIFT=14 # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set - -# -# Cryptographic options -# # CONFIG_CRYPTO is not set # -# Hardware crypto devices -# - -# # Library routines # # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set +# CONFIG_CRC_ITU_T is not set # CONFIG_CRC32 is not set +# CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set +CONFIG_HAS_IOMEM=y +CONFIG_HAS_DMA=y diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c index 3f86ade..74bf949 100644 --- a/arch/m68knommu/kernel/setup.c +++ b/arch/m68knommu/kernel/setup.c @@ -151,27 +151,15 @@ void setup_arch(char **cmdline_p) #ifdef CONFIG_ELITE printk(KERN_INFO "Modified for M5206eLITE by Rob Scott, rscott@mtrob.fdns.net\n"); #endif -#ifdef CONFIG_TELOS - printk(KERN_INFO "Modified for Omnia ToolVox by James D. Schettine, james@telos-systems.com\n"); -#endif #endif printk(KERN_INFO "Flat model support (C) 1998,1999 Kenneth Albanowski, D. Jeff Dionne\n"); #if defined( CONFIG_PILOT ) && defined( CONFIG_M68328 ) printk(KERN_INFO "TRG SuperPilot FLASH card support <info@trgnet.com>\n"); #endif - #if defined( CONFIG_PILOT ) && defined( CONFIG_M68EZ328 ) printk(KERN_INFO "PalmV support by Lineo Inc. <jeff@uclinux.com>\n"); #endif - -#ifdef CONFIG_M68EZ328ADS - printk(KERN_INFO "M68EZ328ADS board support (C) 1999 Vladimir Gurevich <vgurevic@cisco.com>\n"); -#endif - -#ifdef CONFIG_ALMA_ANS - printk(KERN_INFO "Alma Electronics board support (C) 1999 Vladimir Gurevich <vgurevic@cisco.com>\n"); -#endif #if defined (CONFIG_M68360) printk(KERN_INFO "QUICC port done by SED Systems <hamilton@sedsystems.ca>,\n"); printk(KERN_INFO "based on 2.0.38 port by Lineo Inc. <mleslie@lineo.com>.\n"); @@ -188,11 +176,9 @@ void setup_arch(char **cmdline_p) "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext, (int) &_sdata, (int) &_edata, (int) &_sbss, (int) &_ebss); - printk(KERN_DEBUG "KERNEL -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x " - "STACK=0x%06x-0x%06x\n", + printk(KERN_DEBUG "MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ", (int) &_ebss, (int) memory_start, - (int) memory_start, (int) memory_end, - (int) memory_end, (int) _ramend); + (int) memory_start, (int) memory_end); #endif /* Keep a copy of command line */ @@ -287,12 +273,3 @@ struct seq_operations cpuinfo_op = { .show = show_cpuinfo, }; -void arch_gettod(int *year, int *mon, int *day, int *hour, - int *min, int *sec) -{ - if (mach_gettod) - mach_gettod(year, mon, day, hour, min, sec); - else - *year = *mon = *day = *hour = *min = *sec = 0; -} - diff --git a/arch/m68knommu/kernel/signal.c b/arch/m68knommu/kernel/signal.c index 437f8c6..7037137 100644 --- a/arch/m68knommu/kernel/signal.c +++ b/arch/m68knommu/kernel/signal.c @@ -781,15 +781,7 @@ asmlinkage int do_signal(sigset_t *oldset, struct pt_regs *regs) /* Did we come from a system call? */ if (regs->orig_d0 >= 0) { /* Restart the system call - no handlers present */ - if (regs->d0 == -ERESTARTNOHAND - || regs->d0 == -ERESTARTSYS - || regs->d0 == -ERESTARTNOINTR) { - regs->d0 = regs->orig_d0; - regs->pc -= 2; - } else if (regs->d0 == -ERESTART_RESTARTBLOCK) { - regs->d0 = __NR_restart_syscall; - regs->pc -= 2; - } + handle_restart(regs, NULL, 0); } return 0; } diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c index 467053d..77e5375 100644 --- a/arch/m68knommu/kernel/time.c +++ b/arch/m68knommu/kernel/time.c @@ -27,7 +27,6 @@ #define TICK_SIZE (tick_nsec / 1000) - static inline int set_rtc_mmss(unsigned long nowtime) { if (mach_set_clock_mmss) @@ -39,15 +38,11 @@ static inline int set_rtc_mmss(unsigned long nowtime) * timer_interrupt() needs to keep up the real-time clock, * as well as call the "do_timer()" routine every clocktick */ -static irqreturn_t timer_interrupt(int irq, void *dummy) +irqreturn_t arch_timer_interrupt(int irq, void *dummy) { /* last time the cmos clock got updated */ static long last_rtc_update=0; - /* may need to kick the hardware timer */ - if (mach_tick) - mach_tick(); - write_seqlock(&xtime_lock); do_timer(1); @@ -103,10 +98,10 @@ void time_init(void) { unsigned int year, mon, day, hour, min, sec; - extern void arch_gettod(int *year, int *mon, int *day, int *hour, - int *min, int *sec); - - arch_gettod(&year, &mon, &day, &hour, &min, &sec); + if (mach_gettod) + mach_gettod(&year, &mon, &day, &hour, &min, &sec); + else + year = mon = day = hour = min = sec = 0; if ((year += 1900) < 1970) year += 100; @@ -114,7 +109,7 @@ void time_init(void) xtime.tv_nsec = 0; wall_to_monotonic.tv_sec = -xtime.tv_sec; - mach_sched_init(timer_interrupt); + hw_timer_init(); } /* @@ -128,7 +123,7 @@ void do_gettimeofday(struct timeval *tv) do { seq = read_seqbegin_irqsave(&xtime_lock, flags); - usec = mach_gettimeoffset ? mach_gettimeoffset() : 0; + usec = hw_timer_offset(); sec = xtime.tv_sec; usec += (xtime.tv_nsec / 1000); } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); @@ -160,8 +155,7 @@ int do_settimeofday(struct timespec *tv) * Discover what correction gettimeofday * would have done, and then undo it! */ - if (mach_gettimeoffset) - nsec -= (mach_gettimeoffset() * 1000); + nsec -= (hw_timer_offset() * 1000); wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); diff --git a/arch/m68knommu/platform/5206/config.c b/arch/m68knommu/platform/5206/config.c index d0f2dc5..b3c4dd4 100644 --- a/arch/m68knommu/platform/5206/config.c +++ b/arch/m68knommu/platform/5206/config.c @@ -10,13 +10,10 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> #include <asm/mcftimer.h> @@ -25,9 +22,6 @@ /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -97,9 +91,6 @@ int mcf_timerirqpending(int timer) void config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/5206e/config.c b/arch/m68knommu/platform/5206e/config.c index 425703f..f84a4ae 100644 --- a/arch/m68knommu/platform/5206e/config.c +++ b/arch/m68knommu/platform/5206e/config.c @@ -9,23 +9,16 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> -#include <asm/mcftimer.h> #include <asm/mcfsim.h> #include <asm/mcfdma.h> /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -102,9 +95,6 @@ void config_BSP(char *commandp, int size) commandp[size-1] = 0; #endif /* CONFIG_NETtel */ - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/520x/config.c b/arch/m68knommu/platform/520x/config.c index a2c95be..6edbd41 100644 --- a/arch/m68knommu/platform/520x/config.c +++ b/arch/m68knommu/platform/520x/config.c @@ -27,9 +27,6 @@ unsigned int dma_device_address[MAX_M68K_DMA_CHANNELS]; /***************************************************************************/ -void coldfire_pit_tick(void); -void coldfire_pit_init(irq_handler_t handler); -unsigned long coldfire_pit_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -47,10 +44,7 @@ void mcf_autovector(unsigned int vec) void config_BSP(char *commandp, int size) { - mach_sched_init = coldfire_pit_init; - mach_tick = coldfire_pit_tick; - mach_gettimeoffset = coldfire_pit_offset; - mach_reset = coldfire_reset; + mach_reset = coldfire_reset; } /***************************************************************************/ diff --git a/arch/m68knommu/platform/523x/config.c b/arch/m68knommu/platform/523x/config.c index 0a3af05..e7f80c8 100644 --- a/arch/m68knommu/platform/523x/config.c +++ b/arch/m68knommu/platform/523x/config.c @@ -13,12 +13,10 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> #include <asm/mcfsim.h> @@ -26,9 +24,6 @@ /***************************************************************************/ -void coldfire_pit_tick(void); -void coldfire_pit_init(irq_handler_t handler); -unsigned long coldfire_pit_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -62,9 +57,6 @@ void mcf_autovector(unsigned int vec) void config_BSP(char *commandp, int size) { mcf_disableall(); - mach_sched_init = coldfire_pit_init; - mach_tick = coldfire_pit_tick; - mach_gettimeoffset = coldfire_pit_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/5249/config.c b/arch/m68knommu/platform/5249/config.c index dc2c362..d4d3943 100644 --- a/arch/m68knommu/platform/5249/config.c +++ b/arch/m68knommu/platform/5249/config.c @@ -9,24 +9,17 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> -#include <asm/mcftimer.h> #include <asm/mcfsim.h> #include <asm/mcfdma.h> /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -95,9 +88,6 @@ int mcf_timerirqpending(int timer) void config_BSP(char *commandp, int size) { mcf_setimr(MCFSIM_IMR_MASKALL); - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/5272/config.c b/arch/m68knommu/platform/5272/config.c index 1365a83..634a637 100644 --- a/arch/m68knommu/platform/5272/config.c +++ b/arch/m68knommu/platform/5272/config.c @@ -10,24 +10,17 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> -#include <asm/mcftimer.h> #include <asm/mcfsim.h> #include <asm/mcfdma.h> /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); extern unsigned int mcf_timervector; @@ -128,9 +121,6 @@ void config_BSP(char *commandp, int size) mcf_timervector = 69; mcf_profilevector = 70; - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/527x/config.c b/arch/m68knommu/platform/527x/config.c index 1b82044..9cbfbc6 100644 --- a/arch/m68knommu/platform/527x/config.c +++ b/arch/m68knommu/platform/527x/config.c @@ -13,12 +13,10 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> #include <asm/mcfsim.h> @@ -26,9 +24,6 @@ /***************************************************************************/ -void coldfire_pit_tick(void); -void coldfire_pit_init(irq_handler_t handler); -unsigned long coldfire_pit_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -62,9 +57,6 @@ void mcf_autovector(unsigned int vec) void config_BSP(char *commandp, int size) { mcf_disableall(); - mach_sched_init = coldfire_pit_init; - mach_tick = coldfire_pit_tick; - mach_gettimeoffset = coldfire_pit_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/528x/config.c b/arch/m68knommu/platform/528x/config.c index a089e95..acbd434 100644 --- a/arch/m68knommu/platform/528x/config.c +++ b/arch/m68knommu/platform/528x/config.c @@ -13,12 +13,10 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> #include <asm/mcfsim.h> @@ -26,9 +24,6 @@ /***************************************************************************/ -void coldfire_pit_tick(void); -void coldfire_pit_init(irq_handler_t handler); -unsigned long coldfire_pit_offset(void); void coldfire_reset(void); /***************************************************************************/ @@ -62,9 +57,6 @@ void mcf_autovector(unsigned int vec) void config_BSP(char *commandp, int size) { mcf_disableall(); - mach_sched_init = coldfire_pit_init; - mach_tick = coldfire_pit_tick; - mach_gettimeoffset = coldfire_pit_offset; mach_reset = coldfire_reset; } diff --git a/arch/m68knommu/platform/5307/config.c b/arch/m68knommu/platform/5307/config.c index e346161..6040821 100644 --- a/arch/m68knommu/platform/5307/config.c +++ b/arch/m68knommu/platform/5307/config.c @@ -10,25 +10,18 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> -#include <asm/mcftimer.h> #include <asm/mcfsim.h> #include <asm/mcfdma.h> #include <asm/mcfwdebug.h> /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); extern unsigned int mcf_timervector; @@ -122,9 +115,6 @@ void config_BSP(char *commandp, int size) mcf_timerlevel = 6; #endif - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; #ifdef MCF_BDM_DISABLE diff --git a/arch/m68knommu/platform/5307/entry.S b/arch/m68knommu/platform/5307/entry.S index a8cd867..b333731 100644 --- a/arch/m68knommu/platform/5307/entry.S +++ b/arch/m68knommu/platform/5307/entry.S @@ -74,7 +74,8 @@ ENTRY(system_call) movel %sp,%d2 /* get thread_info pointer */ andl #-THREAD_SIZE,%d2 /* at start of kernel stack */ movel %d2,%a0 - movel %sp,%a0@(THREAD_ESP0) /* save top of frame */ + movel %a0@,%a1 /* save top of frame */ + movel %sp,%a1@(TASK_THREAD+THREAD_ESP0) btst #(TIF_SYSCALL_TRACE%8),%a0@(TI_FLAGS+(31-TIF_SYSCALL_TRACE)/8) bnes 1f @@ -83,6 +84,8 @@ ENTRY(system_call) movel %d0,%sp@(PT_D0) /* save the return value */ jra ret_from_exception 1: + movel #-ENOSYS,%d2 /* strace needs -ENOSYS in PT_D0 */ + movel %d2,PT_D0(%sp) /* on syscall entry */ subql #4,%sp SAVE_SWITCH_STACK jbsr syscall_trace diff --git a/arch/m68knommu/platform/5307/pit.c b/arch/m68knommu/platform/5307/pit.c index f18352f..173b754 100644 --- a/arch/m68knommu/platform/5307/pit.c +++ b/arch/m68knommu/platform/5307/pit.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/irq.h> +#include <asm/machdep.h> #include <asm/io.h> #include <asm/coldfire.h> #include <asm/mcfpit.h> @@ -31,28 +32,30 @@ /***************************************************************************/ -void coldfire_pit_tick(void) +static irqreturn_t hw_tick(int irq, void *dummy) { unsigned short pcsr; /* Reset the ColdFire timer */ pcsr = __raw_readw(TA(MCFPIT_PCSR)); __raw_writew(pcsr | MCFPIT_PCSR_PIF, TA(MCFPIT_PCSR)); + + return arch_timer_interrupt(irq, dummy); } /***************************************************************************/ static struct irqaction coldfire_pit_irq = { - .name = "timer", - .flags = IRQF_DISABLED | IRQF_TIMER, + .name = "timer", + .flags = IRQF_DISABLED | IRQF_TIMER, + .handler = hw_tick, }; -void coldfire_pit_init(irq_handler_t handler) +void hw_timer_init(void) { volatile unsigned char *icrp; volatile unsigned long *imrp; - coldfire_pit_irq.handler = handler; setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &coldfire_pit_irq); icrp = (volatile unsigned char *) (MCF_IPSBAR + MCFICM_INTC0 + @@ -71,7 +74,7 @@ void coldfire_pit_init(irq_handler_t handler) /***************************************************************************/ -unsigned long coldfire_pit_offset(void) +unsigned long hw_timer_offset(void) { volatile unsigned long *ipr; unsigned long pmr, pcntr, offset; diff --git a/arch/m68knommu/platform/5307/timers.c b/arch/m68knommu/platform/5307/timers.c index 64bd0ff..489dec8 100644 --- a/arch/m68knommu/platform/5307/timers.c +++ b/arch/m68knommu/platform/5307/timers.c @@ -9,10 +9,9 @@ /***************************************************************************/ #include <linux/kernel.h> +#include <linux/init.h> #include <linux/sched.h> -#include <linux/param.h> #include <linux/interrupt.h> -#include <linux/init.h> #include <linux/irq.h> #include <asm/io.h> #include <asm/traps.h> @@ -54,24 +53,28 @@ extern int mcf_timerirqpending(int timer); /***************************************************************************/ -void coldfire_tick(void) +static irqreturn_t hw_tick(int irq, void *dummy) { /* Reset the ColdFire timer */ __raw_writeb(MCFTIMER_TER_CAP | MCFTIMER_TER_REF, TA(MCFTIMER_TER)); + + return arch_timer_interrupt(irq, dummy); } /***************************************************************************/ static struct irqaction coldfire_timer_irq = { - .name = "timer", - .flags = IRQF_DISABLED | IRQF_TIMER, + .name = "timer", + .flags = IRQF_DISABLED | IRQF_TIMER, + .handler = hw_tick, }; +/***************************************************************************/ + static int ticks_per_intr; -void coldfire_timer_init(irq_handler_t handler) +void hw_timer_init(void) { - coldfire_timer_irq.handler = handler; setup_irq(mcf_timervector, &coldfire_timer_irq); __raw_writew(MCFTIMER_TMR_DISABLE, TA(MCFTIMER_TMR)); @@ -89,7 +92,7 @@ void coldfire_timer_init(irq_handler_t handler) /***************************************************************************/ -unsigned long coldfire_timer_offset(void) +unsigned long hw_timer_offset(void) { unsigned long tcn, offset; diff --git a/arch/m68knommu/platform/532x/config.c b/arch/m68knommu/platform/532x/config.c index b32c642..f77328b 100644 --- a/arch/m68knommu/platform/532x/config.c +++ b/arch/m68knommu/platform/532x/config.c @@ -18,25 +18,18 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> -#include <asm/mcftimer.h> #include <asm/mcfsim.h> #include <asm/mcfdma.h> #include <asm/mcfwdebug.h> /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); extern unsigned int mcf_timervector; @@ -104,9 +97,6 @@ void config_BSP(char *commandp, int size) mcf_timervector = 64+32; mcf_profilevector = 64+33; - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; #ifdef MCF_BDM_DISABLE diff --git a/arch/m68knommu/platform/5407/config.c b/arch/m68knommu/platform/5407/config.c index e692536..2d3b62e 100644 --- a/arch/m68knommu/platform/5407/config.c +++ b/arch/m68knommu/platform/5407/config.c @@ -10,24 +10,17 @@ /***************************************************************************/ #include <linux/kernel.h> -#include <linux/sched.h> #include <linux/param.h> #include <linux/init.h> #include <linux/interrupt.h> -#include <asm/irq.h> #include <asm/dma.h> -#include <asm/traps.h> #include <asm/machdep.h> #include <asm/coldfire.h> -#include <asm/mcftimer.h> #include <asm/mcfsim.h> #include <asm/mcfdma.h> /***************************************************************************/ -void coldfire_tick(void); -void coldfire_timer_init(irq_handler_t handler); -unsigned long coldfire_timer_offset(void); void coldfire_reset(void); extern unsigned int mcf_timervector; @@ -108,9 +101,6 @@ void config_BSP(char *commandp, int size) mcf_timerlevel = 6; #endif - mach_sched_init = coldfire_timer_init; - mach_tick = coldfire_tick; - mach_gettimeoffset = coldfire_timer_offset; mach_reset = coldfire_reset; } diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c index b0b034c..b1b4052 100644 --- a/arch/mips/mm/dma-default.c +++ b/arch/mips/mm/dma-default.c @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/string.h> +#include <linux/scatterlist.h> #include <asm/cache.h> #include <asm/io.h> diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index 41f8e32..9448d4e 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -25,6 +25,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/scatterlist.h> #include <asm/cacheflush.h> #include <asm/dma.h> /* for DMA_CHUNK_SIZE */ diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index c939fe8..6a79fe4 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -216,7 +216,6 @@ config PPC_EARLY_DEBUG_BEAT config PPC_EARLY_DEBUG_44x bool "Early serial debugging for IBM/AMCC 44x CPUs" depends on 44x - select PPC_UDBG_16550 help Select this to enable early debugging for IBM 44x chips via the inbuilt serial port. diff --git a/arch/powerpc/boot/dts/bamboo.dts b/arch/powerpc/boot/dts/bamboo.dts index a88ae3d..cb2fb50 100644 --- a/arch/powerpc/boot/dts/bamboo.dts +++ b/arch/powerpc/boot/dts/bamboo.dts @@ -98,11 +98,13 @@ interrupt-parent = <&MAL0>; interrupts = <0 1 2 3 4>; #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; interrupt-map = </*TXEOB*/ 0 &UIC0 a 4 /*RXEOB*/ 1 &UIC0 b 4 /*SERR*/ 2 &UIC1 0 4 /*TXDE*/ 3 &UIC1 1 4 - /*RXDE*/ 4 &UIC1 3 4>; + /*RXDE*/ 4 &UIC1 2 4>; }; POB0: opb { @@ -196,6 +198,7 @@ }; EMAC0: ethernet@ef600e00 { + linux,network-index = <0>; device_type = "network"; compatible = "ibm,emac-440ep", "ibm,emac-440gp", "ibm,emac"; interrupt-parent = <&UIC1>; @@ -210,12 +213,13 @@ rx-fifo-size = <1000>; tx-fifo-size = <800>; phy-mode = "rmii"; - phy-map = <00000001>; + phy-map = <00000000>; zmii-device = <&ZMII0>; zmii-channel = <0>; }; EMAC1: ethernet@ef600f00 { + linux,network-index = <1>; device_type = "network"; compatible = "ibm,emac-440ep", "ibm,emac-440gp", "ibm,emac"; interrupt-parent = <&UIC1>; @@ -230,7 +234,7 @@ rx-fifo-size = <1000>; tx-fifo-size = <800>; phy-mode = "rmii"; - phy-map = <00000001>; + phy-map = <00000000>; zmii-device = <&ZMII0>; zmii-channel = <1>; }; diff --git a/arch/powerpc/boot/dts/sequoia.dts b/arch/powerpc/boot/dts/sequoia.dts index 36be75b..8833dfe 100644 --- a/arch/powerpc/boot/dts/sequoia.dts +++ b/arch/powerpc/boot/dts/sequoia.dts @@ -241,6 +241,12 @@ reg = <ef600d00 c>; }; + RGMII0: emac-rgmii@ef601000 { + device_type = "rgmii-interface"; + compatible = "ibm,rgmii-440epx", "ibm,rgmii"; + reg = <ef601000 8>; + }; + EMAC0: ethernet@ef600e00 { linux,network-index = <0>; device_type = "network"; @@ -261,10 +267,12 @@ max-frame-size = <5dc>; rx-fifo-size = <1000>; tx-fifo-size = <800>; - phy-mode = "rmii"; + phy-mode = "rgmii"; phy-map = <00000000>; zmii-device = <&ZMII0>; zmii-channel = <0>; + rgmii-device = <&RGMII0>; + rgmii-channel = <0>; }; EMAC1: ethernet@ef600f00 { @@ -287,10 +295,12 @@ max-frame-size = <5dc>; rx-fifo-size = <1000>; tx-fifo-size = <800>; - phy-mode = "rmii"; + phy-mode = "rgmii"; phy-map = <00000000>; zmii-device = <&ZMII0>; zmii-channel = <1>; + rgmii-device = <&RGMII0>; + rgmii-channel = <1>; }; }; }; diff --git a/arch/powerpc/boot/dts/walnut.dts b/arch/powerpc/boot/dts/walnut.dts index ec54f4e..fa681f5 100644 --- a/arch/powerpc/boot/dts/walnut.dts +++ b/arch/powerpc/boot/dts/walnut.dts @@ -64,10 +64,15 @@ MAL: mcmal { compatible = "ibm,mcmal-405gp", "ibm,mcmal"; dcr-reg = <180 62>; - num-tx-chans = <2>; + num-tx-chans = <1>; num-rx-chans = <1>; interrupt-parent = <&UIC0>; - interrupts = <a 4 b 4 c 4 d 4 e 4>; + interrupts = < + b 4 /* TXEOB */ + c 4 /* RXEOB */ + a 4 /* SERR */ + d 4 /* TXDE */ + e 4 /* RXDE */>; }; POB0: opb { @@ -118,9 +123,10 @@ compatible = "ibm,emac-405gp", "ibm,emac"; interrupt-parent = <&UIC0>; interrupts = <9 4 f 4>; + local-mac-address = [000000000000]; /* Filled in by zImage */ reg = <ef600800 70>; mal-device = <&MAL>; - mal-tx-channel = <0 1>; + mal-tx-channel = <0>; mal-rx-channel = <0>; cell-index = <0>; max-frame-size = <5dc>; diff --git a/arch/powerpc/boot/treeboot-walnut.c b/arch/powerpc/boot/treeboot-walnut.c index 3adf2d0..bb2c309 100644 --- a/arch/powerpc/boot/treeboot-walnut.c +++ b/arch/powerpc/boot/treeboot-walnut.c @@ -57,8 +57,8 @@ void ibm405gp_fixup_clocks(unsigned int sysclk, unsigned int ser_clk) } /* setup the timebase clock to tick at the cpu frequency */ - cpc0_cr1 = cpc0_cr1 & ~ 0x00800000; - mtdcr(DCRN_CPC0_CR1, cpc0_cr1); + cpc0_cr1 = cpc0_cr1 & ~0x00800000; + mtdcr(DCRN_405_CPC0_CR1, cpc0_cr1); tb = cpu; dt_fixup_cpu_clocks(cpu, tb, 0); @@ -109,6 +109,7 @@ static void walnut_flashsel_fixup(void) setprop(sram, "reg", reg_sram, sizeof(reg_sram)); } +#define WALNUT_OPENBIOS_MAC_OFF 0xfffffe0b static void walnut_fixups(void) { ibm4xx_fixup_memsize(); @@ -116,6 +117,7 @@ static void walnut_fixups(void) ibm4xx_quiesce_eth((u32 *)0xef600800, NULL); ibm4xx_fixup_ebc_ranges("/plb/ebc"); walnut_flashsel_fixup(); + dt_fixup_mac_addresses((u8 *) WALNUT_OPENBIOS_MAC_OFF); } void platform_init(void) diff --git a/arch/powerpc/configs/bamboo_defconfig b/arch/powerpc/configs/bamboo_defconfig index d22fed6..844808e 100644 --- a/arch/powerpc/configs/bamboo_defconfig +++ b/arch/powerpc/configs/bamboo_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.23-rc1 -# Fri Aug 3 10:46:53 2007 +# Linux kernel version: 2.6.23 +# Fri Oct 19 09:01:11 2007 # # CONFIG_PPC64 is not set @@ -22,8 +22,13 @@ CONFIG_PHYS_64BIT=y # CONFIG_PPC_MM_SLICES is not set CONFIG_NOT_COHERENT_CACHE=y CONFIG_PPC32=y +CONFIG_WORD_SIZE=32 CONFIG_PPC_MERGE=y CONFIG_MMU=y +CONFIG_GENERIC_CMOS_UPDATE=y +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_HARDIRQS=y CONFIG_IRQ_PER_CPU=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y @@ -67,6 +72,8 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_AUDIT is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=14 +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_FAIR_USER_SCHED=y CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set CONFIG_BLK_DEV_INITRD=y @@ -87,7 +94,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -133,6 +139,7 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # CONFIG_PQ2ADS is not set CONFIG_BAMBOO=y # CONFIG_EBONY is not set +# CONFIG_SEQUOIA is not set CONFIG_440EP=y CONFIG_IBM440EP_ERR42=y # CONFIG_MPIC is not set @@ -146,11 +153,16 @@ CONFIG_IBM440EP_ERR42=y # CONFIG_GENERIC_IOMAP is not set # CONFIG_CPU_FREQ is not set # CONFIG_CPM2 is not set +# CONFIG_FSL_ULI1575 is not set # # Kernel options # # CONFIG_HIGHMEM is not set +# CONFIG_TICK_ONESHOT is not set +# CONFIG_NO_HZ is not set +# CONFIG_HIGH_RES_TIMERS is not set +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_300 is not set @@ -172,6 +184,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 @@ -197,10 +210,6 @@ CONFIG_PCI_SYSCALL=y CONFIG_ARCH_SUPPORTS_MSI=y # CONFIG_PCI_MSI is not set # CONFIG_PCI_DEBUG is not set - -# -# PCCARD (PCMCIA/CardBus) support -# # CONFIG_PCCARD is not set # CONFIG_HOTPLUG_PCI is not set @@ -215,7 +224,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_HIGHMEM_START=0xfe000000 CONFIG_LOWMEM_SIZE=0x30000000 CONFIG_KERNEL_START=0xc0000000 -CONFIG_TASK_SIZE=0x80000000 +CONFIG_TASK_SIZE=0xc0000000 CONFIG_CONSISTENT_START=0xff100000 CONFIG_CONSISTENT_SIZE=0x00200000 CONFIG_BOOT_LOAD=0x01000000 @@ -252,6 +261,7 @@ CONFIG_IP_PNP_BOOTP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -309,6 +319,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # # Generic Driver Options # +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y @@ -353,10 +364,6 @@ CONFIG_MISC_DEVICES=y # CONFIG_SCSI_NETLINK is not set # CONFIG_ATA is not set # CONFIG_MD is not set - -# -# Fusion MPT device support -# # CONFIG_FUSION is not set # @@ -375,12 +382,36 @@ CONFIG_NETDEVICES=y # CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set +# CONFIG_VETH is not set +# CONFIG_IP1000 is not set # CONFIG_ARCNET is not set -# CONFIG_NET_ETHERNET is not set +# CONFIG_PHYLIB is not set +CONFIG_NET_ETHERNET=y +# CONFIG_MII is not set +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_CASSINI is not set +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_TULIP is not set +# CONFIG_HP100 is not set +CONFIG_IBM_NEW_EMAC=y +CONFIG_IBM_NEW_EMAC_RXB=128 +CONFIG_IBM_NEW_EMAC_TXB=64 +CONFIG_IBM_NEW_EMAC_POLL_WEIGHT=32 +CONFIG_IBM_NEW_EMAC_RX_COPY_THRESHOLD=256 +CONFIG_IBM_NEW_EMAC_RX_SKB_HEADROOM=0 +# CONFIG_IBM_NEW_EMAC_DEBUG is not set +CONFIG_IBM_NEW_EMAC_ZMII=y +# CONFIG_IBM_NEW_EMAC_RGMII is not set +# CONFIG_IBM_NEW_EMAC_TAH is not set +# CONFIG_IBM_NEW_EMAC_EMAC4 is not set +# CONFIG_NET_PCI is not set +# CONFIG_B44 is not set CONFIG_NETDEV_1000=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set # CONFIG_E1000 is not set +# CONFIG_E1000E is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -388,6 +419,7 @@ CONFIG_NETDEV_1000=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set # CONFIG_SKY2 is not set +# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set # CONFIG_TIGON3 is not set # CONFIG_BNX2 is not set @@ -396,11 +428,14 @@ CONFIG_NETDEV_1000=y CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set +# CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set +# CONFIG_NIU is not set # CONFIG_MLX4_CORE is not set +# CONFIG_TEHUTI is not set # CONFIG_TR is not set # @@ -463,14 +498,11 @@ CONFIG_UNIX98_PTYS=y CONFIG_LEGACY_PTYS=y CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_IPMI_HANDLER is not set -# CONFIG_WATCHDOG is not set # CONFIG_HW_RANDOM is not set # CONFIG_NVRAM is not set # CONFIG_GEN_RTC is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set -# CONFIG_AGP is not set -# CONFIG_DRM is not set # CONFIG_RAW_DRIVER is not set # CONFIG_TCG_TPM is not set CONFIG_DEVPORT=y @@ -484,6 +516,13 @@ CONFIG_DEVPORT=y # CONFIG_W1 is not set # CONFIG_POWER_SUPPLY is not set # CONFIG_HWMON is not set +# CONFIG_WATCHDOG is not set + +# +# Sonics Silicon Backplane +# +CONFIG_SSB_POSSIBLE=y +# CONFIG_SSB is not set # # Multifunction device drivers @@ -500,16 +539,17 @@ CONFIG_DAB=y # # Graphics support # +# CONFIG_AGP is not set +# CONFIG_DRM is not set +# CONFIG_VGASTATE is not set +CONFIG_VIDEO_OUTPUT_CONTROL=m +# CONFIG_FB is not set # CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Display device support # # CONFIG_DISPLAY_SUPPORT is not set -# CONFIG_VGASTATE is not set -CONFIG_VIDEO_OUTPUT_CONTROL=m -# CONFIG_FB is not set -# CONFIG_FB_IBM_GXT4500 is not set # # Sound @@ -536,19 +576,6 @@ CONFIG_USB_ARCH_HAS_EHCI=y # CONFIG_RTC_CLASS is not set # -# DMA Engine support -# -# CONFIG_DMA_ENGINE is not set - -# -# DMA Clients -# - -# -# DMA Devices -# - -# # Userspace I/O # # CONFIG_UIO is not set @@ -600,7 +627,6 @@ CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_TMPFS_POSIX_ACL is not set # CONFIG_HUGETLB_PAGE is not set -CONFIG_RAMFS=y # CONFIG_CONFIGFS_FS is not set # @@ -619,10 +645,7 @@ CONFIG_CRAMFS=y # CONFIG_QNX4FS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set - -# -# Network File Systems -# +CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y # CONFIG_NFS_V3_ACL is not set @@ -648,15 +671,7 @@ CONFIG_SUNRPC=y # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# # CONFIG_NLS is not set - -# -# Distributed Lock Manager -# # CONFIG_DLM is not set # CONFIG_UCC_SLOW is not set @@ -709,6 +724,7 @@ CONFIG_SCHED_DEBUG=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set CONFIG_FORCED_INLINING=y +# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_DEBUG_STACKOVERFLOW is not set @@ -728,6 +744,7 @@ CONFIG_PPC_EARLY_DEBUG=y # CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE is not set # CONFIG_PPC_EARLY_DEBUG_BEAT is not set CONFIG_PPC_EARLY_DEBUG_44x=y +# CONFIG_PPC_EARLY_DEBUG_CPM is not set CONFIG_PPC_EARLY_DEBUG_44x_PHYSLOW=0xef600300 CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH=0x0 @@ -736,6 +753,7 @@ CONFIG_PPC_EARLY_DEBUG_44x_PHYSHIGH=0x0 # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set CONFIG_CRYPTO=y CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_BLKCIPHER=y @@ -755,6 +773,7 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_PCBC=y # CONFIG_CRYPTO_LRW is not set +# CONFIG_CRYPTO_XTS is not set # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_FCRYPT is not set @@ -768,9 +787,12 @@ CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_ARC4 is not set # CONFIG_CRYPTO_KHAZAD is not set # CONFIG_CRYPTO_ANUBIS is not set +# CONFIG_CRYPTO_SEED is not set # CONFIG_CRYPTO_DEFLATE is not set # CONFIG_CRYPTO_MICHAEL_MIC is not set # CONFIG_CRYPTO_CRC32C is not set # CONFIG_CRYPTO_CAMELLIA is not set # CONFIG_CRYPTO_TEST is not set +# CONFIG_CRYPTO_AUTHENC is not set CONFIG_CRYPTO_HW=y +# CONFIG_PPC_CLOCK is not set diff --git a/arch/powerpc/configs/ebony_defconfig b/arch/powerpc/configs/ebony_defconfig index 35a95dd..d3ef642 100644 --- a/arch/powerpc/configs/ebony_defconfig +++ b/arch/powerpc/configs/ebony_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.23-rc4 -# Thu Aug 30 16:34:11 2007 +# Linux kernel version: 2.6.23 +# Thu Oct 18 08:01:57 2007 # # CONFIG_PPC64 is not set @@ -21,8 +21,13 @@ CONFIG_PHYS_64BIT=y # CONFIG_PPC_MM_SLICES is not set CONFIG_NOT_COHERENT_CACHE=y CONFIG_PPC32=y +CONFIG_WORD_SIZE=32 CONFIG_PPC_MERGE=y CONFIG_MMU=y +CONFIG_GENERIC_CMOS_UPDATE=y +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_HARDIRQS=y CONFIG_IRQ_PER_CPU=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y @@ -66,6 +71,8 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_AUDIT is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=14 +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_FAIR_USER_SCHED=y CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set CONFIG_BLK_DEV_INITRD=y @@ -86,7 +93,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -130,7 +136,9 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # CONFIG_PPC_CELL is not set # CONFIG_PPC_CELL_NATIVE is not set # CONFIG_PQ2ADS is not set +# CONFIG_BAMBOO is not set CONFIG_EBONY=y +# CONFIG_SEQUOIA is not set CONFIG_440GP=y # CONFIG_MPIC is not set # CONFIG_MPIC_WEIRD is not set @@ -149,6 +157,10 @@ CONFIG_440GP=y # Kernel options # # CONFIG_HIGHMEM is not set +# CONFIG_TICK_ONESHOT is not set +# CONFIG_NO_HZ is not set +# CONFIG_HIGH_RES_TIMERS is not set +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_300 is not set @@ -170,6 +182,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 @@ -194,10 +207,6 @@ CONFIG_PCI_SYSCALL=y CONFIG_ARCH_SUPPORTS_MSI=y # CONFIG_PCI_MSI is not set # CONFIG_PCI_DEBUG is not set - -# -# PCCARD (PCMCIA/CardBus) support -# # CONFIG_PCCARD is not set # CONFIG_HOTPLUG_PCI is not set @@ -212,7 +221,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y CONFIG_HIGHMEM_START=0xfe000000 CONFIG_LOWMEM_SIZE=0x30000000 CONFIG_KERNEL_START=0xc0000000 -CONFIG_TASK_SIZE=0x80000000 +CONFIG_TASK_SIZE=0xc0000000 CONFIG_CONSISTENT_START=0xff100000 CONFIG_CONSISTENT_SIZE=0x00200000 CONFIG_BOOT_LOAD=0x01000000 @@ -249,6 +258,7 @@ CONFIG_IP_PNP_BOOTP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -306,6 +316,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # # Generic Driver Options # +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y @@ -332,6 +343,7 @@ CONFIG_MTD_BLOCK=y # CONFIG_INFTL is not set # CONFIG_RFD_FTL is not set # CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set # # RAM/ROM/Flash chip drivers @@ -364,6 +376,7 @@ CONFIG_MTD_CFI_UTIL=y # CONFIG_MTD_COMPLEX_MAPPINGS is not set # CONFIG_MTD_PHYSMAP is not set CONFIG_MTD_PHYSMAP_OF=y +# CONFIG_MTD_INTEL_VR_NOR is not set # CONFIG_MTD_PLATRAM is not set # @@ -423,10 +436,6 @@ CONFIG_MISC_DEVICES=y # CONFIG_SCSI_NETLINK is not set # CONFIG_ATA is not set # CONFIG_MD is not set - -# -# Fusion MPT device support -# # CONFIG_FUSION is not set # @@ -443,12 +452,36 @@ CONFIG_NETDEVICES=y # CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set +# CONFIG_VETH is not set +# CONFIG_IP1000 is not set # CONFIG_ARCNET is not set -# CONFIG_NET_ETHERNET is not set +# CONFIG_PHYLIB is not set +CONFIG_NET_ETHERNET=y +# CONFIG_MII is not set +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_CASSINI is not set +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_TULIP is not set +# CONFIG_HP100 is not set +CONFIG_IBM_NEW_EMAC=y +CONFIG_IBM_NEW_EMAC_RXB=128 +CONFIG_IBM_NEW_EMAC_TXB=64 +CONFIG_IBM_NEW_EMAC_POLL_WEIGHT=32 +CONFIG_IBM_NEW_EMAC_RX_COPY_THRESHOLD=256 +CONFIG_IBM_NEW_EMAC_RX_SKB_HEADROOM=0 +# CONFIG_IBM_NEW_EMAC_DEBUG is not set +CONFIG_IBM_NEW_EMAC_ZMII=y +# CONFIG_IBM_NEW_EMAC_RGMII is not set +# CONFIG_IBM_NEW_EMAC_TAH is not set +# CONFIG_IBM_NEW_EMAC_EMAC4 is not set +# CONFIG_NET_PCI is not set +# CONFIG_B44 is not set CONFIG_NETDEV_1000=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set # CONFIG_E1000 is not set +# CONFIG_E1000E is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -456,6 +489,7 @@ CONFIG_NETDEV_1000=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set # CONFIG_SKY2 is not set +# CONFIG_SK98LIN is not set # CONFIG_VIA_VELOCITY is not set # CONFIG_TIGON3 is not set # CONFIG_BNX2 is not set @@ -464,11 +498,14 @@ CONFIG_NETDEV_1000=y CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set +# CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set +# CONFIG_NIU is not set # CONFIG_MLX4_CORE is not set +# CONFIG_TEHUTI is not set # CONFIG_TR is not set # @@ -537,8 +574,6 @@ CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_GEN_RTC is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set -# CONFIG_AGP is not set -# CONFIG_DRM is not set # CONFIG_RAW_DRIVER is not set # CONFIG_TCG_TPM is not set CONFIG_DEVPORT=y @@ -554,6 +589,12 @@ CONFIG_DEVPORT=y # CONFIG_HWMON is not set # +# Sonics Silicon Backplane +# +CONFIG_SSB_POSSIBLE=y +# CONFIG_SSB is not set + +# # Multifunction device drivers # # CONFIG_MFD_SM501 is not set @@ -568,16 +609,17 @@ CONFIG_DEVPORT=y # # Graphics support # +# CONFIG_AGP is not set +# CONFIG_DRM is not set +# CONFIG_VGASTATE is not set +# CONFIG_VIDEO_OUTPUT_CONTROL is not set +# CONFIG_FB is not set # CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Display device support # # CONFIG_DISPLAY_SUPPORT is not set -# CONFIG_VGASTATE is not set -# CONFIG_VIDEO_OUTPUT_CONTROL is not set -# CONFIG_FB is not set -# CONFIG_FB_IBM_GXT4500 is not set # # Sound @@ -604,19 +646,6 @@ CONFIG_USB_ARCH_HAS_EHCI=y # CONFIG_RTC_CLASS is not set # -# DMA Engine support -# -# CONFIG_DMA_ENGINE is not set - -# -# DMA Clients -# - -# -# DMA Devices -# - -# # Userspace I/O # # CONFIG_UIO is not set @@ -668,7 +697,6 @@ CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_TMPFS_POSIX_ACL is not set # CONFIG_HUGETLB_PAGE is not set -CONFIG_RAMFS=y # CONFIG_CONFIGFS_FS is not set # @@ -684,10 +712,12 @@ CONFIG_RAMFS=y CONFIG_JFFS2_FS=y CONFIG_JFFS2_FS_DEBUG=0 CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set # CONFIG_JFFS2_SUMMARY is not set # CONFIG_JFFS2_FS_XATTR is not set # CONFIG_JFFS2_COMPRESSION_OPTIONS is not set CONFIG_JFFS2_ZLIB=y +# CONFIG_JFFS2_LZO is not set CONFIG_JFFS2_RTIME=y # CONFIG_JFFS2_RUBIN is not set CONFIG_CRAMFS=y @@ -696,10 +726,7 @@ CONFIG_CRAMFS=y # CONFIG_QNX4FS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set - -# -# Network File Systems -# +CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y # CONFIG_NFS_V3_ACL is not set @@ -725,15 +752,7 @@ CONFIG_SUNRPC=y # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# # CONFIG_NLS is not set - -# -# Distributed Lock Manager -# # CONFIG_DLM is not set # CONFIG_UCC_SLOW is not set @@ -787,6 +806,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set CONFIG_FORCED_INLINING=y +# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_DEBUG_STACKOVERFLOW is not set @@ -801,6 +821,7 @@ CONFIG_FORCED_INLINING=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set CONFIG_CRYPTO=y CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_BLKCIPHER=y @@ -820,6 +841,7 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_PCBC=y # CONFIG_CRYPTO_LRW is not set +# CONFIG_CRYPTO_XTS is not set # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_FCRYPT is not set @@ -833,9 +855,12 @@ CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_ARC4 is not set # CONFIG_CRYPTO_KHAZAD is not set # CONFIG_CRYPTO_ANUBIS is not set +# CONFIG_CRYPTO_SEED is not set # CONFIG_CRYPTO_DEFLATE is not set # CONFIG_CRYPTO_MICHAEL_MIC is not set # CONFIG_CRYPTO_CRC32C is not set # CONFIG_CRYPTO_CAMELLIA is not set # CONFIG_CRYPTO_TEST is not set +# CONFIG_CRYPTO_AUTHENC is not set # CONFIG_CRYPTO_HW is not set +# CONFIG_PPC_CLOCK is not set diff --git a/arch/powerpc/configs/walnut_defconfig b/arch/powerpc/configs/walnut_defconfig index 7724292..02896ec 100644 --- a/arch/powerpc/configs/walnut_defconfig +++ b/arch/powerpc/configs/walnut_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.23-rc4 -# Wed Sep 5 12:06:37 2007 +# Linux kernel version: 2.6.23 +# Thu Oct 18 12:54:18 2007 # # CONFIG_PPC64 is not set @@ -18,8 +18,13 @@ CONFIG_4xx=y # CONFIG_PPC_MM_SLICES is not set CONFIG_NOT_COHERENT_CACHE=y CONFIG_PPC32=y +CONFIG_WORD_SIZE=32 CONFIG_PPC_MERGE=y CONFIG_MMU=y +CONFIG_GENERIC_CMOS_UPDATE=y +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_HARDIRQS=y CONFIG_IRQ_PER_CPU=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y @@ -63,6 +68,8 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_AUDIT is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=14 +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_FAIR_USER_SCHED=y CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set CONFIG_BLK_DEV_INITRD=y @@ -83,7 +90,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -127,7 +133,9 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # CONFIG_PPC_CELL is not set # CONFIG_PPC_CELL_NATIVE is not set # CONFIG_PQ2ADS is not set +# CONFIG_KILAUEA is not set CONFIG_WALNUT=y +# CONFIG_XILINX_VIRTEX_GENERIC_BOARD is not set CONFIG_405GP=y CONFIG_IBM405_ERR77=y CONFIG_IBM405_ERR51=y @@ -148,6 +156,10 @@ CONFIG_IBM405_ERR51=y # Kernel options # # CONFIG_HIGHMEM is not set +# CONFIG_TICK_ONESHOT is not set +# CONFIG_NO_HZ is not set +# CONFIG_HIGH_RES_TIMERS is not set +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y # CONFIG_HZ_100 is not set CONFIG_HZ_250=y # CONFIG_HZ_300 is not set @@ -169,6 +181,7 @@ CONFIG_FLATMEM_MANUAL=y CONFIG_FLATMEM=y CONFIG_FLAT_NODE_MEM_MAP=y # CONFIG_SPARSEMEM_STATIC is not set +# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 @@ -177,6 +190,8 @@ CONFIG_VIRT_TO_BUS=y CONFIG_PROC_DEVICETREE=y # CONFIG_CMDLINE_BOOL is not set # CONFIG_PM is not set +CONFIG_SUSPEND_UP_POSSIBLE=y +CONFIG_HIBERNATION_UP_POSSIBLE=y CONFIG_SECCOMP=y CONFIG_WANT_DEVICE_TREE=y CONFIG_DEVICE_TREE="walnut.dts" @@ -190,10 +205,6 @@ CONFIG_ZONE_DMA=y # CONFIG_PCI_DOMAINS is not set # CONFIG_PCI_SYSCALL is not set # CONFIG_ARCH_SUPPORTS_MSI is not set - -# -# PCCARD (PCMCIA/CardBus) support -# # CONFIG_PCCARD is not set # @@ -207,7 +218,7 @@ CONFIG_ZONE_DMA=y CONFIG_HIGHMEM_START=0xfe000000 CONFIG_LOWMEM_SIZE=0x30000000 CONFIG_KERNEL_START=0xc0000000 -CONFIG_TASK_SIZE=0x80000000 +CONFIG_TASK_SIZE=0xc0000000 CONFIG_CONSISTENT_START=0xff100000 CONFIG_CONSISTENT_SIZE=0x00200000 CONFIG_BOOT_LOAD=0x00400000 @@ -244,6 +255,7 @@ CONFIG_IP_PNP_BOOTP=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set CONFIG_INET_DIAG=y CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set @@ -301,6 +313,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # # Generic Driver Options # +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y @@ -328,6 +341,7 @@ CONFIG_MTD_BLOCK=m # CONFIG_INFTL is not set # CONFIG_RFD_FTL is not set # CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set # # RAM/ROM/Flash chip drivers @@ -360,7 +374,6 @@ CONFIG_MTD_CFI_UTIL=y # CONFIG_MTD_COMPLEX_MAPPINGS is not set # CONFIG_MTD_PHYSMAP is not set CONFIG_MTD_PHYSMAP_OF=y -# CONFIG_MTD_WALNUT is not set # CONFIG_MTD_PLATRAM is not set # @@ -419,7 +432,22 @@ CONFIG_NETDEVICES=y # CONFIG_MACVLAN is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set -# CONFIG_NET_ETHERNET is not set +# CONFIG_VETH is not set +# CONFIG_PHYLIB is not set +CONFIG_NET_ETHERNET=y +# CONFIG_MII is not set +CONFIG_IBM_NEW_EMAC=y +CONFIG_IBM_NEW_EMAC_RXB=128 +CONFIG_IBM_NEW_EMAC_TXB=64 +CONFIG_IBM_NEW_EMAC_POLL_WEIGHT=32 +CONFIG_IBM_NEW_EMAC_RX_COPY_THRESHOLD=256 +CONFIG_IBM_NEW_EMAC_RX_SKB_HEADROOM=0 +# CONFIG_IBM_NEW_EMAC_DEBUG is not set +CONFIG_IBM_NEW_EMAC_ZMII=y +# CONFIG_IBM_NEW_EMAC_RGMII is not set +# CONFIG_IBM_NEW_EMAC_TAH is not set +# CONFIG_IBM_NEW_EMAC_EMAC4 is not set +# CONFIG_B44 is not set CONFIG_NETDEV_1000=y CONFIG_NETDEV_10000=y @@ -498,6 +526,12 @@ CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_HWMON is not set # +# Sonics Silicon Backplane +# +CONFIG_SSB_POSSIBLE=y +# CONFIG_SSB is not set + +# # Multifunction device drivers # # CONFIG_MFD_SM501 is not set @@ -512,16 +546,15 @@ CONFIG_LEGACY_PTY_COUNT=256 # # Graphics support # +# CONFIG_VGASTATE is not set +CONFIG_VIDEO_OUTPUT_CONTROL=m +# CONFIG_FB is not set # CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Display device support # # CONFIG_DISPLAY_SUPPORT is not set -# CONFIG_VGASTATE is not set -CONFIG_VIDEO_OUTPUT_CONTROL=m -# CONFIG_FB is not set -# CONFIG_FB_IBM_GXT4500 is not set # # Sound @@ -546,19 +579,6 @@ CONFIG_USB_SUPPORT=y # CONFIG_RTC_CLASS is not set # -# DMA Engine support -# -# CONFIG_DMA_ENGINE is not set - -# -# DMA Clients -# - -# -# DMA Devices -# - -# # Userspace I/O # # CONFIG_UIO is not set @@ -610,7 +630,6 @@ CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_TMPFS_POSIX_ACL is not set # CONFIG_HUGETLB_PAGE is not set -CONFIG_RAMFS=y # CONFIG_CONFIGFS_FS is not set # @@ -630,10 +649,7 @@ CONFIG_CRAMFS=y # CONFIG_QNX4FS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set - -# -# Network File Systems -# +CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y # CONFIG_NFS_V3_ACL is not set @@ -659,15 +675,7 @@ CONFIG_SUNRPC=y # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y - -# -# Native Language Support -# # CONFIG_NLS is not set - -# -# Distributed Lock Manager -# # CONFIG_DLM is not set # CONFIG_UCC_SLOW is not set @@ -720,6 +728,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set CONFIG_FORCED_INLINING=y +# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_DEBUG_STACKOVERFLOW is not set @@ -734,6 +743,7 @@ CONFIG_FORCED_INLINING=y # # CONFIG_KEYS is not set # CONFIG_SECURITY is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set CONFIG_CRYPTO=y CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_BLKCIPHER=y @@ -753,6 +763,7 @@ CONFIG_CRYPTO_ECB=y CONFIG_CRYPTO_CBC=y CONFIG_CRYPTO_PCBC=y # CONFIG_CRYPTO_LRW is not set +# CONFIG_CRYPTO_XTS is not set # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_FCRYPT is not set @@ -766,9 +777,12 @@ CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_ARC4 is not set # CONFIG_CRYPTO_KHAZAD is not set # CONFIG_CRYPTO_ANUBIS is not set +# CONFIG_CRYPTO_SEED is not set # CONFIG_CRYPTO_DEFLATE is not set # CONFIG_CRYPTO_MICHAEL_MIC is not set # CONFIG_CRYPTO_CRC32C is not set # CONFIG_CRYPTO_CAMELLIA is not set # CONFIG_CRYPTO_TEST is not set +# CONFIG_CRYPTO_AUTHENC is not set CONFIG_CRYPTO_HW=y +# CONFIG_PPC_CLOCK is not set diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index 47b3b0a..8f6699f 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -100,6 +100,7 @@ config 405GP bool select IBM405_ERR77 select IBM405_ERR51 + select IBM_NEW_EMAC_ZMII config 405EP bool diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 51f3ea4..8390cc1 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -43,14 +43,14 @@ config 440EP bool select PPC_FPU select IBM440EP_ERR42 -# select IBM_NEW_EMAC_ZMII + select IBM_NEW_EMAC_ZMII config 440EPX bool select PPC_FPU -# Disabled until the new EMAC Driver is merged. -# select IBM_NEW_EMAC_EMAC4 -# select IBM_NEW_EMAC_ZMII + select IBM_NEW_EMAC_EMAC4 + select IBM_NEW_EMAC_RGMII + select IBM_NEW_EMAC_ZMII config 440GP bool diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3c7325e..99684ea 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -48,6 +48,7 @@ config 44x bool "AMCC 44x" select PPC_DCR_NATIVE select WANT_DEVICE_TREE + select PPC_UDBG_16550 config E200 bool "Freescale e200" diff --git a/arch/sparc64/kernel/iommu_common.c b/arch/sparc64/kernel/iommu_common.c index 78e8277..b70324e 100644 --- a/arch/sparc64/kernel/iommu_common.c +++ b/arch/sparc64/kernel/iommu_common.c @@ -233,6 +233,11 @@ unsigned long prepare_sg(struct scatterlist *sg, int nents) dma_sg->dma_address = dent_addr; dma_sg->dma_length = dent_len; + if (dma_sg != sg) { + dma_sg = next_sg(dma_sg); + dma_sg->dma_length = 0; + } + return ((unsigned long) dent_addr + (unsigned long) dent_len + (IO_PAGE_SIZE - 1UL)) >> IO_PAGE_SHIFT; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index f876471..0e45981 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -136,6 +136,7 @@ void foo(void) #ifdef CONFIG_LGUEST_GUEST BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig new file mode 100644 index 0000000..c4dffbe --- /dev/null +++ b/arch/x86/lguest/Kconfig @@ -0,0 +1,14 @@ +config LGUEST_GUEST + bool "Lguest guest support" + select PARAVIRT + depends on !X86_PAE + select VIRTIO + select VIRTIO_RING + select VIRTIO_CONSOLE + help + Lguest is a tiny in-kernel hypervisor. Selecting this will + allow your kernel to boot under lguest. This option will increase + your kernel size by about 6k. If in doubt, say N. + + If you say Y here, make sure you say Y (or M) to the virtio block + and net drivers which lguest needs. diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile new file mode 100644 index 0000000..27f0c9e --- /dev/null +++ b/arch/x86/lguest/Makefile @@ -0,0 +1 @@ +obj-y := i386_head.o boot.o diff --git a/drivers/lguest/lguest.c b/arch/x86/lguest/boot.c index 3ba337d..d2235db 100644 --- a/drivers/lguest/lguest.c +++ b/arch/x86/lguest/boot.c @@ -55,7 +55,7 @@ #include <linux/clockchips.h> #include <linux/lguest.h> #include <linux/lguest_launcher.h> -#include <linux/lguest_bus.h> +#include <linux/virtio_console.h> #include <asm/paravirt.h> #include <asm/param.h> #include <asm/page.h> @@ -65,6 +65,7 @@ #include <asm/e820.h> #include <asm/mce.h> #include <asm/io.h> +#include <asm/i387.h> /*G:010 Welcome to the Guest! * @@ -85,9 +86,10 @@ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, .noirq_end = (u32)lguest_noirq_end, + .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ + .syscall_vec = SYSCALL_VECTOR, }; -struct lguest_device_desc *lguest_devices; static cycle_t clock_base; /*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first @@ -146,10 +148,10 @@ void async_hcall(unsigned long call, /* Table full, so do normal hcall which will flush table. */ hcall(call, arg1, arg2, arg3); } else { - lguest_data.hcalls[next_call].eax = call; - lguest_data.hcalls[next_call].edx = arg1; - lguest_data.hcalls[next_call].ebx = arg2; - lguest_data.hcalls[next_call].ecx = arg3; + lguest_data.hcalls[next_call].arg0 = call; + lguest_data.hcalls[next_call].arg1 = arg1; + lguest_data.hcalls[next_call].arg2 = arg2; + lguest_data.hcalls[next_call].arg3 = arg3; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; @@ -160,46 +162,6 @@ void async_hcall(unsigned long call, } /*:*/ -/* Wrappers for the SEND_DMA and BIND_DMA hypercalls. This is mainly because - * Jeff Garzik complained that __pa() should never appear in drivers, and this - * helps remove most of them. But also, it wraps some ugliness. */ -void lguest_send_dma(unsigned long key, struct lguest_dma *dma) -{ - /* The hcall might not write this if something goes wrong */ - dma->used_len = 0; - hcall(LHCALL_SEND_DMA, key, __pa(dma), 0); -} - -int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, - unsigned int num, u8 irq) -{ - /* This is the only hypercall which actually wants 5 arguments, and we - * only support 4. Fortunately the interrupt number is always less - * than 256, so we can pack it with the number of dmas in the final - * argument. */ - if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq)) - return -ENOMEM; - return 0; -} - -/* Unbinding is the same hypercall as binding, but with 0 num & irq. */ -void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas) -{ - hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0); -} - -/* For guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ -void *lguest_map(unsigned long phys_addr, unsigned long pages) -{ - return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); -} - -void lguest_unmap(void *addr) -{ - iounmap((__force void __iomem *)addr); -} - /*G:033 * Here are our first native-instruction replacements: four functions for * interrupt control. @@ -680,6 +642,7 @@ static struct clocksource lguest_clock = { .mask = CLOCKSOURCE_MASK(64), .mult = 1 << 22, .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; /* The "scheduler clock" is just our real clock, adjusted to start at zero */ @@ -761,11 +724,9 @@ static void lguest_time_init(void) * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either * way, the "rating" is initialized so high that it's always chosen * over any other clocksource. */ - if (lguest_data.tsc_khz) { + if (lguest_data.tsc_khz) lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, lguest_clock.shift); - lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS; - } clock_base = lguest_clock_read(); clocksource_register(&lguest_clock); @@ -889,6 +850,23 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } +/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to + * produce console output. */ +static __init int early_put_chars(u32 vtermno, const char *buf, int count) +{ + char scratch[17]; + unsigned int len = count; + + if (len > sizeof(scratch) - 1) + len = sizeof(scratch) - 1; + scratch[len] = '\0'; + memcpy(scratch, buf, len); + hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0); + + /* This routine returns the number of bytes actually written. */ + return len; +} + /*G:050 * Patching (Powerfully Placating Performance Pedants) * @@ -950,18 +928,8 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, /*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops * structures in the kernel provide points for (almost) every routine we have * to override to avoid privileged instructions. */ -__init void lguest_init(void *boot) +__init void lguest_init(void) { - /* Copy boot parameters first: the Launcher put the physical location - * in %esi, and head.S converted that to a virtual address and handed - * it to us. We use "__memcpy" because "memcpy" sometimes tries to do - * tricky things to go faster, and we're not ready for that. */ - __memcpy(&boot_params, boot, PARAM_SIZE); - /* The boot parameters also tell us where the command-line is: save - * that, too. */ - __memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), - COMMAND_LINE_SIZE); - /* We're under lguest, paravirt is enabled, and we're running at * privilege level 1, not 0 as normal. */ pv_info.name = "lguest"; @@ -1033,11 +1001,7 @@ __init void lguest_init(void *boot) /*G:070 Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. - * - * The Host expects our first hypercall to tell it where our "struct - * lguest_data" is, so we do that first. */ - hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0); + * occurs. */ /* The native boot code sets up initial page tables immediately after * the kernel itself, and sets init_pg_tables_end so they're not @@ -1050,11 +1014,6 @@ __init void lguest_init(void *boot) * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); - /* Clear the part of the kernel data which is expected to be zero. - * Normally it will be anyway, but if we're loading from a bzImage with - * CONFIG_RELOCATALE=y, the relocations will be sitting here. */ - memset(__bss_start, 0, __bss_stop - __bss_start); - /* The Host uses the top of the Guest's virtual address space for the * Host<->Guest Switcher, and it tells us how much it needs in * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ @@ -1092,6 +1051,9 @@ __init void lguest_init(void *boot) * adapted for lguest's use. */ add_preferred_console("hvc", 0, NULL); + /* Register our very early console. */ + virtio_cons_early_init(early_put_chars); + /* Last of all, we set the power management poweroff hook to point to * the Guest routine to power off. */ pm_power_off = lguest_power_off; diff --git a/drivers/lguest/lguest_asm.S b/arch/x86/lguest/i386_head.S index 1ddcd5c..ebc6ac7 100644 --- a/drivers/lguest/lguest_asm.S +++ b/arch/x86/lguest/i386_head.S @@ -1,25 +1,47 @@ #include <linux/linkage.h> #include <linux/lguest.h> +#include <asm/lguest_hcall.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/processor-flags.h> -/*G:020 This is where we begin: we have a magic signature which the launcher - * looks for. The plan is that the Linux boot protocol will be extended with a - * "platform type" field which will guide us here from the normal entry point, - * but for the moment this suffices. The normal boot code uses %esi for the - * boot header, so we do too. We convert it to a virtual address by adding - * PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax). +/*G:020 This is where we begin: head.S notes that the boot header's platform + * type field is "1" (lguest), so calls us here. The boot header is in %esi. + * + * WARNING: be very careful here! We're running at addresses equal to physical + * addesses (around 0), not above PAGE_OFFSET as most code expectes + * (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any + * data. * * The .section line puts this code in .init.text so it will be discarded after * boot. */ .section .init.text, "ax", @progbits -.ascii "GenuineLguest" - /* Set up initial stack. */ - movl $(init_thread_union+THREAD_SIZE),%esp - movl %esi, %eax - addl $__PAGE_OFFSET, %eax - jmp lguest_init +ENTRY(lguest_entry) + /* Make initial hypercall now, so we can set up the pagetables. */ + movl $LHCALL_LGUEST_INIT, %eax + movl $lguest_data - __PAGE_OFFSET, %edx + int $LGUEST_TRAP_ENTRY + + /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl + * instruction uses %esi implicitly. */ + movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi + + /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. + * This means the first 128M of kernel memory will be mapped at + * PAGE_OFFSET where the kernel expects to run. This will get it far + * enough through boot to switch to its own pagetables. */ + movl $32, %ecx + movl %esi, %edi + addl $((__PAGE_OFFSET >> 22) * 4), %edi + rep + movsl + + /* Set up the initial stack so we can run C code. */ + movl $(init_thread_union+THREAD_SIZE),%esp + + /* Jumps are relative, and we're running __PAGE_OFFSET too low at the + * moment. */ + jmp lguest_init+__PAGE_OFFSET /*G:055 We create a macro which puts the assembler code between lgstart_ and * lgend_ markers. These templates are put in the .text section: they can't be diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 9df99e1..fbfa55c 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -3,8 +3,9 @@ # config XEN - bool "Enable support for Xen hypervisor" - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES + bool "Xen guest support" + select PARAVIRT + depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 61c2e39..de5ba47 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -1351,11 +1351,21 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, new_segment: if (!sg) sg = sglist; - else + else { + /* + * If the driver previously mapped a shorter + * list, we could see a termination bit + * prematurely unless it fully inits the sg + * table on each mapping. We KNOW that there + * must be more entries here or the driver + * would be buggy, so force clear the + * termination bit to avoid doing a full + * sg_init_table() in drivers for each command. + */ + sg->page_link &= ~0x02; sg = sg_next(sg); + } - sg_dma_len(sg) = 0; - sg_dma_address(sg) = 0; sg_set_page(sg, bvec->bv_page); sg->length = nbytes; sg->offset = bvec->bv_offset; diff --git a/drivers/Kconfig b/drivers/Kconfig index 34f40ea..f4076d9 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -94,5 +94,5 @@ source "drivers/kvm/Kconfig" source "drivers/uio/Kconfig" -source "drivers/lguest/Kconfig" +source "drivers/virtio/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index cfe38ff..560496b 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -91,3 +91,4 @@ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ +obj-$(CONFIG_VIRTIO) += virtio/ diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 5350542..9030c37 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -44,6 +44,7 @@ #include <linux/init.h> #include <linux/jiffies.h> #include <linux/random.h> +#include <linux/scatterlist.h> #include <asm/io.h> #include <asm/uaccess.h> #include "DAC960.h" diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ce4b1e4..4d0119e 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -425,4 +425,10 @@ config XEN_BLKDEV_FRONTEND block device driver. It communicates with a back-end driver in another domain which drives the actual block device. +config VIRTIO_BLK + tristate "Virtio block driver (EXPERIMENTAL)" + depends on EXPERIMENTAL && VIRTIO + ---help--- + This is the virtual block driver for lguest. Say Y or M. + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 014e721..7691505 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -25,10 +25,10 @@ obj-$(CONFIG_SUNVDC) += sunvdc.o obj-$(CONFIG_BLK_DEV_UMEM) += umem.o obj-$(CONFIG_BLK_DEV_NBD) += nbd.o obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o +obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o obj-$(CONFIG_BLK_DEV_UB) += ub.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o -obj-$(CONFIG_LGUEST_BLOCK) += lguest_blk.o diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index efab27f..c8132d9 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -37,6 +37,7 @@ #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/genhd.h> +#include <linux/scatterlist.h> #include <asm/uaccess.h> #include <asm/io.h> diff --git a/drivers/block/lguest_blk.c b/drivers/block/lguest_blk.c deleted file mode 100644 index fa8e423..0000000 --- a/drivers/block/lguest_blk.c +++ /dev/null @@ -1,421 +0,0 @@ -/*D:400 - * The Guest block driver - * - * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc. - * The mechanism is simple: we place the information about the request in the - * device page, then use SEND_DMA (containing the data for a write, or an empty - * "ping" DMA for a read). - :*/ -/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -//#define DEBUG -#include <linux/init.h> -#include <linux/types.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/lguest_bus.h> - -static char next_block_index = 'a'; - -/*D:420 Here is the structure which holds all the information we need about - * each Guest block device. - * - * I'm sure at this stage, you're wondering "hey, where was the adventure I was - * promised?" and thinking "Rusty sucks, I shall say nasty things about him on - * my blog". I think Real adventures have boring bits, too, and you're in the - * middle of one. But it gets better. Just not quite yet. */ -struct blockdev -{ - /* The block queue infrastructure wants a spinlock: it is held while it - * calls our block request function. We grab it in our interrupt - * handler so the responses don't mess with new requests. */ - spinlock_t lock; - - /* The disk structure registered with kernel. */ - struct gendisk *disk; - - /* The major device number for this disk, and the interrupt. We only - * really keep them here for completeness; we'd need them if we - * supported device unplugging. */ - int major; - int irq; - - /* The physical address of this device's memory page */ - unsigned long phys_addr; - /* The mapped memory page for convenient acces. */ - struct lguest_block_page *lb_page; - - /* We only have a single request outstanding at a time: this is it. */ - struct lguest_dma dma; - struct request *req; -}; - -/*D:495 We originally used end_request() throughout the driver, but it turns - * out that end_request() is deprecated, and doesn't actually end the request - * (which seems like a good reason to deprecate it!). It simply ends the first - * bio. So if we had 3 bios in a "struct request" we would do all 3, - * end_request(), do 2, end_request(), do 1 and end_request(): twice as much - * work as we needed to do. - * - * This reinforced to me that I do not understand the block layer. - * - * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a - * request. This improved disk speed by 130%. */ -static void end_entire_request(struct request *req, int uptodate) -{ - if (end_that_request_first(req, uptodate, req->hard_nr_sectors)) - BUG(); - add_disk_randomness(req->rq_disk); - blkdev_dequeue_request(req); - end_that_request_last(req, uptodate); -} - -/* I'm told there are only two stories in the world worth telling: love and - * hate. So there used to be a love scene here like this: - * - * Launcher: We could make beautiful I/O together, you and I. - * Guest: My, that's a big disk! - * - * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */ - -/*D:490 This is the interrupt handler, called when a block read or write has - * been completed for us. */ -static irqreturn_t lgb_irq(int irq, void *_bd) -{ - /* We handed our "struct blockdev" as the argument to request_irq(), so - * it is passed through to us here. This tells us which device we're - * dealing with in case we have more than one. */ - struct blockdev *bd = _bd; - unsigned long flags; - - /* We weren't doing anything? Strange, but could happen if we shared - * interrupts (we don't!). */ - if (!bd->req) { - pr_debug("No work!\n"); - return IRQ_NONE; - } - - /* Not done yet? That's equally strange. */ - if (!bd->lb_page->result) { - pr_debug("No result!\n"); - return IRQ_NONE; - } - - /* We have to grab the lock before ending the request. */ - spin_lock_irqsave(&bd->lock, flags); - /* "result" is 1 for success, 2 for failure: end_entire_request() wants - * to know whether this succeeded or not. */ - end_entire_request(bd->req, bd->lb_page->result == 1); - /* Clear out request, it's done. */ - bd->req = NULL; - /* Reset incoming DMA for next time. */ - bd->dma.used_len = 0; - /* Ready for more reads or writes */ - blk_start_queue(bd->disk->queue); - spin_unlock_irqrestore(&bd->lock, flags); - - /* The interrupt was for us, we dealt with it. */ - return IRQ_HANDLED; -} - -/*D:480 The block layer's "struct request" contains a number of "struct bio"s, - * each of which contains "struct bio_vec"s, each of which contains a page, an - * offset and a length. - * - * Fortunately there are iterators to help us walk through the "struct - * request". Even more fortunately, there were plenty of places to steal the - * code from. We pack the "struct request" into our "struct lguest_dma" and - * return the total length. */ -static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma) -{ - unsigned int i = 0, len = 0; - struct req_iterator iter; - struct bio_vec *bvec; - - rq_for_each_segment(bvec, req, iter) { - /* We told the block layer not to give us too many. */ - BUG_ON(i == LGUEST_MAX_DMA_SECTIONS); - /* If we had a zero-length segment, it would look like - * the end of the data referred to by the "struct - * lguest_dma", so make sure that doesn't happen. */ - BUG_ON(!bvec->bv_len); - /* Convert page & offset to a physical address */ - dma->addr[i] = page_to_phys(bvec->bv_page) - + bvec->bv_offset; - dma->len[i] = bvec->bv_len; - len += bvec->bv_len; - i++; - } - /* If the array isn't full, we mark the end with a 0 length */ - if (i < LGUEST_MAX_DMA_SECTIONS) - dma->len[i] = 0; - return len; -} - -/* This creates an empty DMA, useful for prodding the Host without sending data - * (ie. when we want to do a read) */ -static void empty_dma(struct lguest_dma *dma) -{ - dma->len[0] = 0; -} - -/*D:470 Setting up a request is fairly easy: */ -static void setup_req(struct blockdev *bd, - int type, struct request *req, struct lguest_dma *dma) -{ - /* The type is 1 (write) or 0 (read). */ - bd->lb_page->type = type; - /* The sector on disk where the read or write starts. */ - bd->lb_page->sector = req->sector; - /* The result is initialized to 0 (unfinished). */ - bd->lb_page->result = 0; - /* The current request (so we can end it in the interrupt handler). */ - bd->req = req; - /* The number of bytes: returned as a side-effect of req_to_dma(), - * which packs the block layer's "struct request" into our "struct - * lguest_dma" */ - bd->lb_page->bytes = req_to_dma(req, dma); -} - -/*D:450 Write is pretty straightforward: we pack the request into a "struct - * lguest_dma", then use SEND_DMA to send the request. */ -static void do_write(struct blockdev *bd, struct request *req) -{ - struct lguest_dma send; - - pr_debug("lgb: WRITE sector %li\n", (long)req->sector); - setup_req(bd, 1, req, &send); - - lguest_send_dma(bd->phys_addr, &send); -} - -/* Read is similar to write, except we pack the request into our receive - * "struct lguest_dma" and send through an empty DMA just to tell the Host that - * there's a request pending. */ -static void do_read(struct blockdev *bd, struct request *req) -{ - struct lguest_dma ping; - - pr_debug("lgb: READ sector %li\n", (long)req->sector); - setup_req(bd, 0, req, &bd->dma); - - empty_dma(&ping); - lguest_send_dma(bd->phys_addr, &ping); -} - -/*D:440 This where requests come in: we get handed the request queue and are - * expected to pull a "struct request" off it until we've finished them or - * we're waiting for a reply: */ -static void do_lgb_request(struct request_queue *q) -{ - struct blockdev *bd; - struct request *req; - -again: - /* This sometimes returns NULL even on the very first time around. I - * wonder if it's something to do with letting elves handle the request - * queue... */ - req = elv_next_request(q); - if (!req) - return; - - /* We attached the struct blockdev to the disk: get it back */ - bd = req->rq_disk->private_data; - /* Sometimes we get repeated requests after blk_stop_queue(), but we - * can only handle one at a time. */ - if (bd->req) - return; - - /* We only do reads and writes: no tricky business! */ - if (!blk_fs_request(req)) { - pr_debug("Got non-command 0x%08x\n", req->cmd_type); - req->errors++; - end_entire_request(req, 0); - goto again; - } - - if (rq_data_dir(req) == WRITE) - do_write(bd, req); - else - do_read(bd, req); - - /* We've put out the request, so stop any more coming in until we get - * an interrupt, which takes us to lgb_irq() to re-enable the queue. */ - blk_stop_queue(q); -} - -/*D:430 This is the "struct block_device_operations" we attach to the disk at - * the end of lguestblk_probe(). It doesn't seem to want much. */ -static struct block_device_operations lguestblk_fops = { - .owner = THIS_MODULE, -}; - -/*D:425 Setting up a disk device seems to involve a lot of code. I'm not sure - * quite why. I do know that the IDE code sent two or three of the maintainers - * insane, perhaps this is the fringe of the same disease? - * - * As in the console code, the probe function gets handed the generic - * lguest_device from lguest_bus.c: */ -static int lguestblk_probe(struct lguest_device *lgdev) -{ - struct blockdev *bd; - int err; - int irqflags = IRQF_SHARED; - - /* First we allocate our own "struct blockdev" and initialize the easy - * fields. */ - bd = kmalloc(sizeof(*bd), GFP_KERNEL); - if (!bd) - return -ENOMEM; - - spin_lock_init(&bd->lock); - bd->irq = lgdev_irq(lgdev); - bd->req = NULL; - bd->dma.used_len = 0; - bd->dma.len[0] = 0; - /* The descriptor in the lguest_devices array provided by the Host - * gives the Guest the physical page number of the device's page. */ - bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT); - - /* We use lguest_map() to get a pointer to the device page */ - bd->lb_page = lguest_map(bd->phys_addr, 1); - if (!bd->lb_page) { - err = -ENOMEM; - goto out_free_bd; - } - - /* We need a major device number: 0 means "assign one dynamically". */ - bd->major = register_blkdev(0, "lguestblk"); - if (bd->major < 0) { - err = bd->major; - goto out_unmap; - } - - /* This allocates a "struct gendisk" where we pack all the information - * about the disk which the rest of Linux sees. The argument is the - * number of minor devices desired: we need one minor for the main - * disk, and one for each partition. Of course, we can't possibly know - * how many partitions are on the disk (add_disk does that). - */ - bd->disk = alloc_disk(16); - if (!bd->disk) { - err = -ENOMEM; - goto out_unregister_blkdev; - } - - /* Every disk needs a queue for requests to come in: we set up the - * queue with a callback function (the core of our driver) and the lock - * to use. */ - bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock); - if (!bd->disk->queue) { - err = -ENOMEM; - goto out_put_disk; - } - - /* We can only handle a certain number of pointers in our SEND_DMA - * call, so we set that with blk_queue_max_hw_segments(). This is not - * to be confused with blk_queue_max_phys_segments() of course! I - * know, who could possibly confuse the two? - * - * Well, it's simple to tell them apart: this one seems to work and the - * other one didn't. */ - blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS); - - /* Due to technical limitations of our Host (and simple coding) we - * can't have a single buffer which crosses a page boundary. Tell it - * here. This means that our maximum request size is 16 - * (LGUEST_MAX_DMA_SECTIONS) pages. */ - blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1); - - /* We name our disk: this becomes the device name when udev does its - * magic thing and creates the device node, such as /dev/lgba. - * next_block_index is a global which starts at 'a'. Unfortunately - * this simple increment logic means that the 27th disk will be called - * "/dev/lgb{". In that case, I recommend having at least 29 disks, so - * your /dev directory will be balanced. */ - sprintf(bd->disk->disk_name, "lgb%c", next_block_index++); - - /* We look to the device descriptor again to see if this device's - * interrupts are expected to be random. If they are, we tell the irq - * subsystem. At the moment this bit is always set. */ - if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) - irqflags |= IRQF_SAMPLE_RANDOM; - - /* Now we have the name and irqflags, we can request the interrupt; we - * give it the "struct blockdev" we have set up to pass to lgb_irq() - * when there is an interrupt. */ - err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd); - if (err) - goto out_cleanup_queue; - - /* We bind our one-entry DMA pool to the key for this block device so - * the Host can reply to our requests. The key is equal to the - * physical address of the device's page, which is conveniently - * unique. */ - err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq); - if (err) - goto out_free_irq; - - /* We finish our disk initialization and add the disk to the system. */ - bd->disk->major = bd->major; - bd->disk->first_minor = 0; - bd->disk->private_data = bd; - bd->disk->fops = &lguestblk_fops; - /* This is initialized to the disk size by the Launcher. */ - set_capacity(bd->disk, bd->lb_page->num_sectors); - add_disk(bd->disk); - - printk(KERN_INFO "%s: device %i at major %d\n", - bd->disk->disk_name, lgdev->index, bd->major); - - /* We don't need to keep the "struct blockdev" around, but if we ever - * implemented device removal, we'd need this. */ - lgdev->private = bd; - return 0; - -out_free_irq: - free_irq(bd->irq, bd); -out_cleanup_queue: - blk_cleanup_queue(bd->disk->queue); -out_put_disk: - put_disk(bd->disk); -out_unregister_blkdev: - unregister_blkdev(bd->major, "lguestblk"); -out_unmap: - lguest_unmap(bd->lb_page); -out_free_bd: - kfree(bd); - return err; -} - -/*D:410 The boilerplate code for registering the lguest block driver is just - * like the console: */ -static struct lguest_driver lguestblk_drv = { - .name = "lguestblk", - .owner = THIS_MODULE, - .device_type = LGUEST_DEVICE_T_BLOCK, - .probe = lguestblk_probe, -}; - -static __init int lguestblk_init(void) -{ - return register_lguest_driver(&lguestblk_drv); -} -module_init(lguestblk_init); - -MODULE_DESCRIPTION("Lguest block driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 282a6955..52dc5e1 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -27,6 +27,7 @@ #include <linux/hdreg.h> #include <linux/dma-mapping.h> #include <linux/completion.h> +#include <linux/scatterlist.h> #include <asm/io.h> #include <asm/uaccess.h> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c new file mode 100644 index 0000000..a901eee --- /dev/null +++ b/drivers/block/virtio_blk.c @@ -0,0 +1,308 @@ +//#define DEBUG +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> +#include <linux/virtio.h> +#include <linux/virtio_blk.h> +#include <linux/virtio_blk.h> + +static unsigned char virtblk_index = 'a'; +struct virtio_blk +{ + spinlock_t lock; + + struct virtio_device *vdev; + struct virtqueue *vq; + + /* The disk structure for the kernel. */ + struct gendisk *disk; + + /* Request tracking. */ + struct list_head reqs; + + mempool_t *pool; + + /* Scatterlist: can be too big for stack. */ + struct scatterlist sg[3+MAX_PHYS_SEGMENTS]; +}; + +struct virtblk_req +{ + struct list_head list; + struct request *req; + struct virtio_blk_outhdr out_hdr; + struct virtio_blk_inhdr in_hdr; +}; + +static bool blk_done(struct virtqueue *vq) +{ + struct virtio_blk *vblk = vq->vdev->priv; + struct virtblk_req *vbr; + unsigned int len; + unsigned long flags; + + spin_lock_irqsave(&vblk->lock, flags); + while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { + int uptodate; + switch (vbr->in_hdr.status) { + case VIRTIO_BLK_S_OK: + uptodate = 1; + break; + case VIRTIO_BLK_S_UNSUPP: + uptodate = -ENOTTY; + break; + default: + uptodate = 0; + break; + } + + end_dequeued_request(vbr->req, uptodate); + list_del(&vbr->list); + mempool_free(vbr, vblk->pool); + } + /* In case queue is stopped waiting for more buffers. */ + blk_start_queue(vblk->disk->queue); + spin_unlock_irqrestore(&vblk->lock, flags); + return true; +} + +static bool do_req(struct request_queue *q, struct virtio_blk *vblk, + struct request *req) +{ + unsigned long num, out, in; + struct virtblk_req *vbr; + + vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); + if (!vbr) + /* When another request finishes we'll try again. */ + return false; + + vbr->req = req; + if (blk_fs_request(vbr->req)) { + vbr->out_hdr.type = 0; + vbr->out_hdr.sector = vbr->req->sector; + vbr->out_hdr.ioprio = vbr->req->ioprio; + } else if (blk_pc_request(vbr->req)) { + vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = vbr->req->ioprio; + } else { + /* We don't put anything else in the queue. */ + BUG(); + } + + if (blk_barrier_rq(vbr->req)) + vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; + + /* We have to zero this, otherwise blk_rq_map_sg gets upset. */ + memset(vblk->sg, 0, sizeof(vblk->sg)); + sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr)); + num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); + sg_set_buf(&vblk->sg[num+1], &vbr->in_hdr, sizeof(vbr->in_hdr)); + + if (rq_data_dir(vbr->req) == WRITE) { + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + out = 1 + num; + in = 1; + } else { + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + out = 1; + in = 1 + num; + } + + if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) { + mempool_free(vbr, vblk->pool); + return false; + } + + list_add_tail(&vbr->list, &vblk->reqs); + return true; +} + +static void do_virtblk_request(struct request_queue *q) +{ + struct virtio_blk *vblk = NULL; + struct request *req; + unsigned int issued = 0; + + while ((req = elv_next_request(q)) != NULL) { + vblk = req->rq_disk->private_data; + BUG_ON(req->nr_phys_segments > ARRAY_SIZE(vblk->sg)); + + /* If this request fails, stop queue and wait for something to + finish to restart it. */ + if (!do_req(q, vblk, req)) { + blk_stop_queue(q); + break; + } + blkdev_dequeue_request(req); + issued++; + } + + if (issued) + vblk->vq->vq_ops->kick(vblk->vq); +} + +static int virtblk_ioctl(struct inode *inode, struct file *filp, + unsigned cmd, unsigned long data) +{ + return scsi_cmd_ioctl(filp, inode->i_bdev->bd_disk->queue, + inode->i_bdev->bd_disk, cmd, + (void __user *)data); +} + +static struct block_device_operations virtblk_fops = { + .ioctl = virtblk_ioctl, + .owner = THIS_MODULE, +}; + +static int virtblk_probe(struct virtio_device *vdev) +{ + struct virtio_blk *vblk; + int err, major; + void *token; + unsigned int len; + u64 cap; + u32 v; + + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); + if (!vblk) { + err = -ENOMEM; + goto out; + } + + INIT_LIST_HEAD(&vblk->reqs); + spin_lock_init(&vblk->lock); + vblk->vdev = vdev; + + /* We expect one virtqueue, for output. */ + vblk->vq = vdev->config->find_vq(vdev, blk_done); + if (IS_ERR(vblk->vq)) { + err = PTR_ERR(vblk->vq); + goto out_free_vblk; + } + + vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); + if (!vblk->pool) { + err = -ENOMEM; + goto out_free_vq; + } + + major = register_blkdev(0, "virtblk"); + if (major < 0) { + err = major; + goto out_mempool; + } + + /* FIXME: How many partitions? How long is a piece of string? */ + vblk->disk = alloc_disk(1 << 4); + if (!vblk->disk) { + err = -ENOMEM; + goto out_unregister_blkdev; + } + + vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); + if (!vblk->disk->queue) { + err = -ENOMEM; + goto out_put_disk; + } + + sprintf(vblk->disk->disk_name, "vd%c", virtblk_index++); + vblk->disk->major = major; + vblk->disk->first_minor = 0; + vblk->disk->private_data = vblk; + vblk->disk->fops = &virtblk_fops; + + /* If barriers are supported, tell block layer that queue is ordered */ + token = vdev->config->find(vdev, VIRTIO_CONFIG_BLK_F, &len); + if (virtio_use_bit(vdev, token, len, VIRTIO_BLK_F_BARRIER)) + blk_queue_ordered(vblk->disk->queue, QUEUE_ORDERED_TAG, NULL); + + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_CAPACITY, &cap); + if (err) { + dev_err(&vdev->dev, "Bad/missing capacity in config\n"); + goto out_put_disk; + } + + /* If capacity is too big, truncate with warning. */ + if ((sector_t)cap != cap) { + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", + (unsigned long long)cap); + cap = (sector_t)-1; + } + set_capacity(vblk->disk, cap); + + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SIZE_MAX, &v); + if (!err) + blk_queue_max_segment_size(vblk->disk->queue, v); + else if (err != -ENOENT) { + dev_err(&vdev->dev, "Bad SIZE_MAX in config\n"); + goto out_put_disk; + } + + err = virtio_config_val(vdev, VIRTIO_CONFIG_BLK_F_SEG_MAX, &v); + if (!err) + blk_queue_max_hw_segments(vblk->disk->queue, v); + else if (err != -ENOENT) { + dev_err(&vdev->dev, "Bad SEG_MAX in config\n"); + goto out_put_disk; + } + + add_disk(vblk->disk); + return 0; + +out_put_disk: + put_disk(vblk->disk); +out_unregister_blkdev: + unregister_blkdev(major, "virtblk"); +out_mempool: + mempool_destroy(vblk->pool); +out_free_vq: + vdev->config->del_vq(vblk->vq); +out_free_vblk: + kfree(vblk); +out: + return err; +} + +static void virtblk_remove(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + int major = vblk->disk->major; + + BUG_ON(!list_empty(&vblk->reqs)); + blk_cleanup_queue(vblk->disk->queue); + put_disk(vblk->disk); + unregister_blkdev(major, "virtblk"); + mempool_destroy(vblk->pool); + kfree(vblk); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_blk = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtblk_probe, + .remove = __devexit_p(virtblk_remove), +}; + +static int __init init(void) +{ + return register_virtio_driver(&virtio_blk); +} + +static void __exit fini(void) +{ + unregister_virtio_driver(&virtio_blk); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio block driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 6549110..bf18d75 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -613,6 +613,10 @@ config HVC_XEN help Xen virtual console device driver +config VIRTIO_CONSOLE + bool + select HVC_DRIVER + config HVCS tristate "IBM Hypervisor Virtual Console Server support" depends on PPC_PSERIES diff --git a/drivers/char/Makefile b/drivers/char/Makefile index c78ff26..07304d5 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_SYNCLINK_GT) += synclink_gt.o obj-$(CONFIG_N_HDLC) += n_hdlc.o obj-$(CONFIG_AMIGA_BUILTIN_SERIAL) += amiserial.o obj-$(CONFIG_SX) += sx.o generic_serial.o -obj-$(CONFIG_LGUEST_GUEST) += hvc_lguest.o obj-$(CONFIG_RIO) += rio/ generic_serial.o obj-$(CONFIG_HVC_CONSOLE) += hvc_vio.o hvsi.o obj-$(CONFIG_HVC_ISERIES) += hvc_iseries.o @@ -50,6 +49,7 @@ obj-$(CONFIG_HVC_RTAS) += hvc_rtas.o obj-$(CONFIG_HVC_BEAT) += hvc_beat.o obj-$(CONFIG_HVC_DRIVER) += hvc_console.o obj-$(CONFIG_HVC_XEN) += hvc_xen.o +obj-$(CONFIG_VIRTIO_CONSOLE) += virtio_console.o obj-$(CONFIG_RAW_DRIVER) += raw.o obj-$(CONFIG_SGI_SNSC) += snsc.o snsc_event.o obj-$(CONFIG_MSPEC) += mspec.o diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index d1bd0f0..e4f579c 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -1602,8 +1602,8 @@ static void cyz_handle_tx(struct cyclades_port *info, info->icount.tx++; } #endif -ztxdone: tty_wakeup(tty); +ztxdone: /* Update tx_put */ cy_writel(&buf_ctrl->tx_put, tx_put); } diff --git a/drivers/char/hvc_lguest.c b/drivers/char/hvc_lguest.c deleted file mode 100644 index efccb21..0000000 --- a/drivers/char/hvc_lguest.c +++ /dev/null @@ -1,177 +0,0 @@ -/*D:300 - * The Guest console driver - * - * This is a trivial console driver: we use lguest's DMA mechanism to send - * bytes out, and register a DMA buffer to receive bytes in. It is assumed to - * be present and available from the very beginning of boot. - * - * Writing console drivers is one of the few remaining Dark Arts in Linux. - * Fortunately for us, the path of virtual consoles has been well-trodden by - * the PowerPC folks, who wrote "hvc_console.c" to generically support any - * virtual console. We use that infrastructure which only requires us to write - * the basic put_chars and get_chars functions and call the right register - * functions. - :*/ - -/*M:002 The console can be flooded: while the Guest is processing input the - * Host can send more. Buffering in the Host could alleviate this, but it is a - * difficult problem in general. :*/ -/* Copyright (C) 2006 Rusty Russell, IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -#include <linux/err.h> -#include <linux/init.h> -#include <linux/lguest_bus.h> -#include <asm/paravirt.h> -#include "hvc_console.h" - -/*D:340 This is our single console input buffer, with associated "struct - * lguest_dma" referring to it. Note the 0-terminated length array, and the - * use of physical address for the buffer itself. */ -static char inbuf[256]; -static struct lguest_dma cons_input = { .used_len = 0, - .addr[0] = __pa(inbuf), - .len[0] = sizeof(inbuf), - .len[1] = 0 }; - -/*D:310 The put_chars() callback is pretty straightforward. - * - * First we put the pointer and length in a "struct lguest_dma": we only have - * one pointer, so we set the second length to 0. Then we use SEND_DMA to send - * the data to (Host) buffers attached to the console key. Usually a device's - * key is a physical address within the device's memory, but because the - * console device doesn't have any associated physical memory, we use the - * LGUEST_CONSOLE_DMA_KEY constant (aka 0). */ -static int put_chars(u32 vtermno, const char *buf, int count) -{ - struct lguest_dma dma; - - /* FIXME: DMA buffers in a "struct lguest_dma" are not allowed - * to go over page boundaries. This never seems to happen, - * but if it did we'd need to fix this code. */ - dma.len[0] = count; - dma.len[1] = 0; - dma.addr[0] = __pa(buf); - - lguest_send_dma(LGUEST_CONSOLE_DMA_KEY, &dma); - /* We're expected to return the amount of data we wrote: all of it. */ - return count; -} - -/*D:350 get_chars() is the callback from the hvc_console infrastructure when - * an interrupt is received. - * - * Firstly we see if our buffer has been filled: if not, we return. The rest - * of the code deals with the fact that the hvc_console() infrastructure only - * asks us for 16 bytes at a time. We keep a "cons_offset" variable for - * partially-read buffers. */ -static int get_chars(u32 vtermno, char *buf, int count) -{ - static int cons_offset; - - /* Nothing left to see here... */ - if (!cons_input.used_len) - return 0; - - /* You want more than we have to give? Well, try wanting less! */ - if (cons_input.used_len - cons_offset < count) - count = cons_input.used_len - cons_offset; - - /* Copy across to their buffer and increment offset. */ - memcpy(buf, inbuf + cons_offset, count); - cons_offset += count; - - /* Finished? Zero offset, and reset cons_input so Host will use it - * again. */ - if (cons_offset == cons_input.used_len) { - cons_offset = 0; - cons_input.used_len = 0; - } - return count; -} -/*:*/ - -static struct hv_ops lguest_cons = { - .get_chars = get_chars, - .put_chars = put_chars, -}; - -/*D:320 Console drivers are initialized very early so boot messages can go - * out. At this stage, the console is output-only. Our driver checks we're a - * Guest, and if so hands hvc_instantiate() the console number (0), priority - * (0), and the struct hv_ops containing the put_chars() function. */ -static int __init cons_init(void) -{ - if (strcmp(pv_info.name, "lguest") != 0) - return 0; - - return hvc_instantiate(0, 0, &lguest_cons); -} -console_initcall(cons_init); - -/*D:370 To set up and manage our virtual console, we call hvc_alloc() and - * stash the result in the private pointer of the "struct lguest_device". - * Since we never remove the console device we never need this pointer again, - * but using ->private is considered good form, and you never know who's going - * to copy your driver. - * - * Once the console is set up, we bind our input buffer ready for input. */ -static int lguestcons_probe(struct lguest_device *lgdev) -{ - int err; - - /* The first argument of hvc_alloc() is the virtual console number, so - * we use zero. The second argument is the interrupt number. - * - * The third argument is a "struct hv_ops" containing the put_chars() - * and get_chars() pointers. The final argument is the output buffer - * size: we use 256 and expect the Host to have room for us to send - * that much. */ - lgdev->private = hvc_alloc(0, lgdev_irq(lgdev), &lguest_cons, 256); - if (IS_ERR(lgdev->private)) - return PTR_ERR(lgdev->private); - - /* We bind a single DMA buffer at key LGUEST_CONSOLE_DMA_KEY. - * "cons_input" is that statically-initialized global DMA buffer we saw - * above, and we also give the interrupt we want. */ - err = lguest_bind_dma(LGUEST_CONSOLE_DMA_KEY, &cons_input, 1, - lgdev_irq(lgdev)); - if (err) - printk("lguest console: failed to bind buffer.\n"); - return err; -} -/* Note the use of lgdev_irq() for the interrupt number. We tell hvc_alloc() - * to expect input when this interrupt is triggered, and then tell - * lguest_bind_dma() that is the interrupt to send us when input comes in. */ - -/*D:360 From now on the console driver follows standard Guest driver form: - * register_lguest_driver() registers the device type and probe function, and - * the probe function sets up the device. - * - * The standard "struct lguest_driver": */ -static struct lguest_driver lguestcons_drv = { - .name = "lguestcons", - .owner = THIS_MODULE, - .device_type = LGUEST_DEVICE_T_CONSOLE, - .probe = lguestcons_probe, -}; - -/* The standard init function */ -static int __init hvc_lguest_init(void) -{ - return register_lguest_driver(&lguestcons_drv); -} -module_init(hvc_lguest_init); diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c new file mode 100644 index 0000000..100e8a2 --- /dev/null +++ b/drivers/char/virtio_console.c @@ -0,0 +1,225 @@ +/*D:300 + * The Guest console driver + * + * Writing console drivers is one of the few remaining Dark Arts in Linux. + * Fortunately for us, the path of virtual consoles has been well-trodden by + * the PowerPC folks, who wrote "hvc_console.c" to generically support any + * virtual console. We use that infrastructure which only requires us to write + * the basic put_chars and get_chars functions and call the right register + * functions. + :*/ + +/*M:002 The console can be flooded: while the Guest is processing input the + * Host can send more. Buffering in the Host could alleviate this, but it is a + * difficult problem in general. :*/ +/* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/err.h> +#include <linux/init.h> +#include <linux/virtio.h> +#include <linux/virtio_console.h> +#include "hvc_console.h" + +/*D:340 These represent our input and output console queues, and the virtio + * operations for them. */ +static struct virtqueue *in_vq, *out_vq; +static struct virtio_device *vdev; + +/* This is our input buffer, and how much data is left in it. */ +static unsigned int in_len; +static char *in, *inbuf; + +/* The operations for our console. */ +static struct hv_ops virtio_cons; + +/*D:310 The put_chars() callback is pretty straightforward. + * + * We turn the characters into a scatter-gather list, add it to the output + * queue and then kick the Host. Then we sit here waiting for it to finish: + * inefficient in theory, but in practice implementations will do it + * immediately (lguest's Launcher does). */ +static int put_chars(u32 vtermno, const char *buf, int count) +{ + struct scatterlist sg[1]; + unsigned int len; + + /* This is a convenient routine to initialize a single-elem sg list */ + sg_init_one(sg, buf, count); + + /* add_buf wants a token to identify this buffer: we hand it any + * non-NULL pointer, since there's only ever one buffer. */ + if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { + /* Tell Host to go! */ + out_vq->vq_ops->kick(out_vq); + /* Chill out until it's done with the buffer. */ + while (!out_vq->vq_ops->get_buf(out_vq, &len)) + cpu_relax(); + } + + /* We're expected to return the amount of data we wrote: all of it. */ + return count; +} + +/* Create a scatter-gather list representing our input buffer and put it in the + * queue. */ +static void add_inbuf(void) +{ + struct scatterlist sg[1]; + sg_init_one(sg, inbuf, PAGE_SIZE); + + /* We should always be able to add one buffer to an empty queue. */ + if (in_vq->vq_ops->add_buf(in_vq, sg, 0, 1, inbuf) != 0) + BUG(); + in_vq->vq_ops->kick(in_vq); +} + +/*D:350 get_chars() is the callback from the hvc_console infrastructure when + * an interrupt is received. + * + * Most of the code deals with the fact that the hvc_console() infrastructure + * only asks us for 16 bytes at a time. We keep in_offset and in_used fields + * for partially-filled buffers. */ +static int get_chars(u32 vtermno, char *buf, int count) +{ + /* If we don't have an input queue yet, we can't get input. */ + BUG_ON(!in_vq); + + /* No buffer? Try to get one. */ + if (!in_len) { + in = in_vq->vq_ops->get_buf(in_vq, &in_len); + if (!in) + return 0; + } + + /* You want more than we have to give? Well, try wanting less! */ + if (in_len < count) + count = in_len; + + /* Copy across to their buffer and increment offset. */ + memcpy(buf, in, count); + in += count; + in_len -= count; + + /* Finished? Re-register buffer so Host will use it again. */ + if (in_len == 0) + add_inbuf(); + + return count; +} +/*:*/ + +/*D:320 Console drivers are initialized very early so boot messages can go out, + * so we do things slightly differently from the generic virtio initialization + * of the net and block drivers. + * + * At this stage, the console is output-only. It's too early to set up a + * virtqueue, so we let the drivers do some boutique early-output thing. */ +int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)) +{ + virtio_cons.put_chars = put_chars; + return hvc_instantiate(0, 0, &virtio_cons); +} + +/*D:370 Once we're further in boot, we get probed like any other virtio device. + * At this stage we set up the output virtqueue. + * + * To set up and manage our virtual console, we call hvc_alloc(). Since we + * never remove the console device we never need this pointer again. + * + * Finally we put our input buffer in the input queue, ready to receive. */ +static int virtcons_probe(struct virtio_device *dev) +{ + int err; + struct hvc_struct *hvc; + + vdev = dev; + + /* This is the scratch page we use to receive console input */ + inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!inbuf) { + err = -ENOMEM; + goto fail; + } + + /* Find the input queue. */ + /* FIXME: This is why we want to wean off hvc: we do nothing + * when input comes in. */ + in_vq = vdev->config->find_vq(vdev, NULL); + if (IS_ERR(in_vq)) { + err = PTR_ERR(in_vq); + goto free; + } + + out_vq = vdev->config->find_vq(vdev, NULL); + if (IS_ERR(out_vq)) { + err = PTR_ERR(out_vq); + goto free_in_vq; + } + + /* Start using the new console output. */ + virtio_cons.get_chars = get_chars; + virtio_cons.put_chars = put_chars; + + /* The first argument of hvc_alloc() is the virtual console number, so + * we use zero. The second argument is the interrupt number; we + * currently leave this as zero: it would be better not to use the + * hvc mechanism and fix this (FIXME!). + * + * The third argument is a "struct hv_ops" containing the put_chars() + * and get_chars() pointers. The final argument is the output buffer + * size: we can do any size, so we put PAGE_SIZE here. */ + hvc = hvc_alloc(0, 0, &virtio_cons, PAGE_SIZE); + if (IS_ERR(hvc)) { + err = PTR_ERR(hvc); + goto free_out_vq; + } + + /* Register the input buffer the first time. */ + add_inbuf(); + return 0; + +free_out_vq: + vdev->config->del_vq(out_vq); +free_in_vq: + vdev->config->del_vq(in_vq); +free: + kfree(inbuf); +fail: + return err; +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_CONSOLE, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_console = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtcons_probe, +}; + +static int __init init(void) +{ + return register_virtio_driver(&virtio_console); +} +module_init(init); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio console driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index ec55a17..6a6f2e0 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -47,6 +47,7 @@ #include <linux/spinlock.h> #include <linux/kmod.h> #include <linux/pci.h> +#include <linux/scatterlist.h> #include <asm/byteorder.h> #include <asm/irq.h> diff --git a/drivers/ieee1394/dma.c b/drivers/ieee1394/dma.c index 25e113b..3051e31 100644 --- a/drivers/ieee1394/dma.c +++ b/drivers/ieee1394/dma.c @@ -12,7 +12,7 @@ #include <linux/pci.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include "dma.h" diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d08fb30..0751697 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -114,13 +114,16 @@ struct rdma_id_private { struct rdma_bind_list *bind_list; struct hlist_node node; - struct list_head list; - struct list_head listen_list; + struct list_head list; /* listen_any_list or cma_device.list */ + struct list_head listen_list; /* per device listens */ struct cma_device *cma_dev; struct list_head mc_list; + int internal_id; enum cma_state state; spinlock_t lock; + struct mutex qp_mutex; + struct completion comp; atomic_t refcount; wait_queue_head_t wait_remove; @@ -389,6 +392,7 @@ struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler, id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; spin_lock_init(&id_priv->lock); + mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); atomic_set(&id_priv->refcount, 1); init_waitqueue_head(&id_priv->wait_remove); @@ -474,61 +478,86 @@ EXPORT_SYMBOL(rdma_create_qp); void rdma_destroy_qp(struct rdma_cm_id *id) { - ib_destroy_qp(id->qp); + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + mutex_lock(&id_priv->qp_mutex); + ib_destroy_qp(id_priv->id.qp); + id_priv->id.qp = NULL; + mutex_unlock(&id_priv->qp_mutex); } EXPORT_SYMBOL(rdma_destroy_qp); -static int cma_modify_qp_rtr(struct rdma_cm_id *id) +static int cma_modify_qp_rtr(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; - if (!id->qp) - return 0; + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } /* Need to update QP attributes from default values. */ qp_attr.qp_state = IB_QPS_INIT; - ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) - return ret; + goto out; - ret = ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); if (ret) - return ret; + goto out; qp_attr.qp_state = IB_QPS_RTR; - ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) - return ret; + goto out; - return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; } -static int cma_modify_qp_rts(struct rdma_cm_id *id) +static int cma_modify_qp_rts(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; int qp_attr_mask, ret; - if (!id->qp) - return 0; + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } qp_attr.qp_state = IB_QPS_RTS; - ret = rdma_init_qp_attr(id, &qp_attr, &qp_attr_mask); + ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask); if (ret) - return ret; + goto out; - return ib_modify_qp(id->qp, &qp_attr, qp_attr_mask); + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; } -static int cma_modify_qp_err(struct rdma_cm_id *id) +static int cma_modify_qp_err(struct rdma_id_private *id_priv) { struct ib_qp_attr qp_attr; + int ret; - if (!id->qp) - return 0; + mutex_lock(&id_priv->qp_mutex); + if (!id_priv->id.qp) { + ret = 0; + goto out; + } qp_attr.qp_state = IB_QPS_ERR; - return ib_modify_qp(id->qp, &qp_attr, IB_QP_STATE); + ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE); +out: + mutex_unlock(&id_priv->qp_mutex); + return ret; } static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv, @@ -717,50 +746,27 @@ static void cma_cancel_route(struct rdma_id_private *id_priv) } } -static inline int cma_internal_listen(struct rdma_id_private *id_priv) -{ - return (id_priv->state == CMA_LISTEN) && id_priv->cma_dev && - cma_any_addr(&id_priv->id.route.addr.src_addr); -} - -static void cma_destroy_listen(struct rdma_id_private *id_priv) -{ - cma_exch(id_priv, CMA_DESTROYING); - - if (id_priv->cma_dev) { - switch (rdma_node_get_transport(id_priv->id.device->node_type)) { - case RDMA_TRANSPORT_IB: - if (id_priv->cm_id.ib && !IS_ERR(id_priv->cm_id.ib)) - ib_destroy_cm_id(id_priv->cm_id.ib); - break; - case RDMA_TRANSPORT_IWARP: - if (id_priv->cm_id.iw && !IS_ERR(id_priv->cm_id.iw)) - iw_destroy_cm_id(id_priv->cm_id.iw); - break; - default: - break; - } - cma_detach_from_dev(id_priv); - } - list_del(&id_priv->listen_list); - - cma_deref_id(id_priv); - wait_for_completion(&id_priv->comp); - - kfree(id_priv); -} - static void cma_cancel_listens(struct rdma_id_private *id_priv) { struct rdma_id_private *dev_id_priv; + /* + * Remove from listen_any_list to prevent added devices from spawning + * additional listen requests. + */ mutex_lock(&lock); list_del(&id_priv->list); while (!list_empty(&id_priv->listen_list)) { dev_id_priv = list_entry(id_priv->listen_list.next, struct rdma_id_private, listen_list); - cma_destroy_listen(dev_id_priv); + /* sync with device removal to avoid duplicate destruction */ + list_del_init(&dev_id_priv->list); + list_del(&dev_id_priv->listen_list); + mutex_unlock(&lock); + + rdma_destroy_id(&dev_id_priv->id); + mutex_lock(&lock); } mutex_unlock(&lock); } @@ -848,6 +854,9 @@ void rdma_destroy_id(struct rdma_cm_id *id) cma_deref_id(id_priv); wait_for_completion(&id_priv->comp); + if (id_priv->internal_id) + cma_deref_id(id_priv->id.context); + kfree(id_priv->id.route.path_rec); kfree(id_priv); } @@ -857,11 +866,11 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) { int ret; - ret = cma_modify_qp_rtr(&id_priv->id); + ret = cma_modify_qp_rtr(id_priv); if (ret) goto reject; - ret = cma_modify_qp_rts(&id_priv->id); + ret = cma_modify_qp_rts(id_priv); if (ret) goto reject; @@ -871,7 +880,7 @@ static int cma_rep_recv(struct rdma_id_private *id_priv) return 0; reject: - cma_modify_qp_err(&id_priv->id); + cma_modify_qp_err(id_priv); ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, NULL, 0); return ret; @@ -947,7 +956,7 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) /* ignore event */ goto out; case IB_CM_REJ_RECEIVED: - cma_modify_qp_err(&id_priv->id); + cma_modify_qp_err(id_priv); event.status = ib_event->param.rej_rcvd.reason; event.event = RDMA_CM_EVENT_REJECTED; event.param.conn.private_data = ib_event->private_data; @@ -1404,14 +1413,13 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); + atomic_inc(&id_priv->refcount); + dev_id_priv->internal_id = 1; ret = rdma_listen(id, id_priv->backlog); if (ret) - goto err; - - return; -err: - cma_destroy_listen(dev_id_priv); + printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, " + "listening on device %s", ret, cma_dev->device->name); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -2264,7 +2272,7 @@ static int cma_connect_iw(struct rdma_id_private *id_priv, sin = (struct sockaddr_in*) &id_priv->id.route.addr.dst_addr; cm_id->remote_addr = *sin; - ret = cma_modify_qp_rtr(&id_priv->id); + ret = cma_modify_qp_rtr(id_priv); if (ret) goto out; @@ -2331,7 +2339,7 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, int qp_attr_mask, ret; if (id_priv->id.qp) { - ret = cma_modify_qp_rtr(&id_priv->id); + ret = cma_modify_qp_rtr(id_priv); if (ret) goto out; @@ -2370,7 +2378,7 @@ static int cma_accept_iw(struct rdma_id_private *id_priv, struct iw_cm_conn_param iw_param; int ret; - ret = cma_modify_qp_rtr(&id_priv->id); + ret = cma_modify_qp_rtr(id_priv); if (ret) return ret; @@ -2442,7 +2450,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) return 0; reject: - cma_modify_qp_err(id); + cma_modify_qp_err(id_priv); rdma_reject(id, NULL, 0); return ret; } @@ -2512,7 +2520,7 @@ int rdma_disconnect(struct rdma_cm_id *id) switch (rdma_node_get_transport(id->device->node_type)) { case RDMA_TRANSPORT_IB: - ret = cma_modify_qp_err(id); + ret = cma_modify_qp_err(id_priv); if (ret) goto out; /* Initiate or respond to a disconnect. */ @@ -2543,9 +2551,11 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) cma_disable_remove(id_priv, CMA_ADDR_RESOLVED)) return 0; + mutex_lock(&id_priv->qp_mutex); if (!status && id_priv->id.qp) status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid, multicast->rec.mlid); + mutex_unlock(&id_priv->qp_mutex); memset(&event, 0, sizeof event); event.status = status; @@ -2757,16 +2767,12 @@ static void cma_process_remove(struct cma_device *cma_dev) id_priv = list_entry(cma_dev->id_list.next, struct rdma_id_private, list); - if (cma_internal_listen(id_priv)) { - cma_destroy_listen(id_priv); - continue; - } - + list_del(&id_priv->listen_list); list_del_init(&id_priv->list); atomic_inc(&id_priv->refcount); mutex_unlock(&lock); - ret = cma_remove_id_dev(id_priv); + ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv); cma_deref_id(id_priv); if (ret) rdma_destroy_id(&id_priv->id); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 01d7008..495c803 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -147,8 +147,12 @@ static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id, spin_lock(&ib_uverbs_idr_lock); uobj = idr_find(idr, id); - if (uobj) - kref_get(&uobj->ref); + if (uobj) { + if (uobj->context == context) + kref_get(&uobj->ref); + else + uobj = NULL; + } spin_unlock(&ib_uverbs_idr_lock); return uobj; diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h index 3f2d68c..2d660ae 100644 --- a/drivers/infiniband/hw/ehca/ehca_classes.h +++ b/drivers/infiniband/hw/ehca/ehca_classes.h @@ -323,7 +323,6 @@ extern int ehca_static_rate; extern int ehca_port_act_time; extern int ehca_use_hp_mr; extern int ehca_scaling_code; -extern int ehca_mr_largepage; struct ipzu_queue_resp { u32 qe_size; /* queue entry size */ diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c index 4aa3ffa..15806d1 100644 --- a/drivers/infiniband/hw/ehca/ehca_hca.c +++ b/drivers/infiniband/hw/ehca/ehca_hca.c @@ -77,6 +77,7 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props) } memset(props, 0, sizeof(struct ib_device_attr)); + props->page_size_cap = shca->hca_cap_mr_pgsize; props->fw_ver = rblock->hw_ver; props->max_mr_size = rblock->max_mr_size; props->vendor_id = rblock->vendor_id >> 8; diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c index 7a7dab8..c6cd38c 100644 --- a/drivers/infiniband/hw/ehca/ehca_main.c +++ b/drivers/infiniband/hw/ehca/ehca_main.c @@ -65,7 +65,7 @@ int ehca_port_act_time = 30; int ehca_poll_all_eqs = 1; int ehca_static_rate = -1; int ehca_scaling_code = 0; -int ehca_mr_largepage = 0; +int ehca_mr_largepage = 1; module_param_named(open_aqp1, ehca_open_aqp1, int, S_IRUGO); module_param_named(debug_level, ehca_debug_level, int, S_IRUGO); @@ -260,13 +260,20 @@ static struct cap_descr { { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" }, }; -int ehca_sense_attributes(struct ehca_shca *shca) +static int ehca_sense_attributes(struct ehca_shca *shca) { int i, ret = 0; u64 h_ret; struct hipz_query_hca *rblock; struct hipz_query_port *port; + static const u32 pgsize_map[] = { + HCA_CAP_MR_PGSIZE_4K, 0x1000, + HCA_CAP_MR_PGSIZE_64K, 0x10000, + HCA_CAP_MR_PGSIZE_1M, 0x100000, + HCA_CAP_MR_PGSIZE_16M, 0x1000000, + }; + rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); if (!rblock) { ehca_gen_err("Cannot allocate rblock memory."); @@ -329,8 +336,15 @@ int ehca_sense_attributes(struct ehca_shca *shca) if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap)) ehca_gen_dbg(" %s", hca_cap_descr[i].descr); - shca->hca_cap_mr_pgsize = rblock->memory_page_size_supported; + /* translate supported MR page sizes; always support 4K */ + shca->hca_cap_mr_pgsize = EHCA_PAGESIZE; + if (ehca_mr_largepage) { /* support extra sizes only if enabled */ + for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2) + if (rblock->memory_page_size_supported & pgsize_map[i]) + shca->hca_cap_mr_pgsize |= pgsize_map[i + 1]; + } + /* query max MTU from first port -- it's the same for all ports */ port = (struct hipz_query_port *)rblock; h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port); if (h_ret != H_SUCCESS) { diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c index da88738..e239bbf 100644 --- a/drivers/infiniband/hw/ehca/ehca_mrmw.c +++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c @@ -72,24 +72,14 @@ enum ehca_mr_pgsize { static u32 ehca_encode_hwpage_size(u32 pgsize) { - u32 idx = 0; - pgsize >>= 12; - /* - * map mr page size into hw code: - * 0, 1, 2, 3 for 4K, 64K, 1M, 64M - */ - while (!(pgsize & 1)) { - idx++; - pgsize >>= 4; - } - return idx; + int log = ilog2(pgsize); + WARN_ON(log < 12 || log > 24 || log & 3); + return (log - 12) / 4; } static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca) { - if (shca->hca_cap_mr_pgsize & HCA_CAP_MR_PGSIZE_16M) - return EHCA_MR_PGSIZE16M; - return EHCA_MR_PGSIZE4K; + return 1UL << ilog2(shca->hca_cap_mr_pgsize); } static struct ehca_mr *ehca_mr_new(void) @@ -259,7 +249,7 @@ struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, pginfo.u.phy.num_phys_buf = num_phys_buf; pginfo.u.phy.phys_buf_array = phys_buf_array; pginfo.next_hwpage = - ((u64)iova_start & ~(hw_pgsize - 1)) / hw_pgsize; + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags, e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, @@ -296,7 +286,7 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, container_of(pd->device, struct ehca_shca, ib_device); struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); struct ehca_mr_pginfo pginfo; - int ret; + int ret, page_shift; u32 num_kpages; u32 num_hwpages; u64 hwpage_size; @@ -351,19 +341,20 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, /* determine number of MR pages */ num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE); /* select proper hw_pgsize */ - if (ehca_mr_largepage && - (shca->hca_cap_mr_pgsize & HCA_CAP_MR_PGSIZE_16M)) { - int page_shift = PAGE_SHIFT; - if (e_mr->umem->hugetlb) { - /* determine page_shift, clamp between 4K and 16M */ - page_shift = (fls64(length - 1) + 3) & ~3; - page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K), - EHCA_MR_PGSHIFT16M); - } - hwpage_size = 1UL << page_shift; - } else - hwpage_size = EHCA_MR_PGSIZE4K; /* ehca1 only supports 4k */ - ehca_dbg(pd->device, "hwpage_size=%lx", hwpage_size); + page_shift = PAGE_SHIFT; + if (e_mr->umem->hugetlb) { + /* determine page_shift, clamp between 4K and 16M */ + page_shift = (fls64(length - 1) + 3) & ~3; + page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K), + EHCA_MR_PGSHIFT16M); + } + hwpage_size = 1UL << page_shift; + + /* now that we have the desired page size, shift until it's + * supported, too. 4K is always supported, so this terminates. + */ + while (!(hwpage_size & shca->hca_cap_mr_pgsize)) + hwpage_size >>= 4; reg_user_mr_fallback: num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size); @@ -547,7 +538,7 @@ int ehca_rereg_phys_mr(struct ib_mr *mr, pginfo.u.phy.num_phys_buf = num_phys_buf; pginfo.u.phy.phys_buf_array = phys_buf_array; pginfo.next_hwpage = - ((u64)iova_start & ~(hw_pgsize - 1)) / hw_pgsize; + ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; } if (mr_rereg_mask & IB_MR_REREG_ACCESS) new_acl = mr_access_flags; @@ -809,8 +800,9 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, ib_fmr = ERR_PTR(-EINVAL); goto alloc_fmr_exit0; } - hw_pgsize = ehca_get_max_hwpage_size(shca); - if ((1 << fmr_attr->page_shift) != hw_pgsize) { + + hw_pgsize = 1 << fmr_attr->page_shift; + if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) { ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x", fmr_attr->page_shift); ib_fmr = ERR_PTR(-EINVAL); @@ -826,6 +818,7 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, /* register MR on HCA */ memset(&pginfo, 0, sizeof(pginfo)); + pginfo.hwpage_size = hw_pgsize; /* * pginfo.num_hwpages==0, ie register_rpages() will not be called * but deferred to map_phys_fmr() @@ -1776,7 +1769,7 @@ static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo, list_for_each_entry_continue( chunk, (&(pginfo->u.usr.region->chunk_list)), list) { for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { - pgaddr = page_to_pfn(chunk->page_list[i].page) + pgaddr = page_to_pfn(sg_page(&chunk->page_list[i])) << PAGE_SHIFT ; *kpage = phys_to_abs(pgaddr + (pginfo->next_hwpage * @@ -1832,7 +1825,7 @@ static int ehca_check_kpages_per_ate(struct scatterlist *page_list, { int t; for (t = start_idx; t <= end_idx; t++) { - u64 pgaddr = page_to_pfn(page_list[t].page) << PAGE_SHIFT; + u64 pgaddr = page_to_pfn(sg_page(&page_list[t])) << PAGE_SHIFT; ehca_gen_dbg("chunk_page=%lx value=%016lx", pgaddr, *(u64 *)abs_to_virt(phys_to_abs(pgaddr))); if (pgaddr - PAGE_SIZE != *prev_pgaddr) { @@ -1867,7 +1860,7 @@ static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo, chunk, (&(pginfo->u.usr.region->chunk_list)), list) { for (i = pginfo->u.usr.next_nmap; i < chunk->nmap; ) { if (nr_kpages == kpages_per_hwpage) { - pgaddr = ( page_to_pfn(chunk->page_list[i].page) + pgaddr = ( page_to_pfn(sg_page(&chunk->page_list[i])) << PAGE_SHIFT ); *kpage = phys_to_abs(pgaddr); if ( !(*kpage) ) { diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c index e2bd62b..de18264 100644 --- a/drivers/infiniband/hw/ehca/ehca_qp.c +++ b/drivers/infiniband/hw/ehca/ehca_qp.c @@ -451,7 +451,6 @@ static struct ehca_qp *internal_create_qp( has_srq = 1; parms.ext_type = EQPT_SRQBASE; parms.srq_qpn = my_srq->real_qp_num; - parms.srq_token = my_srq->token; } if (is_llqp && has_srq) { @@ -583,6 +582,9 @@ static struct ehca_qp *internal_create_qp( goto create_qp_exit1; } + if (has_srq) + parms.srq_token = my_qp->token; + parms.servicetype = ibqptype2servicetype(qp_type); if (parms.servicetype < 0) { ret = -EINVAL; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 31a480e..6b33224 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -63,6 +63,10 @@ struct mlx4_ib_sqp { u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; }; +enum { + MLX4_IB_MIN_SQ_STRIDE = 6 +}; + static const __be32 mlx4_ib_opcode[] = { [IB_WR_SEND] = __constant_cpu_to_be32(MLX4_OPCODE_SEND), [IB_WR_SEND_WITH_IMM] = __constant_cpu_to_be32(MLX4_OPCODE_SEND_IMM), @@ -285,9 +289,17 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, return 0; } -static int set_user_sq_size(struct mlx4_ib_qp *qp, +static int set_user_sq_size(struct mlx4_ib_dev *dev, + struct mlx4_ib_qp *qp, struct mlx4_ib_create_qp *ucmd) { + /* Sanity check SQ size before proceeding */ + if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes || + ucmd->log_sq_stride > + ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) || + ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE) + return -EINVAL; + qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count; qp->sq.wqe_shift = ucmd->log_sq_stride; @@ -330,7 +342,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->sq_no_prefetch = ucmd.sq_no_prefetch; - err = set_user_sq_size(qp, &ucmd); + err = set_user_sq_size(dev, qp, &ucmd); if (err) goto err; diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index be6e1e0..6bd9f13 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -204,16 +204,11 @@ static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr) static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq, int incr) { - __be32 doorbell[2]; - if (mthca_is_memfree(dev)) { *cq->set_ci_db = cpu_to_be32(cq->cons_index); wmb(); } else { - doorbell[0] = cpu_to_be32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn); - doorbell[1] = cpu_to_be32(incr - 1); - - mthca_write64(doorbell, + mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1, dev->kar + MTHCA_CQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); /* @@ -731,17 +726,12 @@ repoll: int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) { - __be32 doorbell[2]; + u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL : + MTHCA_TAVOR_CQ_DB_REQ_NOT) | + to_mcq(cq)->cqn; - doorbell[0] = cpu_to_be32(((flags & IB_CQ_SOLICITED_MASK) == - IB_CQ_SOLICITED ? - MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL : - MTHCA_TAVOR_CQ_DB_REQ_NOT) | - to_mcq(cq)->cqn); - doorbell[1] = (__force __be32) 0xffffffff; - - mthca_write64(doorbell, - to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL, + mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock)); return 0; @@ -750,19 +740,16 @@ int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags) int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mthca_cq *cq = to_mcq(ibcq); - __be32 doorbell[2]; - u32 sn; - __be32 ci; - - sn = cq->arm_sn & 3; - ci = cpu_to_be32(cq->cons_index); + __be32 db_rec[2]; + u32 dbhi; + u32 sn = cq->arm_sn & 3; - doorbell[0] = ci; - doorbell[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) | - ((flags & IB_CQ_SOLICITED_MASK) == - IB_CQ_SOLICITED ? 1 : 2)); + db_rec[0] = cpu_to_be32(cq->cons_index); + db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) | + ((flags & IB_CQ_SOLICITED_MASK) == + IB_CQ_SOLICITED ? 1 : 2)); - mthca_write_db_rec(doorbell, cq->arm_db); + mthca_write_db_rec(db_rec, cq->arm_db); /* * Make sure that the doorbell record in host memory is @@ -770,14 +757,12 @@ int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) */ wmb(); - doorbell[0] = cpu_to_be32((sn << 28) | - ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? - MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL : - MTHCA_ARBEL_CQ_DB_REQ_NOT) | - cq->cqn); - doorbell[1] = ci; + dbhi = (sn << 28) | + ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? + MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL : + MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn; - mthca_write64(doorbell, + mthca_write64(dbhi, cq->cons_index, to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock)); diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h index dd9a44d..b374dc3 100644 --- a/drivers/infiniband/hw/mthca/mthca_doorbell.h +++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h @@ -58,10 +58,10 @@ static inline void mthca_write64_raw(__be64 val, void __iomem *dest) __raw_writeq((__force u64) val, dest); } -static inline void mthca_write64(__be32 val[2], void __iomem *dest, +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, spinlock_t *doorbell_lock) { - __raw_writeq(*(u64 *) val, dest); + __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest); } static inline void mthca_write_db_rec(__be32 val[2], __be32 *db) @@ -87,14 +87,17 @@ static inline void mthca_write64_raw(__be64 val, void __iomem *dest) __raw_writel(((__force u32 *) &val)[1], dest + 4); } -static inline void mthca_write64(__be32 val[2], void __iomem *dest, +static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest, spinlock_t *doorbell_lock) { unsigned long flags; + hi = (__force u32) cpu_to_be32(hi); + lo = (__force u32) cpu_to_be32(lo); + spin_lock_irqsave(doorbell_lock, flags); - __raw_writel((__force u32) val[0], dest); - __raw_writel((__force u32) val[1], dest + 4); + __raw_writel(hi, dest); + __raw_writel(lo, dest + 4); spin_unlock_irqrestore(doorbell_lock, flags); } diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c index 8592b26..b29de51 100644 --- a/drivers/infiniband/hw/mthca/mthca_eq.c +++ b/drivers/infiniband/hw/mthca/mthca_eq.c @@ -173,11 +173,6 @@ static inline u64 async_mask(struct mthca_dev *dev) static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) { - __be32 doorbell[2]; - - doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_SET_CI | eq->eqn); - doorbell[1] = cpu_to_be32(ci & (eq->nent - 1)); - /* * This barrier makes sure that all updates to ownership bits * done by set_eqe_hw() hit memory before the consumer index @@ -187,7 +182,7 @@ static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u * having set_eqe_hw() overwrite the owner field. */ wmb(); - mthca_write64(doorbell, + mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1), dev->kar + MTHCA_EQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } @@ -212,12 +207,7 @@ static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci) static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn) { - __be32 doorbell[2]; - - doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_REQ_NOT | eqn); - doorbell[1] = 0; - - mthca_write64(doorbell, + mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0, dev->kar + MTHCA_EQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } @@ -230,12 +220,7 @@ static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask) static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn) { if (!mthca_is_memfree(dev)) { - __be32 doorbell[2]; - - doorbell[0] = cpu_to_be32(MTHCA_EQ_DB_DISARM_CQ | eqn); - doorbell[1] = cpu_to_be32(cqn); - - mthca_write64(doorbell, + mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn, dev->kar + MTHCA_EQ_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index df01b20..0e5461c 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1799,15 +1799,11 @@ int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, out: if (likely(nreq)) { - __be32 doorbell[2]; - - doorbell[0] = cpu_to_be32(((qp->sq.next_ind << qp->sq.wqe_shift) + - qp->send_wqe_offset) | f0 | op0); - doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0); - wmb(); - mthca_write64(doorbell, + mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) + + qp->send_wqe_offset) | f0 | op0, + (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); /* @@ -1829,7 +1825,6 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); - __be32 doorbell[2]; unsigned long flags; int err = 0; int nreq; @@ -1907,13 +1902,10 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { nreq = 0; - doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0); - doorbell[1] = cpu_to_be32(qp->qpn << 8); - wmb(); - mthca_write64(doorbell, - dev->kar + MTHCA_RECEIVE_DOORBELL, + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); qp->rq.next_ind = ind; @@ -1923,13 +1915,10 @@ int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, out: if (likely(nreq)) { - doorbell[0] = cpu_to_be32((qp->rq.next_ind << qp->rq.wqe_shift) | size0); - doorbell[1] = cpu_to_be32((qp->qpn << 8) | nreq); - wmb(); - mthca_write64(doorbell, - dev->kar + MTHCA_RECEIVE_DOORBELL, + mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0, + qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } @@ -1951,7 +1940,7 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); - __be32 doorbell[2]; + u32 dbhi; void *wqe; void *prev_wqe; unsigned long flags; @@ -1981,10 +1970,8 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) { nreq = 0; - doorbell[0] = cpu_to_be32((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | - ((qp->sq.head & 0xffff) << 8) | - f0 | op0); - doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0); + dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) | + ((qp->sq.head & 0xffff) << 8) | f0 | op0; qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB; @@ -2000,7 +1987,8 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * write MMIO send doorbell. */ wmb(); - mthca_write64(doorbell, + + mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } @@ -2154,10 +2142,7 @@ int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, out: if (likely(nreq)) { - doorbell[0] = cpu_to_be32((nreq << 24) | - ((qp->sq.head & 0xffff) << 8) | - f0 | op0); - doorbell[1] = cpu_to_be32((qp->qpn << 8) | size0); + dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0; qp->sq.head += nreq; @@ -2173,8 +2158,8 @@ out: * write MMIO send doorbell. */ wmb(); - mthca_write64(doorbell, - dev->kar + MTHCA_SEND_DOORBELL, + + mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index 3f58c11..553d681 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -491,7 +491,6 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, { struct mthca_dev *dev = to_mdev(ibsrq->device); struct mthca_srq *srq = to_msrq(ibsrq); - __be32 doorbell[2]; unsigned long flags; int err = 0; int first_ind; @@ -563,16 +562,13 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) { nreq = 0; - doorbell[0] = cpu_to_be32(first_ind << srq->wqe_shift); - doorbell[1] = cpu_to_be32(srq->srqn << 8); - /* * Make sure that descriptors are written * before doorbell is rung. */ wmb(); - mthca_write64(doorbell, + mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); @@ -581,16 +577,13 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, } if (likely(nreq)) { - doorbell[0] = cpu_to_be32(first_ind << srq->wqe_shift); - doorbell[1] = cpu_to_be32((srq->srqn << 8) | nreq); - /* * Make sure that descriptors are written before * doorbell is rung. */ wmb(); - mthca_write64(doorbell, + mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL, MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock)); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 1b3327a..eb7edab 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -84,9 +84,8 @@ enum { IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_MCAST_STARTED = 8, - IPOIB_FLAG_NETIF_STOPPED = 9, - IPOIB_FLAG_ADMIN_CM = 10, - IPOIB_FLAG_UMCAST = 11, + IPOIB_FLAG_ADMIN_CM = 9, + IPOIB_FLAG_UMCAST = 10, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -98,9 +97,9 @@ enum { #define IPOIB_OP_RECV (1ul << 31) #ifdef CONFIG_INFINIBAND_IPOIB_CM -#define IPOIB_CM_OP_SRQ (1ul << 30) +#define IPOIB_OP_CM (1ul << 30) #else -#define IPOIB_CM_OP_SRQ (0) +#define IPOIB_OP_CM (0) #endif /* structs */ @@ -197,7 +196,6 @@ struct ipoib_cm_rx { struct ipoib_cm_tx { struct ib_cm_id *id; - struct ib_cq *cq; struct ib_qp *qp; struct list_head list; struct net_device *dev; @@ -294,6 +292,7 @@ struct ipoib_dev_priv { unsigned tx_tail; struct ib_sge tx_sge; struct ib_send_wr tx_wr; + unsigned tx_outstanding; struct ib_wc ibwc[IPOIB_NUM_WC]; @@ -504,6 +503,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx); void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb, unsigned int mtu); void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc); +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc); #else struct ipoib_cm_tx; @@ -592,6 +592,9 @@ static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *w { } +static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) +{ +} #endif #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 0a0dcb8..8761077 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct net_device *dev, int id) struct ib_recv_wr *bad_wr; int i, ret; - priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ; + priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; for (i = 0; i < IPOIB_CM_RX_SG; ++i) priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; @@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ; + unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV); struct sk_buff *skb, *newskb; struct ipoib_cm_rx *p; unsigned long flags; @@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) wr_id, wc->status); if (unlikely(wr_id >= ipoib_recvq_size)) { - if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) { + if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) { spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); @@ -434,7 +434,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) goto repost; } - if (!likely(wr_id & IPOIB_CM_RX_UPDATE_MASK)) { + if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) { p = wc->qp->qp_context; if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) { spin_lock_irqsave(&priv->lock, flags); @@ -498,7 +498,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, priv->tx_sge.addr = addr; priv->tx_sge.length = len; - priv->tx_wr.wr_id = wr_id; + priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM; return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr); } @@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ dev->trans_start = jiffies; ++tx->tx_head; - if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) { + if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n", tx->qp->qp_num); netif_stop_queue(dev); - set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); } } } -static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx, - struct ib_wc *wc) +void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); - unsigned int wr_id = wc->wr_id; + struct ipoib_cm_tx *tx = wc->qp->qp_context; + unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM; struct ipoib_tx_buf *tx_req; unsigned long flags; @@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx spin_lock_irqsave(&priv->tx_lock, flags); ++tx->tx_tail; - if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) && - tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) { - clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) netif_wake_queue(dev); - } if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR) { @@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx tx->neigh = NULL; } - /* queue would be re-started anyway when TX is destroyed, - * but it makes sense to do it ASAP here. */ - if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) - netif_wake_queue(dev); - if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); queue_work(ipoib_workqueue, &priv->cm.reap_task); @@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx spin_unlock_irqrestore(&priv->tx_lock, flags); } -static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr) -{ - struct ipoib_cm_tx *tx = tx_ptr; - int n, i; - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - do { - n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc); - for (i = 0; i < n; ++i) - ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i); - } while (n == IPOIB_NUM_WC); -} - int ipoib_cm_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even return 0; } -static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq) +static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_init_attr attr = { - .send_cq = cq, + .send_cq = priv->cq, .recv_cq = priv->cq, .srq = priv->cm.srq, .cap.max_send_wr = ipoib_sendq_size, .cap.max_send_sge = 1, .sq_sig_type = IB_SIGNAL_ALL_WR, .qp_type = IB_QPT_RC, + .qp_context = tx }; return ib_create_qp(priv->pd, &attr); @@ -899,21 +880,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, goto err_tx; } - p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p, - ipoib_sendq_size + 1, 0); - if (IS_ERR(p->cq)) { - ret = PTR_ERR(p->cq); - ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret); - goto err_cq; - } - - ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP); - if (ret) { - ipoib_warn(priv, "failed to request completion notification: %d\n", ret); - goto err_req_notify; - } - - p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq); + p->qp = ipoib_cm_create_tx_qp(p->dev, p); if (IS_ERR(p->qp)) { ret = PTR_ERR(p->qp); ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret); @@ -950,12 +917,8 @@ err_modify: err_id: p->id = NULL; ib_destroy_qp(p->qp); -err_req_notify: err_qp: p->qp = NULL; - ib_destroy_cq(p->cq); -err_cq: - p->cq = NULL; err_tx: return ret; } @@ -964,6 +927,8 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) { struct ipoib_dev_priv *priv = netdev_priv(p->dev); struct ipoib_tx_buf *tx_req; + unsigned long flags; + unsigned long begin; ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n", p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail); @@ -971,27 +936,40 @@ static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p) if (p->id) ib_destroy_cm_id(p->id); - if (p->qp) - ib_destroy_qp(p->qp); - - if (p->cq) - ib_destroy_cq(p->cq); - - if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags)) - netif_wake_queue(p->dev); - if (p->tx_ring) { + /* Wait for all sends to complete */ + begin = jiffies; while ((int) p->tx_tail - (int) p->tx_head < 0) { - tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; - ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_req->skb); - ++p->tx_tail; + if (time_after(jiffies, begin + 5 * HZ)) { + ipoib_warn(priv, "timing out; %d sends not completed\n", + p->tx_head - p->tx_tail); + goto timeout; + } + + msleep(1); } + } - kfree(p->tx_ring); +timeout: + + while ((int) p->tx_tail - (int) p->tx_head < 0) { + tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)]; + ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, + DMA_TO_DEVICE); + dev_kfree_skb_any(tx_req->skb); + ++p->tx_tail; + spin_lock_irqsave(&priv->tx_lock, flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(p->dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) + netif_wake_queue(p->dev); + spin_unlock_irqrestore(&priv->tx_lock, flags); } + if (p->qp) + ib_destroy_qp(p->qp); + + kfree(p->tx_ring); kfree(p); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 1a77e79..5063dd5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->tx_lock, flags); ++priv->tx_tail; - if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) && - priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) { - clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); + if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) && + netif_queue_stopped(dev) && + test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) netif_wake_queue(dev); - } spin_unlock_irqrestore(&priv->tx_lock, flags); if (wc->status != IB_WC_SUCCESS && @@ -301,14 +300,18 @@ poll_more: for (i = 0; i < n; i++) { struct ib_wc *wc = priv->ibwc + i; - if (wc->wr_id & IPOIB_CM_OP_SRQ) { - ++done; - ipoib_cm_handle_rx_wc(dev, wc); - } else if (wc->wr_id & IPOIB_OP_RECV) { + if (wc->wr_id & IPOIB_OP_RECV) { ++done; - ipoib_ib_handle_rx_wc(dev, wc); - } else - ipoib_ib_handle_tx_wc(dev, wc); + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, wc); + else + ipoib_ib_handle_rx_wc(dev, wc); + } else { + if (wc->wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(dev, wc); + else + ipoib_ib_handle_tx_wc(dev, wc); + } } if (n != t) @@ -401,10 +404,9 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, address->last_send = priv->tx_head; ++priv->tx_head; - if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { + if (++priv->tx_outstanding == ipoib_sendq_size) { ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); netif_stop_queue(dev); - set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); } } } @@ -436,7 +438,8 @@ void ipoib_reap_ah(struct work_struct *work) __ipoib_reap_ah(dev); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); } int ipoib_ib_dev_open(struct net_device *dev) @@ -472,7 +475,8 @@ int ipoib_ib_dev_open(struct net_device *dev) } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); + queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + round_jiffies_relative(HZ)); set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); @@ -561,12 +565,17 @@ void ipoib_drain_cq(struct net_device *dev) if (priv->ibwc[i].status == IB_WC_SUCCESS) priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR; - if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ) - ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); - else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) - ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); - else - ipoib_ib_handle_tx_wc(dev, priv->ibwc + i); + if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_rx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_rx_wc(dev, priv->ibwc + i); + } else { + if (priv->ibwc[i].wr_id & IPOIB_OP_CM) + ipoib_cm_handle_tx_wc(dev, priv->ibwc + i); + else + ipoib_ib_handle_tx_wc(dev, priv->ibwc + i); + } } } while (n == IPOIB_NUM_WC); } @@ -612,6 +621,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush) DMA_TO_DEVICE); dev_kfree_skb_any(tx_req->skb); ++priv->tx_tail; + --priv->tx_outstanding; } for (i = 0; i < ipoib_recvq_size; ++i) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 362610d..a03a65e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -148,8 +148,6 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags); - /* * Now flush workqueue to make sure a scheduled task doesn't * bring our internal state back up. @@ -902,7 +900,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) goto out_rx_ring_cleanup; } - /* priv->tx_head & tx_tail are already 0 */ + /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; diff --git a/drivers/kvm/Kconfig b/drivers/kvm/Kconfig index 8749fa4..6569206 100644 --- a/drivers/kvm/Kconfig +++ b/drivers/kvm/Kconfig @@ -47,4 +47,8 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. +# OK, it's a little counter-intuitive to do this, but it puts it neatly under +# the virtualization menu. +source drivers/lguest/Kconfig + endif # VIRTUALIZATION diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 41e2250..7eb9ecf 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -1,7 +1,6 @@ config LGUEST tristate "Linux hypervisor example code" - depends on X86 && PARAVIRT && EXPERIMENTAL && !X86_PAE && FUTEX - select LGUEST_GUEST + depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX && !(X86_VISWS || X86_VOYAGER) select HVC_DRIVER ---help--- This is a very simple module which allows you to run @@ -18,13 +17,3 @@ config LGUEST_GUEST The guest needs code built-in, even if the host has lguest support as a module. The drivers are tiny, so we build them in too. - -config LGUEST_NET - tristate - default y - depends on LGUEST_GUEST && NET - -config LGUEST_BLOCK - tristate - default y - depends on LGUEST_GUEST && BLOCK diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index e504747..5e8272d 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -1,10 +1,12 @@ -# Guest requires the paravirt_ops replacement and the bus driver. -obj-$(CONFIG_LGUEST_GUEST) += lguest.o lguest_asm.o lguest_bus.o +# Guest requires the device configuration and probing code. +obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o # Host requires the other files, which can be a module. obj-$(CONFIG_LGUEST) += lg.o -lg-y := core.o hypercalls.o page_tables.o interrupts_and_traps.o \ - segments.o io.o lguest_user.o switcher.o +lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ + segments.o lguest_user.o + +lg-$(CONFIG_X86_32) += x86/switcher_32.o x86/core.o Preparation Preparation!: PREFIX=P Guest: PREFIX=G diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a0788c1..35d19ae 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -11,58 +11,20 @@ #include <linux/vmalloc.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <linux/highmem.h> #include <asm/paravirt.h> -#include <asm/desc.h> #include <asm/pgtable.h> #include <asm/uaccess.h> #include <asm/poll.h> -#include <asm/highmem.h> #include <asm/asm-offsets.h> -#include <asm/i387.h> #include "lg.h" -/* Found in switcher.S */ -extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; -extern unsigned long default_idt_entries[]; - -/* Every guest maps the core switcher code. */ -#define SHARED_SWITCHER_PAGES \ - DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) -/* Pages for switcher itself, then two pages per cpu */ -#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) - -/* We map at -4M for ease of mapping into the guest (one PTE page). */ -#define SWITCHER_ADDR 0xFFC00000 static struct vm_struct *switcher_vma; static struct page **switcher_page; -static int cpu_had_pge; -static struct { - unsigned long offset; - unsigned short segment; -} lguest_entry; - /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); -static DEFINE_PER_CPU(struct lguest *, last_guest); - -/* FIXME: Make dynamic. */ -#define MAX_LGUEST_GUESTS 16 -struct lguest lguests[MAX_LGUEST_GUESTS]; - -/* Offset from where switcher.S was compiled to where we've copied it */ -static unsigned long switcher_offset(void) -{ - return SWITCHER_ADDR - (unsigned long)start_switcher_text; -} - -/* This cpu's struct lguest_pages. */ -static struct lguest_pages *lguest_pages(unsigned int cpu) -{ - return &(((struct lguest_pages *) - (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); -} /*H:010 We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the @@ -73,9 +35,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu) * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. We also set up the per-cpu parts of the - * Switcher here. - */ + * it's not a simple one-liner. */ static __init int map_switcher(void) { int i, err; @@ -132,90 +92,11 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from switcher.S). */ + /* Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); - /* Most of the switcher.S doesn't care that it's been moved; on Intel, - * jumps are relative, and it doesn't access any references to external - * code or data. - * - * The only exception is the interrupt handlers in switcher.S: their - * addresses are placed in a table (default_idt_entries), so we need to - * update the table with the new addresses. switcher_offset() is a - * convenience function which returns the distance between the builtin - * switcher code and the high-mapped copy we just made. */ - for (i = 0; i < IDT_ENTRIES; i++) - default_idt_entries[i] += switcher_offset(); - - /* - * Set up the Switcher's per-cpu areas. - * - * Each CPU gets two pages of its own within the high-mapped region - * (aka. "struct lguest_pages"). Much of this can be initialized now, - * but some depends on what Guest we are running (which is set up in - * copy_in_guest_info()). - */ - for_each_possible_cpu(i) { - /* lguest_pages() returns this CPU's two pages. */ - struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ - struct lguest_ro_state *state = &pages->state; - - /* The Global Descriptor Table: the Host has a different one - * for each CPU. We keep a descriptor for the GDT which says - * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ - state->host_gdt_desc.size = GDT_SIZE-1; - state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - - /* All CPUs on the Host use the same Interrupt Descriptor - * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ - store_idt(&state->host_idt_desc); - - /* The descriptors for the Guest's GDT and IDT can be filled - * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ - state->guest_idt_desc.size = sizeof(state->guest_idt)-1; - state->guest_idt_desc.address = (long)&state->guest_idt; - state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; - state->guest_gdt_desc.address = (long)&state->guest_gdt; - - /* We know where we want the stack to be when the Guest enters - * the switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ - state->guest_tss.esp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ - state->guest_tss.ss0 = LGUEST_DS; - - /* x86 can have a finegrained bitmap which indicates what I/O - * ports the process can use. We set it to the end of our - * structure, meaning "none". */ - state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ - setup_default_gdt_entries(state); - /* Most IDT entries are the same for all Guests, too.*/ - setup_default_idt_entries(state, default_idt_entries); - - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - } - - /* In the Switcher, we want the %cs segment register to use the - * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so - * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ - lguest_entry.offset = (long)switch_to_guest + switcher_offset(); - lguest_entry.segment = LGUEST_CS; - printk(KERN_INFO "lguest: mapped switcher at %p\n", switcher_vma->addr); /* And we succeeded... */ @@ -247,86 +128,12 @@ static void unmap_switcher(void) __free_pages(switcher_page[i], 0); } -/*H:130 Our Guest is usually so well behaved; it never tries to do things it - * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't - * quite complete, because it doesn't contain replacements for the Intel I/O - * instructions. As a result, the Guest sometimes fumbles across one during - * the boot process as it probes for various things which are usually attached - * to a PC. - * - * When the Guest uses one of these instructions, we get trap #13 (General - * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ -static int emulate_insn(struct lguest *lg) -{ - u8 insn; - unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ - unsigned long physaddr = guest_pa(lg, lg->regs->eip); - - /* The guest_pa() function only works for Guest kernel addresses, but - * that's all we're trying to do anyway. */ - if (lg->regs->eip < lg->page_offset) - return 0; - - /* Decoding x86 instructions is icky. */ - lgread(lg, &insn, physaddr, 1); - - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ - if (insn == 0x66) { - shift = 16; - /* The instruction is 1 byte so far, read the next byte. */ - insnlen = 1; - lgread(lg, &insn, physaddr + insnlen, 1); - } - - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ - switch (insn & 0xFE) { - case 0xE4: /* in <next byte>,%al */ - insnlen += 2; - in = 1; - break; - case 0xEC: /* in (%dx),%al */ - insnlen += 1; - in = 1; - break; - case 0xE6: /* out %al,<next byte> */ - insnlen += 2; - break; - case 0xEE: /* out %al,(%dx) */ - insnlen += 1; - break; - default: - /* OK, we don't know what this is, can't emulate. */ - return 0; - } - - /* If it was an "IN" instruction, they expect the result to be read - * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ - if (in) { - /* Lower bit tells is whether it's a 16 or 32 bit access */ - if (insn & 0x1) - lg->regs->eax = 0xFFFFFFFF; - else - lg->regs->eax |= (0xFFFF << shift); - } - /* Finally, we've "done" the instruction, so move past it. */ - lg->regs->eip += insnlen; - /* Success! */ - return 1; -} -/*:*/ - /*L:305 * Dealing With Guest Memory. * * When the Guest gives us (what it thinks is) a physical address, we can use - * the normal copy_from_user() & copy_to_user() on that address: remember, - * Guest physical == Launcher virtual. + * the normal copy_from_user() & copy_to_user() on the corresponding place in + * the memory region allocated by the Launcher. * * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher @@ -338,148 +145,27 @@ int lguest_address_ok(const struct lguest *lg, return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This is a convenient routine to get a 32-bit value from the Guest (a very - * common operation). Here we can see how useful the kill_lguest() routine we - * met in the Launcher can be: we return a random value (0) instead of needing - * to return an error. */ -u32 lgread_u32(struct lguest *lg, unsigned long addr) -{ - u32 val = 0; - - /* Don't let them access lguest binary. */ - if (!lguest_address_ok(lg, addr, sizeof(val)) - || get_user(val, (u32 __user *)addr) != 0) - kill_guest(lg, "bad read address %#lx", addr); - return val; -} - -/* Same thing for writing a value. */ -void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val) -{ - if (!lguest_address_ok(lg, addr, sizeof(val)) - || put_user(val, (u32 __user *)addr) != 0) - kill_guest(lg, "bad write address %#lx", addr); -} - -/* This routine is more generic, and copies a range of Guest bytes into a - * buffer. If the copy_from_user() fails, we fill the buffer with zeroes, so - * the caller doesn't end up using uninitialized kernel memory. */ -void lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) +/* This routine copies memory from the Guest. Here we can see how useful the + * kill_lguest() routine we met in the Launcher can be: we return a random + * value (all zeroes) instead of needing to return an error. */ +void __lgread(struct lguest *lg, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(lg, addr, bytes) - || copy_from_user(b, (void __user *)addr, bytes) != 0) { + || copy_from_user(b, lg->mem_base + addr, bytes) != 0) { /* copy_from_user should do this, but as we rely on it... */ memset(b, 0, bytes); kill_guest(lg, "bad read address %#lx len %u", addr, bytes); } } -/* Similarly, our generic routine to copy into a range of Guest bytes. */ -void lgwrite(struct lguest *lg, unsigned long addr, const void *b, - unsigned bytes) +/* This is the write (copy into guest) version. */ +void __lgwrite(struct lguest *lg, unsigned long addr, const void *b, + unsigned bytes) { if (!lguest_address_ok(lg, addr, bytes) - || copy_to_user((void __user *)addr, b, bytes) != 0) + || copy_to_user(lg->mem_base + addr, b, bytes) != 0) kill_guest(lg, "bad write address %#lx len %u", addr, bytes); } -/* (end of memory access helper routines) :*/ - -static void set_ts(void) -{ - u32 cr0; - - cr0 = read_cr0(); - if (!(cr0 & 8)) - write_cr0(cr0|8); -} - -/*S:010 - * We are getting close to the Switcher. - * - * Remember that each CPU has two pages which are visible to the Guest when it - * runs on that CPU. This has to contain the state for that Guest: we copy the - * state in just before we run the Guest. - * - * Each Guest has "changed" flags which indicate what has changed in the Guest - * since it last ran. We saw this set in interrupts_and_traps.c and - * segments.c. - */ -static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) -{ - /* Copying all this data can be quite expensive. We usually run the - * same Guest we ran last time (and that Guest hasn't run anywhere else - * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ - if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { - __get_cpu_var(last_guest) = lg; - lg->last_pages = pages; - lg->changed = CHANGED_ALL; - } - - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ - pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ - map_switcher_in_guest(lg, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use - * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ - pages->state.guest_tss.esp1 = lg->esp1; - pages->state.guest_tss.ss1 = lg->ss1; - - /* Copy direct-to-Guest trap entries. */ - if (lg->changed & CHANGED_IDT) - copy_traps(lg, pages->state.guest_idt, default_idt_entries); - - /* Copy all GDT entries which the Guest can change. */ - if (lg->changed & CHANGED_GDT) - copy_gdt(lg, pages->state.guest_gdt); - /* If only the TLS entries have changed, copy them. */ - else if (lg->changed & CHANGED_GDT_TLS) - copy_gdt_tls(lg, pages->state.guest_gdt); - - /* Mark the Guest as unchanged for next time. */ - lg->changed = 0; -} - -/* Finally: the code to actually call into the Switcher to run the Guest. */ -static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) -{ - /* This is a dummy value we need for GCC's sake. */ - unsigned int clobber; - - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ - copy_in_guest_info(lg, pages); - - /* Set the trap number to 256 (impossible value). If we fault while - * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ - lg->regs->trapnum = 256; - - /* Now: we push the "eflags" register on the stack, then do an "lcall". - * This is how we change from using the kernel code segment to using - * the dedicated lguest code segment, as well as jumping into the - * Switcher. - * - * The lcall also pushes the old code segment (KERNEL_CS) onto the - * stack, then the address of this call. This stack layout happens to - * exactly match the stack of an interrupt... */ - asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ - : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the - * 0-th argument above, ie "a"). %ebx contains the - * physical address of the Guest's top-level page - * directory. */ - : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) - /* We tell gcc that all these registers could change, - * which means we don't have to save and restore them in - * the Switcher. */ - : "memory", "%edx", "%ecx", "%edi", "%esi"); -} /*:*/ /*H:030 Let's jump straight to the the main loop which runs the Guest. @@ -489,22 +175,16 @@ int run_guest(struct lguest *lg, unsigned long __user *user) { /* We stop running once the Guest is dead. */ while (!lg->dead) { - /* We need to initialize this, otherwise gcc complains. It's - * not (yet) clever enough to see that it's initialized when we - * need it. */ - unsigned int cr2 = 0; /* Damn gcc */ - - /* First we run any hypercalls the Guest wants done: either in - * the hypercall ring in "struct lguest_data", or directly by - * using int 31 (LGUEST_TRAP_ENTRY). */ - do_hypercalls(lg); - /* It's possible the Guest did a SEND_DMA hypercall to the + /* First we run any hypercalls the Guest wants done. */ + if (lg->hcall) + do_hypercalls(lg); + + /* It's possible the Guest did a NOTIFY hypercall to the * Launcher, in which case we return from the read() now. */ - if (lg->dma_is_pending) { - if (put_user(lg->pending_dma, user) || - put_user(lg->pending_key, user+1)) + if (lg->pending_notify) { + if (put_user(lg->pending_notify, user)) return -EFAULT; - return sizeof(unsigned long)*2; + return sizeof(lg->pending_notify); } /* Check for signals */ @@ -542,144 +222,20 @@ int run_guest(struct lguest *lg, unsigned long __user *user) * the "Do Not Disturb" sign: */ local_irq_disable(); - /* Remember the awfully-named TS bit? If the Guest has asked - * to set it we set it now, so we can trap and pass that trap - * to the Guest if it uses the FPU. */ - if (lg->ts) - set_ts(); - - /* SYSENTER is an optimized way of doing system calls. We - * can't allow it because it always jumps to privilege level 0. - * A normal Guest won't try it because we don't advertise it in - * CPUID, but a malicious Guest (or malicious Guest userspace - * program) could, so we tell the CPU to disable it before - * running the Guest. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - - /* Now we actually run the Guest. It will pop back out when - * something interesting happens, and we can examine its - * registers to see what it was doing. */ - run_guest_once(lg, lguest_pages(raw_smp_processor_id())); - - /* The "regs" pointer contains two extra entries which are not - * really registers: a trap number which says what interrupt or - * trap made the switcher code come back, and an error code - * which some traps set. */ - - /* If the Guest page faulted, then the cr2 register will tell - * us the bad virtual address. We have to grab this now, - * because once we re-enable interrupts an interrupt could - * fault and thus overwrite cr2, or we could even move off to a - * different CPU. */ - if (lg->regs->trapnum == 14) - cr2 = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, - * we have to restore the FPU it expects to see. */ - else if (lg->regs->trapnum == 7) - math_state_restore(); - - /* Restore SYSENTER if it's supposed to be on. */ - if (boot_cpu_has(X86_FEATURE_SEP)) - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + /* Actually run the Guest until something happens. */ + lguest_arch_run_guest(lg); /* Now we're ready to be interrupted or moved to other CPUs */ local_irq_enable(); - /* OK, so what happened? */ - switch (lg->regs->trapnum) { - case 13: /* We've intercepted a GPF. */ - /* Check if this was one of those annoying IN or OUT - * instructions which we need to emulate. If so, we - * just go back into the Guest after we've done it. */ - if (lg->regs->errcode == 0) { - if (emulate_insn(lg)) - continue; - } - break; - case 14: /* We've intercepted a page fault. */ - /* The Guest accessed a virtual address that wasn't - * mapped. This happens a lot: we don't actually set - * up most of the page tables for the Guest at all when - * we start: as it runs it asks for more and more, and - * we set them up as required. In this case, we don't - * even tell the Guest that the fault happened. - * - * The errcode tells whether this was a read or a - * write, and whether kernel or userspace code. */ - if (demand_page(lg, cr2, lg->regs->errcode)) - continue; - - /* OK, it's really not there (or not OK): the Guest - * needs to know. We write out the cr2 value so it - * knows where the fault occurred. - * - * Note that if the Guest were really messed up, this - * could happen before it's done the INITIALIZE - * hypercall, so lg->lguest_data will be NULL, so - * &lg->lguest_data->cr2 will be address 8. Writing - * into that address won't hurt the Host at all, - * though. */ - if (put_user(cr2, &lg->lguest_data->cr2)) - kill_guest(lg, "Writing cr2"); - break; - case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already - * restored the Floating Point Unit, so we just - * continue without telling it. */ - if (!lg->ts) - continue; - break; - case 32 ... 255: - /* These values mean a real interrupt occurred, in - * which case the Host handler has already been run. - * We just do a friendly check if another process - * should now be run, then fall through to loop - * around: */ - cond_resched(); - case LGUEST_TRAP_ENTRY: /* Handled at top of loop */ - continue; - } - - /* If we get here, it's a trap the Guest wants to know - * about. */ - if (deliver_trap(lg, lg->regs->trapnum)) - continue; - - /* If the Guest doesn't have a handler (either it hasn't - * registered any yet, or it's one of the faults we don't let - * it handle), it dies with a cryptic error message. */ - kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", - lg->regs->trapnum, lg->regs->eip, - lg->regs->trapnum == 14 ? cr2 : lg->regs->errcode); + /* Now we deal with whatever happened to the Guest. */ + lguest_arch_handle_trap(lg); } + /* The Guest is dead => "No such file or directory" */ return -ENOENT; } -/* Now we can look at each of the routines this calls, in increasing order of - * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), - * deliver_trap() and demand_page(). After all those, we'll be ready to - * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ - -int find_free_guest(void) -{ - unsigned int i; - for (i = 0; i < MAX_LGUEST_GUESTS; i++) - if (!lguests[i].tsk) - return i; - return -1; -} - -static void adjust_pge(void *on) -{ - if (on) - write_cr4(read_cr4() | X86_CR4_PGE); - else - write_cr4(read_cr4() & ~X86_CR4_PGE); -} - /*H:000 * Welcome to the Host! * @@ -701,72 +257,50 @@ static int __init init(void) /* First we put the Switcher up in very high virtual memory. */ err = map_switcher(); if (err) - return err; + goto out; /* Now we set up the pagetable implementation for the Guests. */ err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES); - if (err) { - unmap_switcher(); - return err; - } + if (err) + goto unmap; - /* The I/O subsystem needs some things initialized. */ - lguest_io_init(); + /* We might need to reserve an interrupt vector. */ + err = init_interrupts(); + if (err) + goto free_pgtables; /* /dev/lguest needs to be registered. */ err = lguest_device_init(); - if (err) { - free_pagetables(); - unmap_switcher(); - return err; - } + if (err) + goto free_interrupts; - /* Finally, we need to turn off "Page Global Enable". PGE is an - * optimization where page table entries are specially marked to show - * they never change. The Host kernel marks all the kernel pages this - * way because it's always present, even when userspace is running. - * - * Lguest breaks this: unbeknownst to the rest of the Host kernel, we - * switch to the Guest kernel. If you don't disable this on all CPUs, - * you'll get really weird bugs that you'll chase for two days. - * - * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ - - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ - lock_cpu_hotplug(); - if (cpu_has_pge) { /* We have a broader idea of "global". */ - /* Remember that this was originally set (for cleanup). */ - cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ - on_each_cpu(adjust_pge, (void *)0, 0, 1); - /* Turn off the feature in the global feature set. */ - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); - } - unlock_cpu_hotplug(); + /* Finally we do some architecture-specific setup. */ + lguest_arch_host_init(); /* All good! */ return 0; + +free_interrupts: + free_interrupts(); +free_pgtables: + free_pagetables(); +unmap: + unmap_switcher(); +out: + return err; } /* Cleaning up is just the same code, backwards. With a little French. */ static void __exit fini(void) { lguest_device_remove(); + free_interrupts(); free_pagetables(); unmap_switcher(); - /* If we had PGE before we started, turn it back on now. */ - lock_cpu_hotplug(); - if (cpu_had_pge) { - set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); - /* adjust_pge's argument "1" means set PGE. */ - on_each_cpu(adjust_pge, (void *)1, 0, 1); - } - unlock_cpu_hotplug(); + lguest_arch_host_fini(); } +/*:*/ /* The Host side of lguest can be a module. This is a nice way for people to * play with it. */ diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index db6caac..9d5184c 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -25,17 +25,13 @@ #include <linux/mm.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <irq_vectors.h> #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it - * wants. Or gets killed. Or, in the case of LHCALL_CRASH, both. - * - * Remember from the Guest: %eax == which call to make, and the arguments are - * packed into %edx, %ebx and %ecx if needed. */ -static void do_hcall(struct lguest *lg, struct lguest_regs *regs) +/*H:120 This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_CRASH, both. */ +static void do_hcall(struct lguest *lg, struct hcall_args *args) { - switch (regs->eax) { + switch (args->arg0) { case LHCALL_FLUSH_ASYNC: /* This call does nothing, except by breaking out of the Guest * it makes us process all the asynchronous hypercalls. */ @@ -51,7 +47,7 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs) char msg[128]; /* If the lgread fails, it will call kill_guest() itself; the * kill_guest() with the message will be ignored. */ - lgread(lg, msg, regs->edx, sizeof(msg)); + __lgread(lg, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(lg, "CRASH: %s", msg); break; @@ -59,67 +55,49 @@ static void do_hcall(struct lguest *lg, struct lguest_regs *regs) case LHCALL_FLUSH_TLB: /* FLUSH_TLB comes in two flavors, depending on the * argument: */ - if (regs->edx) + if (args->arg1) guest_pagetable_clear_all(lg); else guest_pagetable_flush_user(lg); break; - case LHCALL_BIND_DMA: - /* BIND_DMA really wants four arguments, but it's the only call - * which does. So the Guest packs the number of buffers and - * the interrupt number into the final argument, and we decode - * it here. This can legitimately fail, since we currently - * place a limit on the number of DMA pools a Guest can have. - * So we return true or false from this call. */ - regs->eax = bind_dma(lg, regs->edx, regs->ebx, - regs->ecx >> 8, regs->ecx & 0xFF); - break; /* All these calls simply pass the arguments through to the right * routines. */ - case LHCALL_SEND_DMA: - send_dma(lg, regs->edx, regs->ebx); - break; - case LHCALL_LOAD_GDT: - load_guest_gdt(lg, regs->edx, regs->ebx); - break; - case LHCALL_LOAD_IDT_ENTRY: - load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx); - break; case LHCALL_NEW_PGTABLE: - guest_new_pagetable(lg, regs->edx); + guest_new_pagetable(lg, args->arg1); break; case LHCALL_SET_STACK: - guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx); + guest_set_stack(lg, args->arg1, args->arg2, args->arg3); break; case LHCALL_SET_PTE: - guest_set_pte(lg, regs->edx, regs->ebx, mkgpte(regs->ecx)); + guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); break; case LHCALL_SET_PMD: - guest_set_pmd(lg, regs->edx, regs->ebx); - break; - case LHCALL_LOAD_TLS: - guest_load_tls(lg, regs->edx); + guest_set_pmd(lg, args->arg1, args->arg2); break; case LHCALL_SET_CLOCKEVENT: - guest_set_clockevent(lg, regs->edx); + guest_set_clockevent(lg, args->arg1); break; - case LHCALL_TS: /* This sets the TS flag, as we saw used in run_guest(). */ - lg->ts = regs->edx; + lg->ts = args->arg1; break; case LHCALL_HALT: /* Similarly, this sets the halted flag for run_guest(). */ lg->halted = 1; break; + case LHCALL_NOTIFY: + lg->pending_notify = args->arg1; + break; default: - kill_guest(lg, "Bad hypercall %li\n", regs->eax); + if (lguest_arch_do_hcall(lg, args)) + kill_guest(lg, "Bad hypercall %li\n", args->arg0); } } +/*:*/ -/* Asynchronous hypercalls are easy: we just look in the array in the Guest's - * "struct lguest_data" and see if there are any new ones marked "ready". +/*H:124 Asynchronous hypercalls are easy: we just look in the array in the + * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will @@ -134,10 +112,9 @@ static void do_async_hcalls(struct lguest *lg) if (copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st))) return; - /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { - struct lguest_regs regs; + struct hcall_args args; /* We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest * places them in the ring. */ @@ -152,18 +129,16 @@ static void do_async_hcalls(struct lguest *lg) if (++lg->next_hcall == LHCALL_RING_SIZE) lg->next_hcall = 0; - /* We copy the hypercall arguments into a fake register - * structure. This makes life simple for do_hcall(). */ - if (get_user(regs.eax, &lg->lguest_data->hcalls[n].eax) - || get_user(regs.edx, &lg->lguest_data->hcalls[n].edx) - || get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx) - || get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx)) { + /* Copy the hypercall arguments into a local copy of + * the hcall_args struct. */ + if (copy_from_user(&args, &lg->lguest_data->hcalls[n], + sizeof(struct hcall_args))) { kill_guest(lg, "Fetching async hypercalls"); break; } /* Do the hypercall, same as a normal one. */ - do_hcall(lg, ®s); + do_hcall(lg, &args); /* Mark the hypercall done. */ if (put_user(0xFF, &lg->lguest_data->hcall_status[n])) { @@ -171,9 +146,9 @@ static void do_async_hcalls(struct lguest *lg) break; } - /* Stop doing hypercalls if we've just done a DMA to the - * Launcher: it needs to service this first. */ - if (lg->dma_is_pending) + /* Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. */ + if (lg->pending_notify) break; } } @@ -182,76 +157,35 @@ static void do_async_hcalls(struct lguest *lg) * Guest makes a hypercall, we end up here to set things up: */ static void initialize(struct lguest *lg) { - u32 tsc_speed; /* You can't do anything until you're initialized. The Guest knows the * rules, so we're unforgiving here. */ - if (lg->regs->eax != LHCALL_LGUEST_INIT) { - kill_guest(lg, "hypercall %li before LGUEST_INIT", - lg->regs->eax); + if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { + kill_guest(lg, "hypercall %li before INIT", lg->hcall->arg0); return; } - /* We insist that the Time Stamp Counter exist and doesn't change with - * cpu frequency. Some devious chip manufacturers decided that TSC - * changes could be handled in software. I decided that time going - * backwards might be good for benchmarks, but it's bad for users. - * - * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ - if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) - tsc_speed = tsc_khz; - else - tsc_speed = 0; - - /* The pointer to the Guest's "struct lguest_data" is the only - * argument. */ - lg->lguest_data = (struct lguest_data __user *)lg->regs->edx; - /* If we check the address they gave is OK now, we can simply - * copy_to_user/from_user from now on rather than using lgread/lgwrite. - * I put this in to show that I'm not immune to writing stupid - * optimizations. */ - if (!lguest_address_ok(lg, lg->regs->edx, sizeof(*lg->lguest_data))) { + if (lguest_arch_init_hypercalls(lg)) kill_guest(lg, "bad guest page %p", lg->lguest_data); - return; - } + /* The Guest tells us where we're not to deliver interrupts by putting * the range of addresses into "struct lguest_data". */ if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start) - || get_user(lg->noirq_end, &lg->lguest_data->noirq_end) - /* We tell the Guest that it can't use the top 4MB of virtual - * addresses used by the Switcher. */ - || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) - || put_user(tsc_speed, &lg->lguest_data->tsc_khz) - /* We also give the Guest a unique id, as used in lguest_net.c. */ - || put_user(lg->guestid, &lg->lguest_data->guestid)) + || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) kill_guest(lg, "bad guest page %p", lg->lguest_data); /* We write the current time into the Guest's data page once now. */ write_timestamp(lg); + /* page_tables.c will also do some setup. */ + page_table_guest_data_init(lg); + /* This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the Guest might be referring to the old (read-only) * page. */ guest_pagetable_clear_all(lg); } -/* Now we've examined the hypercall code; our Guest can make requests. There - * is one other way we can do things for the Guest, as we see in - * emulate_insn(). */ - -/*H:110 Tricky point: we mark the hypercall as "done" once we've done it. - * Normally we don't need to do this: the Guest will run again and update the - * trap number before we come back around the run_guest() loop to - * do_hypercalls(). - * - * However, if we are signalled or the Guest sends DMA to the Launcher, that - * loop will exit without running the Guest. When it comes back it would try - * to re-run the hypercall. */ -static void clear_hcall(struct lguest *lg) -{ - lg->regs->trapnum = 255; -} /*H:100 * Hypercalls @@ -261,16 +195,12 @@ static void clear_hcall(struct lguest *lg) */ void do_hypercalls(struct lguest *lg) { - /* Not initialized yet? */ + /* Not initialized yet? This hypercall must do it. */ if (unlikely(!lg->lguest_data)) { - /* Did the Guest make a hypercall? We might have come back for - * some other reason (an interrupt, a different trap). */ - if (lg->regs->trapnum == LGUEST_TRAP_ENTRY) { - /* Set up the "struct lguest_data" */ - initialize(lg); - /* The hypercall is done. */ - clear_hcall(lg); - } + /* Set up the "struct lguest_data" */ + initialize(lg); + /* Hcall is done. */ + lg->hcall = NULL; return; } @@ -280,12 +210,21 @@ void do_hypercalls(struct lguest *lg) do_async_hcalls(lg); /* If we stopped reading the hypercall ring because the Guest did a - * SEND_DMA to the Launcher, we want to return now. Otherwise if the - * Guest asked us to do a hypercall, we do it. */ - if (!lg->dma_is_pending && lg->regs->trapnum == LGUEST_TRAP_ENTRY) { - do_hcall(lg, lg->regs); - /* The hypercall is done. */ - clear_hcall(lg); + * NOTIFY to the Launcher, we want to return now. Otherwise we do + * the hypercall. */ + if (!lg->pending_notify) { + do_hcall(lg, lg->hcall); + /* Tricky point: we reset the hcall pointer to mark the + * hypercall as "done". We use the hcall pointer rather than + * the trap number to indicate a hypercall is pending. + * Normally it doesn't matter: the Guest will run again and + * update the trap number before we come back here. + * + * However, if we are signalled or the Guest sends DMA to the + * Launcher, the run_guest() loop will exit without running the + * Guest. When it comes back it would try to re-run the + * hypercall. */ + lg->hcall = NULL; } } @@ -295,6 +234,6 @@ void write_timestamp(struct lguest *lg) { struct timespec now; ktime_get_real_ts(&now); - if (put_user(now, &lg->lguest_data->time)) + if (copy_to_user(&lg->lguest_data->time, &now, sizeof(struct timespec))) kill_guest(lg, "Writing timestamp"); } diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 3973123..8296698 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -12,8 +12,14 @@ * them first, so we also have a way of "reflecting" them into the Guest as if * they had been delivered to it directly. :*/ #include <linux/uaccess.h> +#include <linux/interrupt.h> +#include <linux/module.h> #include "lg.h" +/* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ +static unsigned int syscall_vector = SYSCALL_VECTOR; +module_param(syscall_vector, uint, 0444); + /* The address of the interrupt handler is split into two bits: */ static unsigned long idt_address(u32 lo, u32 hi) { @@ -39,7 +45,7 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ *gstack -= 4; - lgwrite_u32(lg, *gstack, val); + lgwrite(lg, *gstack, u32, val); } /*H:210 The set_guest_interrupt() routine actually delivers the interrupt or @@ -56,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val) * it). */ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) { - unsigned long gstack; + unsigned long gstack, origstack; u32 eflags, ss, irq_enable; + unsigned long virtstack; /* There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in @@ -65,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) if ((lg->regs->ss&0x3) != GUEST_PL) { /* The Guest told us their kernel stack with the SET_STACK * hypercall: both the virtual address and the segment */ - gstack = guest_pa(lg, lg->esp1); + virtstack = lg->esp1; ss = lg->ss1; + + origstack = gstack = guest_pa(lg, virtstack); /* We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege @@ -75,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) push_guest_stack(lg, &gstack, lg->regs->esp); } else { /* We're staying on the same Guest (kernel) stack. */ - gstack = guest_pa(lg, lg->regs->esp); + virtstack = lg->regs->esp; ss = lg->regs->ss; + + origstack = gstack = guest_pa(lg, virtstack); } /* Remember that we never let the Guest actually disable interrupts, so @@ -102,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err) /* Now we've pushed all the old state, we change the stack, the code * segment and the address to execute. */ lg->regs->ss = ss; - lg->regs->esp = gstack + lg->page_offset; + lg->regs->esp = virtstack + (gstack - origstack); lg->regs->cs = (__KERNEL_CS|GUEST_PL); lg->regs->eip = idt_address(lo, hi); @@ -165,7 +176,7 @@ void maybe_do_interrupt(struct lguest *lg) /* Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip * over them. */ - idt = &lg->idt[FIRST_EXTERNAL_VECTOR+irq]; + idt = &lg->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ @@ -183,6 +194,47 @@ void maybe_do_interrupt(struct lguest *lg) * timer interrupt. */ write_timestamp(lg); } +/*:*/ + +/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent + * me a patch, so we support that too. It'd be a big step for lguest if half + * the Plan 9 user base were to start using it. + * + * Actually now I think of it, it's possible that Ron *is* half the Plan 9 + * userbase. Oh well. */ +static bool could_be_syscall(unsigned int num) +{ + /* Normal Linux SYSCALL_VECTOR or reserved vector? */ + return num == SYSCALL_VECTOR || num == syscall_vector; +} + +/* The syscall vector it wants must be unused by Host. */ +bool check_syscall_vector(struct lguest *lg) +{ + u32 vector; + + if (get_user(vector, &lg->lguest_data->syscall_vec)) + return false; + + return could_be_syscall(vector); +} + +int init_interrupts(void) +{ + /* If they want some strange system call vector, reserve it now */ + if (syscall_vector != SYSCALL_VECTOR + && test_and_set_bit(syscall_vector, used_vectors)) { + printk("lg: couldn't reserve syscall %u\n", syscall_vector); + return -EBUSY; + } + return 0; +} + +void free_interrupts(void) +{ + if (syscall_vector != SYSCALL_VECTOR) + clear_bit(syscall_vector, used_vectors); +} /*H:220 Now we've got the routines to deliver interrupts, delivering traps * like page fault is easy. The only trick is that Intel decided that some @@ -197,14 +249,14 @@ int deliver_trap(struct lguest *lg, unsigned int num) { /* Trap numbers are always 8 bit, but we set an impossible trap number * for traps inside the Switcher, so check that here. */ - if (num >= ARRAY_SIZE(lg->idt)) + if (num >= ARRAY_SIZE(lg->arch.idt)) return 0; /* Early on the Guest hasn't set the IDT entries (or maybe it put a * bogus one in): if we fail here, the Guest will be killed. */ - if (!idt_present(lg->idt[num].a, lg->idt[num].b)) + if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) return 0; - set_guest_interrupt(lg, lg->idt[num].a, lg->idt[num].b, has_err(num)); + set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); return 1; } @@ -218,28 +270,20 @@ int deliver_trap(struct lguest *lg, unsigned int num) * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all * the other hypervisors would tease it. * - * This routine determines if a trap can be delivered directly. */ -static int direct_trap(const struct lguest *lg, - const struct desc_struct *trap, - unsigned int num) + * This routine indicates if a particular trap number could be delivered + * directly. */ +static int direct_trap(unsigned int num) { /* Hardware interrupts don't go to the Guest at all (except system * call). */ - if (num >= FIRST_EXTERNAL_VECTOR && num != SYSCALL_VECTOR) + if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return 0; /* The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), and of course, the hypercall * trap. */ - if (num == 14 || num == 13 || num == 7 || num == LGUEST_TRAP_ENTRY) - return 0; - - /* Only trap gates (type 15) can go direct to the Guest. Interrupt - * gates (type 14) disable interrupts as they are entered, which we - * never let the Guest do. Not present entries (type 0x0) also can't - * go direct, of course 8) */ - return idt_type(trap->a, trap->b) == 0xF; + return num != 14 && num != 13 && num != 7 && num != LGUEST_TRAP_ENTRY; } /*:*/ @@ -348,15 +392,11 @@ void load_guest_idt_entry(struct lguest *lg, unsigned int num, u32 lo, u32 hi) * to copy this again. */ lg->changed |= CHANGED_IDT; - /* The IDT which we keep in "struct lguest" only contains 32 entries - * for the traps and LGUEST_IRQS (32) entries for interrupts. We - * ignore attempts to set handlers for higher interrupt numbers, except - * for the system call "interrupt" at 128: we have a special IDT entry - * for that. */ - if (num < ARRAY_SIZE(lg->idt)) - set_trap(lg, &lg->idt[num], num, lo, hi); - else if (num == SYSCALL_VECTOR) - set_trap(lg, &lg->syscall_idt, num, lo, hi); + /* Check that the Guest doesn't try to step outside the bounds. */ + if (num >= ARRAY_SIZE(lg->arch.idt)) + kill_guest(lg, "Setting idt entry %u", num); + else + set_trap(lg, &lg->arch.idt[num], num, lo, hi); } /* The default entry for each interrupt points into the Switcher routines which @@ -399,20 +439,21 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, /* We can simply copy the direct traps, otherwise we use the default * ones in the Switcher: they will return to the Host. */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) { - if (direct_trap(lg, &lg->idt[i], i)) - idt[i] = lg->idt[i]; + for (i = 0; i < ARRAY_SIZE(lg->arch.idt); i++) { + /* If no Guest can ever override this trap, leave it alone. */ + if (!direct_trap(i)) + continue; + + /* Only trap gates (type 15) can go direct to the Guest. + * Interrupt gates (type 14) disable interrupts as they are + * entered, which we never let the Guest do. Not present + * entries (type 0x0) also can't go direct, of course. */ + if (idt_type(lg->arch.idt[i].a, lg->arch.idt[i].b) == 0xF) + idt[i] = lg->arch.idt[i]; else + /* Reset it to the default. */ default_idt_entry(&idt[i], i, def[i]); } - - /* Don't forget the system call trap! The IDT entries for other - * interupts never change, so no need to copy them. */ - i = SYSCALL_VECTOR; - if (direct_trap(lg, &lg->syscall_idt, i)) - idt[i] = lg->syscall_idt; - else - default_idt_entry(&idt[i], i, def[i]); } void guest_set_clockevent(struct lguest *lg, unsigned long delta) diff --git a/drivers/lguest/io.c b/drivers/lguest/io.c deleted file mode 100644 index ea68613..0000000 --- a/drivers/lguest/io.c +++ /dev/null @@ -1,626 +0,0 @@ -/*P:300 The I/O mechanism in lguest is simple yet flexible, allowing the Guest - * to talk to the Launcher or directly to another Guest. It uses familiar - * concepts of DMA and interrupts, plus some neat code stolen from - * futexes... :*/ - -/* Copyright (C) 2006 Rusty Russell IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <linux/types.h> -#include <linux/futex.h> -#include <linux/jhash.h> -#include <linux/mm.h> -#include <linux/highmem.h> -#include <linux/uaccess.h> -#include "lg.h" - -/*L:300 - * I/O - * - * Getting data in and out of the Guest is quite an art. There are numerous - * ways to do it, and they all suck differently. We try to keep things fairly - * close to "real" hardware so our Guest's drivers don't look like an alien - * visitation in the middle of the Linux code, and yet make sure that Guests - * can talk directly to other Guests, not just the Launcher. - * - * To do this, the Guest gives us a key when it binds or sends DMA buffers. - * The key corresponds to a "physical" address inside the Guest (ie. a virtual - * address inside the Launcher process). We don't, however, use this key - * directly. - * - * We want Guests which share memory to be able to DMA to each other: two - * Launchers can mmap memory the same file, then the Guests can communicate. - * Fortunately, the futex code provides us with a way to get a "union - * futex_key" corresponding to the memory lying at a virtual address: if the - * two processes share memory, the "union futex_key" for that memory will match - * even if the memory is mapped at different addresses in each. So we always - * convert the keys to "union futex_key"s to compare them. - * - * Before we dive into this though, we need to look at another set of helper - * routines used throughout the Host kernel code to access Guest memory. - :*/ -static struct list_head dma_hash[61]; - -/* An unfortunate side effect of the Linux double-linked list implementation is - * that there's no good way to statically initialize an array of linked - * lists. */ -void lguest_io_init(void) -{ - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(dma_hash); i++) - INIT_LIST_HEAD(&dma_hash[i]); -} - -/* FIXME: allow multi-page lengths. */ -static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma) -{ - unsigned int i; - - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { - if (!dma->len[i]) - return 1; - if (!lguest_address_ok(lg, dma->addr[i], dma->len[i])) - goto kill; - if (dma->len[i] > PAGE_SIZE) - goto kill; - /* We could do over a page, but is it worth it? */ - if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE) - goto kill; - } - return 1; - -kill: - kill_guest(lg, "bad DMA entry: %u@%#lx", dma->len[i], dma->addr[i]); - return 0; -} - -/*L:330 This is our hash function, using the wonderful Jenkins hash. - * - * The futex key is a union with three parts: an unsigned long word, a pointer, - * and an int "offset". We could use jhash_2words() which takes three u32s. - * (Ok, the hash functions are great: the naming sucks though). - * - * It's nice to be portable to 64-bit platforms, so we use the more generic - * jhash2(), which takes an array of u32, the number of u32s, and an initial - * u32 to roll in. This is uglier, but breaks down to almost the same code on - * 32-bit platforms like this one. - * - * We want a position in the array, so we modulo ARRAY_SIZE(dma_hash) (ie. 61). - */ -static unsigned int hash(const union futex_key *key) -{ - return jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, - key->both.offset) - % ARRAY_SIZE(dma_hash); -} - -/* This is a convenience routine to compare two keys. It's a much bemoaned C - * weakness that it doesn't allow '==' on structures or unions, so we have to - * open-code it like this. */ -static inline int key_eq(const union futex_key *a, const union futex_key *b) -{ - return (a->both.word == b->both.word - && a->both.ptr == b->both.ptr - && a->both.offset == b->both.offset); -} - -/*L:360 OK, when we need to actually free up a Guest's DMA array we do several - * things, so we have a convenient function to do it. - * - * The caller must hold a read lock on dmainfo owner's current->mm->mmap_sem - * for the drop_futex_key_refs(). */ -static void unlink_dma(struct lguest_dma_info *dmainfo) -{ - /* You locked this too, right? */ - BUG_ON(!mutex_is_locked(&lguest_lock)); - /* This is how we know that the entry is free. */ - dmainfo->interrupt = 0; - /* Remove it from the hash table. */ - list_del(&dmainfo->list); - /* Drop the references we were holding (to the inode or mm). */ - drop_futex_key_refs(&dmainfo->key); -} - -/*L:350 This is the routine which we call when the Guest asks to unregister a - * DMA array attached to a given key. Returns true if the array was found. */ -static int unbind_dma(struct lguest *lg, - const union futex_key *key, - unsigned long dmas) -{ - int i, ret = 0; - - /* We don't bother with the hash table, just look through all this - * Guest's DMA arrays. */ - for (i = 0; i < LGUEST_MAX_DMA; i++) { - /* In theory it could have more than one array on the same key, - * or one array on multiple keys, so we check both */ - if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) { - unlink_dma(&lg->dma[i]); - ret = 1; - break; - } - } - return ret; -} - -/*L:340 BIND_DMA: this is the hypercall which sets up an array of "struct - * lguest_dma" for receiving I/O. - * - * The Guest wants to bind an array of "struct lguest_dma"s to a particular key - * to receive input. This only happens when the Guest is setting up a new - * device, so it doesn't have to be very fast. - * - * It returns 1 on a successful registration (it can fail if we hit the limit - * of registrations for this Guest). - */ -int bind_dma(struct lguest *lg, - unsigned long ukey, unsigned long dmas, u16 numdmas, u8 interrupt) -{ - unsigned int i; - int ret = 0; - union futex_key key; - /* Futex code needs the mmap_sem. */ - struct rw_semaphore *fshared = ¤t->mm->mmap_sem; - - /* Invalid interrupt? (We could kill the guest here). */ - if (interrupt >= LGUEST_IRQS) - return 0; - - /* We need to grab the Big Lguest Lock, because other Guests may be - * trying to look through this Guest's DMAs to send something while - * we're doing this. */ - mutex_lock(&lguest_lock); - down_read(fshared); - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { - kill_guest(lg, "bad dma key %#lx", ukey); - goto unlock; - } - - /* We want to keep this key valid once we drop mmap_sem, so we have to - * hold a reference. */ - get_futex_key_refs(&key); - - /* If the Guest specified an interrupt of 0, that means they want to - * unregister this array of "struct lguest_dma"s. */ - if (interrupt == 0) - ret = unbind_dma(lg, &key, dmas); - else { - /* Look through this Guest's dma array for an unused entry. */ - for (i = 0; i < LGUEST_MAX_DMA; i++) { - /* If the interrupt is non-zero, the entry is already - * used. */ - if (lg->dma[i].interrupt) - continue; - - /* OK, a free one! Fill on our details. */ - lg->dma[i].dmas = dmas; - lg->dma[i].num_dmas = numdmas; - lg->dma[i].next_dma = 0; - lg->dma[i].key = key; - lg->dma[i].guestid = lg->guestid; - lg->dma[i].interrupt = interrupt; - - /* Now we add it to the hash table: the position - * depends on the futex key that we got. */ - list_add(&lg->dma[i].list, &dma_hash[hash(&key)]); - /* Success! */ - ret = 1; - goto unlock; - } - } - /* If we didn't find a slot to put the key in, drop the reference - * again. */ - drop_futex_key_refs(&key); -unlock: - /* Unlock and out. */ - up_read(fshared); - mutex_unlock(&lguest_lock); - return ret; -} - -/*L:385 Note that our routines to access a different Guest's memory are called - * lgread_other() and lgwrite_other(): these names emphasize that they are only - * used when the Guest is *not* the current Guest. - * - * The interface for copying from another process's memory is called - * access_process_vm(), with a final argument of 0 for a read, and 1 for a - * write. - * - * We need lgread_other() to read the destination Guest's "struct lguest_dma" - * array. */ -static int lgread_other(struct lguest *lg, - void *buf, u32 addr, unsigned bytes) -{ - if (!lguest_address_ok(lg, addr, bytes) - || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) { - memset(buf, 0, bytes); - kill_guest(lg, "bad address in registered DMA struct"); - return 0; - } - return 1; -} - -/* "lgwrite()" to another Guest: used to update the destination "used_len" once - * we've transferred data into the buffer. */ -static int lgwrite_other(struct lguest *lg, u32 addr, - const void *buf, unsigned bytes) -{ - if (!lguest_address_ok(lg, addr, bytes) - || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1) - != bytes)) { - kill_guest(lg, "bad address writing to registered DMA"); - return 0; - } - return 1; -} - -/*L:400 This is the generic engine which copies from a source "struct - * lguest_dma" from this Guest into another Guest's "struct lguest_dma". The - * destination Guest's pages have already been mapped, as contained in the - * pages array. - * - * If you're wondering if there's a nice "copy from one process to another" - * routine, so was I. But Linux isn't really set up to copy between two - * unrelated processes, so we have to write it ourselves. - */ -static u32 copy_data(struct lguest *srclg, - const struct lguest_dma *src, - const struct lguest_dma *dst, - struct page *pages[]) -{ - unsigned int totlen, si, di, srcoff, dstoff; - void *maddr = NULL; - - /* We return the total length transferred. */ - totlen = 0; - - /* We keep indexes into the source and destination "struct lguest_dma", - * and an offset within each region. */ - si = di = 0; - srcoff = dstoff = 0; - - /* We loop until the source or destination is exhausted. */ - while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si] - && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) { - /* We can only transfer the rest of the src buffer, or as much - * as will fit into the destination buffer. */ - u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff); - - /* For systems using "highmem" we need to use kmap() to access - * the page we want. We often use the same page over and over, - * so rather than kmap() it on every loop, we set the maddr - * pointer to NULL when we need to move to the next - * destination page. */ - if (!maddr) - maddr = kmap(pages[di]); - - /* Copy directly from (this Guest's) source address to the - * destination Guest's kmap()ed buffer. Note that maddr points - * to the start of the page: we need to add the offset of the - * destination address and offset within the buffer. */ - - /* FIXME: This is not completely portable. I looked at - * copy_to_user_page(), and some arch's seem to need special - * flushes. x86 is fine. */ - if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE, - (void __user *)src->addr[si], len) != 0) { - /* If a copy failed, it's the source's fault. */ - kill_guest(srclg, "bad address in sending DMA"); - totlen = 0; - break; - } - - /* Increment the total and src & dst offsets */ - totlen += len; - srcoff += len; - dstoff += len; - - /* Presumably we reached the end of the src or dest buffers: */ - if (srcoff == src->len[si]) { - /* Move to the next buffer at offset 0 */ - si++; - srcoff = 0; - } - if (dstoff == dst->len[di]) { - /* We need to unmap that destination page and reset - * maddr ready for the next one. */ - kunmap(pages[di]); - maddr = NULL; - di++; - dstoff = 0; - } - } - - /* If we still had a page mapped at the end, unmap now. */ - if (maddr) - kunmap(pages[di]); - - return totlen; -} - -/*L:390 This is how we transfer a "struct lguest_dma" from the source Guest - * (the current Guest which called SEND_DMA) to another Guest. */ -static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src, - struct lguest *dstlg, const struct lguest_dma *dst) -{ - int i; - u32 ret; - struct page *pages[LGUEST_MAX_DMA_SECTIONS]; - - /* We check that both source and destination "struct lguest_dma"s are - * within the bounds of the source and destination Guests */ - if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src)) - return 0; - - /* We need to map the pages which correspond to each parts of - * destination buffer. */ - for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) { - if (dst->len[i] == 0) - break; - /* get_user_pages() is a complicated function, especially since - * we only want a single page. But it works, and returns the - * number of pages. Note that we're holding the destination's - * mmap_sem, as get_user_pages() requires. */ - if (get_user_pages(dstlg->tsk, dstlg->mm, - dst->addr[i], 1, 1, 1, pages+i, NULL) - != 1) { - /* This means the destination gave us a bogus buffer */ - kill_guest(dstlg, "Error mapping DMA pages"); - ret = 0; - goto drop_pages; - } - } - - /* Now copy the data until we run out of src or dst. */ - ret = copy_data(srclg, src, dst, pages); - -drop_pages: - while (--i >= 0) - put_page(pages[i]); - return ret; -} - -/*L:380 Transferring data from one Guest to another is not as simple as I'd - * like. We've found the "struct lguest_dma_info" bound to the same address as - * the send, we need to copy into it. - * - * This function returns true if the destination array was empty. */ -static int dma_transfer(struct lguest *srclg, - unsigned long udma, - struct lguest_dma_info *dst) -{ - struct lguest_dma dst_dma, src_dma; - struct lguest *dstlg; - u32 i, dma = 0; - - /* From the "struct lguest_dma_info" we found in the hash, grab the - * Guest. */ - dstlg = &lguests[dst->guestid]; - /* Read in the source "struct lguest_dma" handed to SEND_DMA. */ - lgread(srclg, &src_dma, udma, sizeof(src_dma)); - - /* We need the destination's mmap_sem, and we already hold the source's - * mmap_sem for the futex key lookup. Normally this would suggest that - * we could deadlock if the destination Guest was trying to send to - * this source Guest at the same time, which is another reason that all - * I/O is done under the big lguest_lock. */ - down_read(&dstlg->mm->mmap_sem); - - /* Look through the destination DMA array for an available buffer. */ - for (i = 0; i < dst->num_dmas; i++) { - /* We keep a "next_dma" pointer which often helps us avoid - * looking at lots of previously-filled entries. */ - dma = (dst->next_dma + i) % dst->num_dmas; - if (!lgread_other(dstlg, &dst_dma, - dst->dmas + dma * sizeof(struct lguest_dma), - sizeof(dst_dma))) { - goto fail; - } - if (!dst_dma.used_len) - break; - } - - /* If we found a buffer, we do the actual data copy. */ - if (i != dst->num_dmas) { - unsigned long used_lenp; - unsigned int ret; - - ret = do_dma(srclg, &src_dma, dstlg, &dst_dma); - /* Put used length in the source "struct lguest_dma"'s used_len - * field. It's a little tricky to figure out where that is, - * though. */ - lgwrite_u32(srclg, - udma+offsetof(struct lguest_dma, used_len), ret); - /* Tranferring 0 bytes is OK if the source buffer was empty. */ - if (ret == 0 && src_dma.len[0] != 0) - goto fail; - - /* The destination Guest might be running on a different CPU: - * we have to make sure that it will see the "used_len" field - * change to non-zero *after* it sees the data we copied into - * the buffer. Hence a write memory barrier. */ - wmb(); - /* Figuring out where the destination's used_len field for this - * "struct lguest_dma" in the array is also a little ugly. */ - used_lenp = dst->dmas - + dma * sizeof(struct lguest_dma) - + offsetof(struct lguest_dma, used_len); - lgwrite_other(dstlg, used_lenp, &ret, sizeof(ret)); - /* Move the cursor for next time. */ - dst->next_dma++; - } - up_read(&dstlg->mm->mmap_sem); - - /* We trigger the destination interrupt, even if the destination was - * empty and we didn't transfer anything: this gives them a chance to - * wake up and refill. */ - set_bit(dst->interrupt, dstlg->irqs_pending); - /* Wake up the destination process. */ - wake_up_process(dstlg->tsk); - /* If we passed the last "struct lguest_dma", the receive had no - * buffers left. */ - return i == dst->num_dmas; - -fail: - up_read(&dstlg->mm->mmap_sem); - return 0; -} - -/*L:370 This is the counter-side to the BIND_DMA hypercall; the SEND_DMA - * hypercall. We find out who's listening, and send to them. */ -void send_dma(struct lguest *lg, unsigned long ukey, unsigned long udma) -{ - union futex_key key; - int empty = 0; - struct rw_semaphore *fshared = ¤t->mm->mmap_sem; - -again: - mutex_lock(&lguest_lock); - down_read(fshared); - /* Get the futex key for the key the Guest gave us */ - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { - kill_guest(lg, "bad sending DMA key"); - goto unlock; - } - /* Since the key must be a multiple of 4, the futex key uses the lower - * bit of the "offset" field (which would always be 0) to indicate a - * mapping which is shared with other processes (ie. Guests). */ - if (key.shared.offset & 1) { - struct lguest_dma_info *i; - /* Look through the hash for other Guests. */ - list_for_each_entry(i, &dma_hash[hash(&key)], list) { - /* Don't send to ourselves. */ - if (i->guestid == lg->guestid) - continue; - if (!key_eq(&key, &i->key)) - continue; - - /* If dma_transfer() tells us the destination has no - * available buffers, we increment "empty". */ - empty += dma_transfer(lg, udma, i); - break; - } - /* If the destination is empty, we release our locks and - * give the destination Guest a brief chance to restock. */ - if (empty == 1) { - /* Give any recipients one chance to restock. */ - up_read(¤t->mm->mmap_sem); - mutex_unlock(&lguest_lock); - /* Next time, we won't try again. */ - empty++; - goto again; - } - } else { - /* Private mapping: Guest is sending to its Launcher. We set - * the "dma_is_pending" flag so that the main loop will exit - * and the Launcher's read() from /dev/lguest will return. */ - lg->dma_is_pending = 1; - lg->pending_dma = udma; - lg->pending_key = ukey; - } -unlock: - up_read(fshared); - mutex_unlock(&lguest_lock); -} -/*:*/ - -void release_all_dma(struct lguest *lg) -{ - unsigned int i; - - BUG_ON(!mutex_is_locked(&lguest_lock)); - - down_read(&lg->mm->mmap_sem); - for (i = 0; i < LGUEST_MAX_DMA; i++) { - if (lg->dma[i].interrupt) - unlink_dma(&lg->dma[i]); - } - up_read(&lg->mm->mmap_sem); -} - -/*M:007 We only return a single DMA buffer to the Launcher, but it would be - * more efficient to return a pointer to the entire array of DMA buffers, which - * it can cache and choose one whenever it wants. - * - * Currently the Launcher uses a write to /dev/lguest, and the return value is - * the address of the DMA structure with the interrupt number placed in - * dma->used_len. If we wanted to return the entire array, we need to return - * the address, array size and interrupt number: this seems to require an - * ioctl(). :*/ - -/*L:320 This routine looks for a DMA buffer registered by the Guest on the - * given key (using the BIND_DMA hypercall). */ -unsigned long get_dma_buffer(struct lguest *lg, - unsigned long ukey, unsigned long *interrupt) -{ - unsigned long ret = 0; - union futex_key key; - struct lguest_dma_info *i; - struct rw_semaphore *fshared = ¤t->mm->mmap_sem; - - /* Take the Big Lguest Lock to stop other Guests sending this Guest DMA - * at the same time. */ - mutex_lock(&lguest_lock); - /* To match between Guests sharing the same underlying memory we steal - * code from the futex infrastructure. This requires that we hold the - * "mmap_sem" for our process (the Launcher), and pass it to the futex - * code. */ - down_read(fshared); - - /* This can fail if it's not a valid address, or if the address is not - * divisible by 4 (the futex code needs that, we don't really). */ - if (get_futex_key((u32 __user *)ukey, fshared, &key) != 0) { - kill_guest(lg, "bad registered DMA buffer"); - goto unlock; - } - /* Search the hash table for matching entries (the Launcher can only - * send to its own Guest for the moment, so the entry must be for this - * Guest) */ - list_for_each_entry(i, &dma_hash[hash(&key)], list) { - if (key_eq(&key, &i->key) && i->guestid == lg->guestid) { - unsigned int j; - /* Look through the registered DMA array for an - * available buffer. */ - for (j = 0; j < i->num_dmas; j++) { - struct lguest_dma dma; - - ret = i->dmas + j * sizeof(struct lguest_dma); - lgread(lg, &dma, ret, sizeof(dma)); - if (dma.used_len == 0) - break; - } - /* Store the interrupt the Guest wants when the buffer - * is used. */ - *interrupt = i->interrupt; - break; - } - } -unlock: - up_read(fshared); - mutex_unlock(&lguest_lock); - return ret; -} -/*:*/ - -/*L:410 This really has completed the Launcher. Not only have we now finished - * the longest chapter in our journey, but this also means we are over halfway - * through! - * - * Enough prevaricating around the bush: it is time for us to dive into the - * core of the Host, in "make Host". - */ diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 64f0abe..d9144be 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -1,119 +1,25 @@ #ifndef _LGUEST_H #define _LGUEST_H -#include <asm/desc.h> - -#define GDT_ENTRY_LGUEST_CS 10 -#define GDT_ENTRY_LGUEST_DS 11 -#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) -#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) - #ifndef __ASSEMBLY__ #include <linux/types.h> #include <linux/init.h> #include <linux/stringify.h> -#include <linux/binfmts.h> -#include <linux/futex.h> #include <linux/lguest.h> #include <linux/lguest_launcher.h> #include <linux/wait.h> #include <linux/err.h> #include <asm/semaphore.h> -#include "irq_vectors.h" - -#define GUEST_PL 1 -struct lguest_regs -{ - /* Manually saved part. */ - unsigned long ebx, ecx, edx; - unsigned long esi, edi, ebp; - unsigned long gs; - unsigned long eax; - unsigned long fs, ds, es; - unsigned long trapnum, errcode; - /* Trap pushed part */ - unsigned long eip; - unsigned long cs; - unsigned long eflags; - unsigned long esp; - unsigned long ss; -}; +#include <asm/lguest.h> void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); -/* Full 4G segment descriptors, suitable for CS and DS. */ -#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) -#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) - -struct lguest_dma_info -{ - struct list_head list; - union futex_key key; - unsigned long dmas; - u16 next_dma; - u16 num_dmas; - u16 guestid; - u8 interrupt; /* 0 when not registered */ -}; - -/*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He - * reviewed the original code which used "u32" for all page table entries, and - * insisted that it would be far clearer with explicit typing. I thought it - * was overkill, but he was right: it is much clearer than it was before. - * - * We have separate types for the Guest's ptes & pgds and the shadow ptes & - * pgds. There's already a Linux type for these (pte_t and pgd_t) but they - * change depending on kernel config options (PAE). */ - -/* Each entry is identical: lower 12 bits of flags and upper 20 bits for the - * "page frame number" (0 == first physical page, etc). They are different - * types so the compiler will warn us if we mix them improperly. */ -typedef union { - struct { unsigned flags:12, pfn:20; }; - struct { unsigned long val; } raw; -} spgd_t; -typedef union { - struct { unsigned flags:12, pfn:20; }; - struct { unsigned long val; } raw; -} spte_t; -typedef union { - struct { unsigned flags:12, pfn:20; }; - struct { unsigned long val; } raw; -} gpgd_t; -typedef union { - struct { unsigned flags:12, pfn:20; }; - struct { unsigned long val; } raw; -} gpte_t; - -/* We have two convenient macros to convert a "raw" value as handed to us by - * the Guest into the correct Guest PGD or PTE type. */ -#define mkgpte(_val) ((gpte_t){.raw.val = _val}) -#define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) -/*:*/ - struct pgdir { - unsigned long cr3; - spgd_t *pgdir; -}; - -/* This is a guest-specific page (mapped ro) into the guest. */ -struct lguest_ro_state -{ - /* Host information we need to restore when we switch back. */ - u32 host_cr3; - struct Xgt_desc_struct host_idt_desc; - struct Xgt_desc_struct host_gdt_desc; - u32 host_sp; - - /* Fields which are used when guest is running. */ - struct Xgt_desc_struct guest_idt_desc; - struct Xgt_desc_struct guest_gdt_desc; - struct i386_hw_tss guest_tss; - struct desc_struct guest_idt[IDT_ENTRIES]; - struct desc_struct guest_gdt[GDT_ENTRIES]; + unsigned long gpgdir; + pgd_t *pgdir; }; /* We have two pages shared with guests, per cpu. */ @@ -141,9 +47,11 @@ struct lguest struct lguest_data __user *lguest_data; struct task_struct *tsk; struct mm_struct *mm; /* == tsk->mm, but that becomes NULL on exit */ - u16 guestid; u32 pfn_limit; - u32 page_offset; + /* This provides the offset to the base of guest-physical + * memory in the Launcher. */ + void __user *mem_base; + unsigned long kernel_address; u32 cr2; int halted; int ts; @@ -151,6 +59,9 @@ struct lguest u32 esp1; u8 ss1; + /* If a hypercall was asked for, this points to the arguments. */ + struct hcall_args *hcall; + /* Do we need to stop what we're doing and return to userspace? */ int break_out; wait_queue_head_t break_wq; @@ -167,24 +78,15 @@ struct lguest struct task_struct *wake; unsigned long noirq_start, noirq_end; - int dma_is_pending; - unsigned long pending_dma; /* struct lguest_dma */ - unsigned long pending_key; /* address they're sending to */ + unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ unsigned int stack_pages; u32 tsc_khz; - struct lguest_dma_info dma[LGUEST_MAX_DMA]; - /* Dead? */ const char *dead; - /* The GDT entries copied into lguest_ro_state when running. */ - struct desc_struct gdt[GDT_ENTRIES]; - - /* The IDT entries: some copied into lguest_ro_state when running. */ - struct desc_struct idt[FIRST_EXTERNAL_VECTOR+LGUEST_IRQS]; - struct desc_struct syscall_idt; + struct lguest_arch arch; /* Virtual clock device */ struct hrtimer hrt; @@ -193,19 +95,38 @@ struct lguest DECLARE_BITMAP(irqs_pending, LGUEST_IRQS); }; -extern struct lguest lguests[]; extern struct mutex lguest_lock; /* core.c: */ -u32 lgread_u32(struct lguest *lg, unsigned long addr); -void lgwrite_u32(struct lguest *lg, unsigned long addr, u32 val); -void lgread(struct lguest *lg, void *buf, unsigned long addr, unsigned len); -void lgwrite(struct lguest *lg, unsigned long, const void *buf, unsigned len); -int find_free_guest(void); int lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len); +void __lgread(struct lguest *, void *, unsigned long, unsigned); +void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); + +/*L:306 Using memory-copy operations like that is usually inconvient, so we + * have the following helper macros which read and write a specific type (often + * an unsigned long). + * + * This reads into a variable of the given type then returns that. */ +#define lgread(lg, addr, type) \ + ({ type _v; __lgread((lg), &_v, (addr), sizeof(_v)); _v; }) + +/* This checks that the variable is of the given type, then writes it out. */ +#define lgwrite(lg, addr, type, val) \ + do { \ + typecheck(type, val); \ + __lgwrite((lg), (addr), &(val), sizeof(val)); \ + } while(0) +/* (end of memory access helper routines) :*/ + int run_guest(struct lguest *lg, unsigned long __user *user); +/* Helper macros to obtain the first 12 or the last 20 bits, this is only the + * first step in the migration to the kernel types. pte_pfn is already defined + * in the kernel. */ +#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) +#define pte_flags(x) (pte_val(x) & ~PAGE_MASK) +#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) /* interrupts_and_traps.c: */ void maybe_do_interrupt(struct lguest *lg); @@ -219,6 +140,9 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt, const unsigned long *def); void guest_set_clockevent(struct lguest *lg, unsigned long delta); void init_clockdev(struct lguest *lg); +bool check_syscall_vector(struct lguest *lg); +int init_interrupts(void); +void free_interrupts(void); /* segments.c: */ void setup_default_gdt_entries(struct lguest_ro_state *state); @@ -232,28 +156,33 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt); int init_guest_pagetable(struct lguest *lg, unsigned long pgtable); void free_guest_pagetable(struct lguest *lg); void guest_new_pagetable(struct lguest *lg, unsigned long pgtable); -void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); +void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); void guest_pagetable_clear_all(struct lguest *lg); void guest_pagetable_flush_user(struct lguest *lg); -void guest_set_pte(struct lguest *lg, unsigned long cr3, - unsigned long vaddr, gpte_t val); +void guest_set_pte(struct lguest *lg, unsigned long gpgdir, + unsigned long vaddr, pte_t val); void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); int demand_page(struct lguest *info, unsigned long cr2, int errcode); void pin_page(struct lguest *lg, unsigned long vaddr); +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr); +void page_table_guest_data_init(struct lguest *lg); + +/* <arch>/core.c: */ +void lguest_arch_host_init(void); +void lguest_arch_host_fini(void); +void lguest_arch_run_guest(struct lguest *lg); +void lguest_arch_handle_trap(struct lguest *lg); +int lguest_arch_init_hypercalls(struct lguest *lg); +int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args); +void lguest_arch_setup_regs(struct lguest *lg, unsigned long start); + +/* <arch>/switcher.S: */ +extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; /* lguest_user.c: */ int lguest_device_init(void); void lguest_device_remove(void); -/* io.c: */ -void lguest_io_init(void); -int bind_dma(struct lguest *lg, - unsigned long key, unsigned long udma, u16 numdmas, u8 interrupt); -void send_dma(struct lguest *info, unsigned long key, unsigned long udma); -void release_all_dma(struct lguest *lg); -unsigned long get_dma_buffer(struct lguest *lg, unsigned long key, - unsigned long *interrupt); - /* hypercalls.c: */ void do_hypercalls(struct lguest *lg); void write_timestamp(struct lguest *lg); @@ -292,9 +221,5 @@ do { \ } while(0) /* (End of aside) :*/ -static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) -{ - return vaddr - lg->page_offset; -} #endif /* __ASSEMBLY__ */ #endif /* _LGUEST_H */ diff --git a/drivers/lguest/lguest_bus.c b/drivers/lguest/lguest_bus.c deleted file mode 100644 index 5732978..0000000 --- a/drivers/lguest/lguest_bus.c +++ /dev/null @@ -1,218 +0,0 @@ -/*P:050 Lguest guests use a very simple bus for devices. It's a simple array - * of device descriptors contained just above the top of normal memory. The - * lguest bus is 80% tedious boilerplate code. :*/ -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/lguest_bus.h> -#include <asm/io.h> -#include <asm/paravirt.h> - -static ssize_t type_show(struct device *_dev, - struct device_attribute *attr, char *buf) -{ - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); - return sprintf(buf, "%hu", lguest_devices[dev->index].type); -} -static ssize_t features_show(struct device *_dev, - struct device_attribute *attr, char *buf) -{ - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); - return sprintf(buf, "%hx", lguest_devices[dev->index].features); -} -static ssize_t pfn_show(struct device *_dev, - struct device_attribute *attr, char *buf) -{ - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); - return sprintf(buf, "%u", lguest_devices[dev->index].pfn); -} -static ssize_t status_show(struct device *_dev, - struct device_attribute *attr, char *buf) -{ - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); - return sprintf(buf, "%hx", lguest_devices[dev->index].status); -} -static ssize_t status_store(struct device *_dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); - if (sscanf(buf, "%hi", &lguest_devices[dev->index].status) != 1) - return -EINVAL; - return count; -} -static struct device_attribute lguest_dev_attrs[] = { - __ATTR_RO(type), - __ATTR_RO(features), - __ATTR_RO(pfn), - __ATTR(status, 0644, status_show, status_store), - __ATTR_NULL -}; - -/*D:130 The generic bus infrastructure requires a function which says whether a - * device matches a driver. For us, it is simple: "struct lguest_driver" - * contains a "device_type" field which indicates what type of device it can - * handle, so we just cast the args and compare: */ -static int lguest_dev_match(struct device *_dev, struct device_driver *_drv) -{ - struct lguest_device *dev = container_of(_dev,struct lguest_device,dev); - struct lguest_driver *drv = container_of(_drv,struct lguest_driver,drv); - - return (drv->device_type == lguest_devices[dev->index].type); -} -/*:*/ - -struct lguest_bus { - struct bus_type bus; - struct device dev; -}; - -static struct lguest_bus lguest_bus = { - .bus = { - .name = "lguest", - .match = lguest_dev_match, - .dev_attrs = lguest_dev_attrs, - }, - .dev = { - .parent = NULL, - .bus_id = "lguest", - } -}; - -/*D:140 This is the callback which occurs once the bus infrastructure matches - * up a device and driver, ie. in response to add_lguest_device() calling - * device_register(), or register_lguest_driver() calling driver_register(). - * - * At the moment it's always the latter: the devices are added first, since - * scan_devices() is called from a "core_initcall", and the drivers themselves - * called later as a normal "initcall". But it would work the other way too. - * - * So now we have the happy couple, we add the status bit to indicate that we - * found a driver. If the driver truly loves the device, it will return - * happiness from its probe function (ok, perhaps this wasn't my greatest - * analogy), and we set the final "driver ok" bit so the Host sees it's all - * green. */ -static int lguest_dev_probe(struct device *_dev) -{ - int ret; - struct lguest_device*dev = container_of(_dev,struct lguest_device,dev); - struct lguest_driver*drv = container_of(dev->dev.driver, - struct lguest_driver, drv); - - lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER; - ret = drv->probe(dev); - if (ret == 0) - lguest_devices[dev->index].status |= LGUEST_DEVICE_S_DRIVER_OK; - return ret; -} - -/* The last part of the bus infrastructure is the function lguest drivers use - * to register themselves. Firstly, we do nothing if there's no lguest bus - * (ie. this is not a Guest), otherwise we fill in the embedded generic "struct - * driver" fields and call the generic driver_register(). */ -int register_lguest_driver(struct lguest_driver *drv) -{ - if (!lguest_devices) - return 0; - - drv->drv.bus = &lguest_bus.bus; - drv->drv.name = drv->name; - drv->drv.owner = drv->owner; - drv->drv.probe = lguest_dev_probe; - - return driver_register(&drv->drv); -} - -/* At the moment we build all the drivers into the kernel because they're so - * simple: 8144 bytes for all three of them as I type this. And as the console - * really needs to be built in, it's actually only 3527 bytes for the network - * and block drivers. - * - * If they get complex it will make sense for them to be modularized, so we - * need to explicitly export the symbol. - * - * I don't think non-GPL modules make sense, so it's a GPL-only export. - */ -EXPORT_SYMBOL_GPL(register_lguest_driver); - -/*D:120 This is the core of the lguest bus: actually adding a new device. - * It's a separate function because it's neater that way, and because an - * earlier version of the code supported hotplug and unplug. They were removed - * early on because they were never used. - * - * As Andrew Tridgell says, "Untested code is buggy code". - * - * It's worth reading this carefully: we start with an index into the array of - * "struct lguest_device_desc"s indicating the device which is new: */ -static void add_lguest_device(unsigned int index) -{ - struct lguest_device *new; - - /* Each "struct lguest_device_desc" has a "status" field, which the - * Guest updates as the device is probed. In the worst case, the Host - * can look at these bits to tell what part of device setup failed, - * even if the console isn't available. */ - lguest_devices[index].status |= LGUEST_DEVICE_S_ACKNOWLEDGE; - new = kmalloc(sizeof(struct lguest_device), GFP_KERNEL); - if (!new) { - printk(KERN_EMERG "Cannot allocate lguest device %u\n", index); - lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; - return; - } - - /* The "struct lguest_device" setup is pretty straight-forward example - * code. */ - new->index = index; - new->private = NULL; - memset(&new->dev, 0, sizeof(new->dev)); - new->dev.parent = &lguest_bus.dev; - new->dev.bus = &lguest_bus.bus; - sprintf(new->dev.bus_id, "%u", index); - - /* device_register() causes the bus infrastructure to look for a - * matching driver. */ - if (device_register(&new->dev) != 0) { - printk(KERN_EMERG "Cannot register lguest device %u\n", index); - lguest_devices[index].status |= LGUEST_DEVICE_S_FAILED; - kfree(new); - } -} - -/*D:110 scan_devices() simply iterates through the device array. The type 0 - * is reserved to mean "no device", and anything else means we have found a - * device: add it. */ -static void scan_devices(void) -{ - unsigned int i; - - for (i = 0; i < LGUEST_MAX_DEVICES; i++) - if (lguest_devices[i].type) - add_lguest_device(i); -} - -/*D:100 Fairly early in boot, lguest_bus_init() is called to set up the lguest - * bus. We check that we are a Guest by checking paravirt_ops.name: there are - * other ways of checking, but this seems most obvious to me. - * - * So we can access the array of "struct lguest_device_desc"s easily, we map - * that memory and store the pointer in the global "lguest_devices". Then we - * register the bus with the core. Doing two registrations seems clunky to me, - * but it seems to be the correct sysfs incantation. - * - * Finally we call scan_devices() which adds all the devices found in the - * "struct lguest_device_desc" array. */ -static int __init lguest_bus_init(void) -{ - if (strcmp(pv_info.name, "lguest") != 0) - return 0; - - /* Devices are in a single page above top of "normal" mem */ - lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); - - if (bus_register(&lguest_bus.bus) != 0 - || device_register(&lguest_bus.dev) != 0) - panic("lguest bus registration failed"); - - scan_devices(); - return 0; -} -/* Do this after core stuff, before devices. */ -postcore_initcall(lguest_bus_init); diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c new file mode 100644 index 0000000..71c6483 --- /dev/null +++ b/drivers/lguest/lguest_device.c @@ -0,0 +1,373 @@ +/*P:050 Lguest guests use a very simple method to describe devices. It's a + * series of device descriptors contained just above the top of normal + * memory. + * + * We use the standard "virtio" device infrastructure, which provides us with a + * console, a network and a block driver. Each one expects some configuration + * information and a "virtqueue" mechanism to send and receive data. :*/ +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/lguest_launcher.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/interrupt.h> +#include <linux/virtio_ring.h> +#include <linux/err.h> +#include <asm/io.h> +#include <asm/paravirt.h> +#include <asm/lguest_hcall.h> + +/* The pointer to our (page) of device descriptions. */ +static void *lguest_devices; + +/* Unique numbering for lguest devices. */ +static unsigned int dev_index; + +/* For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. */ +static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) +{ + return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages); +} + +static inline void lguest_unmap(void *addr) +{ + iounmap((__force void __iomem *)addr); +} + +/*D:100 Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. */ +struct lguest_device { + struct virtio_device vdev; + + /* The entry in the lguest_devices page for this device. */ + struct lguest_device_desc *desc; +}; + +/* Since the virtio infrastructure hands us a pointer to the virtio_device all + * the time, it helps to have a curt macro to get a pointer to the struct + * lguest_device it's enclosed in. */ +#define to_lgdev(vdev) container_of(vdev, struct lguest_device, vdev) + +/*D:130 + * Device configurations + * + * The configuration information for a device consists of a series of fields. + * The device will look for these fields during setup. + * + * For us these fields come immediately after that device's descriptor in the + * lguest_devices page. + * + * Each field starts with a "type" byte, a "length" byte, then that number of + * bytes of configuration information. The device descriptor tells us the + * total configuration length so we know when we've reached the last field. */ + +/* type + length bytes */ +#define FHDR_LEN 2 + +/* This finds the first field of a given type for a device's configuration. */ +static void *lg_find(struct virtio_device *vdev, u8 type, unsigned int *len) +{ + struct lguest_device_desc *desc = to_lgdev(vdev)->desc; + int i; + + for (i = 0; i < desc->config_len; i += FHDR_LEN + desc->config[i+1]) { + if (desc->config[i] == type) { + /* Mark it used, so Host can know we looked at it, and + * also so we won't find the same one twice. */ + desc->config[i] |= 0x80; + /* Remember, the second byte is the length. */ + *len = desc->config[i+1]; + /* We return a pointer to the field header. */ + return desc->config + i; + } + } + + /* Not found: return NULL for failure. */ + return NULL; +} + +/* Once they've found a field, getting a copy of it is easy. */ +static void lg_get(struct virtio_device *vdev, void *token, + void *buf, unsigned len) +{ + /* Check they didn't ask for more than the length of the field! */ + BUG_ON(len > ((u8 *)token)[1]); + memcpy(buf, token + FHDR_LEN, len); +} + +/* Setting the contents is also trivial. */ +static void lg_set(struct virtio_device *vdev, void *token, + const void *buf, unsigned len) +{ + BUG_ON(len > ((u8 *)token)[1]); + memcpy(token + FHDR_LEN, buf, len); +} + +/* The operations to get and set the status word just access the status field + * of the device descriptor. */ +static u8 lg_get_status(struct virtio_device *vdev) +{ + return to_lgdev(vdev)->desc->status; +} + +static void lg_set_status(struct virtio_device *vdev, u8 status) +{ + to_lgdev(vdev)->desc->status = status; +} + +/* + * Virtqueues + * + * The other piece of infrastructure virtio needs is a "virtqueue": a way of + * the Guest device registering buffers for the other side to read from or + * write into (ie. send and receive buffers). Each device can have multiple + * virtqueues: for example the console has one queue for sending and one for + * receiving. + * + * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue + * already exists in virtio_ring.c. We just need to connect it up. + * + * We start with the information we need to keep about each virtqueue. + */ + +/*D:140 This is the information we remember about each virtqueue. */ +struct lguest_vq_info +{ + /* A copy of the information contained in the device config. */ + struct lguest_vqconfig config; + + /* The address where we mapped the virtio ring, so we can unmap it. */ + void *pages; +}; + +/* When the virtio_ring code wants to prod the Host, it calls us here and we + * make a hypercall. We hand the page number of the virtqueue so the Host + * knows which virtqueue we're talking about. */ +static void lg_notify(struct virtqueue *vq) +{ + /* We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. */ + struct lguest_vq_info *lvq = vq->priv; + + hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0); +} + +/* This routine finds the first virtqueue described in the configuration of + * this device and sets it up. + * + * This is kind of an ugly duckling. It'd be nicer to have a standard + * representation of a virtqueue in the configuration space, but it seems that + * everyone wants to do it differently. The KVM guys want the Guest to + * allocate its own pages and tell the Host where they are, but for lguest it's + * simpler for the Host to simply tell us where the pages are. + * + * So we provide devices with a "find virtqueue and set it up" function. */ +static struct virtqueue *lg_find_vq(struct virtio_device *vdev, + bool (*callback)(struct virtqueue *vq)) +{ + struct lguest_vq_info *lvq; + struct virtqueue *vq; + unsigned int len; + void *token; + int err; + + /* Look for a field of the correct type to mark a virtqueue. Note that + * if this succeeds, then the type will be changed so it won't be found + * again, and future lg_find_vq() calls will find the next + * virtqueue (if any). */ + token = vdev->config->find(vdev, VIRTIO_CONFIG_F_VIRTQUEUE, &len); + if (!token) + return ERR_PTR(-ENOENT); + + lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); + if (!lvq) + return ERR_PTR(-ENOMEM); + + /* Note: we could use a configuration space inside here, just like we + * do for the device. This would allow expansion in future, because + * our configuration system is designed to be expansible. But this is + * way easier. */ + if (len != sizeof(lvq->config)) { + dev_err(&vdev->dev, "Unexpected virtio config len %u\n", len); + err = -EIO; + goto free_lvq; + } + /* Make a copy of the "struct lguest_vqconfig" field. We need a copy + * because the config space might not be aligned correctly. */ + vdev->config->get(vdev, token, &lvq->config, sizeof(lvq->config)); + + /* Figure out how many pages the ring will take, and map that memory */ + lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, + DIV_ROUND_UP(vring_size(lvq->config.num), + PAGE_SIZE)); + if (!lvq->pages) { + err = -ENOMEM; + goto free_lvq; + } + + /* OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. */ + vq = vring_new_virtqueue(lvq->config.num, vdev, lvq->pages, + lg_notify, callback); + if (!vq) { + err = -ENOMEM; + goto unmap; + } + + /* Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. */ + /* FIXME: We used to have a flag for the Host to tell us we could use + * the interrupt as a source of randomness: it'd be nice to have that + * back.. */ + err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, + vdev->dev.bus_id, vq); + if (err) + goto destroy_vring; + + /* Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. */ + vq->priv = lvq; + return vq; + +destroy_vring: + vring_del_virtqueue(vq); +unmap: + lguest_unmap(lvq->pages); +free_lvq: + kfree(lvq); + return ERR_PTR(err); +} +/*:*/ + +/* Cleaning up a virtqueue is easy */ +static void lg_del_vq(struct virtqueue *vq) +{ + struct lguest_vq_info *lvq = vq->priv; + + /* Tell virtio_ring.c to free the virtqueue. */ + vring_del_virtqueue(vq); + /* Unmap the pages containing the ring. */ + lguest_unmap(lvq->pages); + /* Free our own queue information. */ + kfree(lvq); +} + +/* The ops structure which hooks everything together. */ +static struct virtio_config_ops lguest_config_ops = { + .find = lg_find, + .get = lg_get, + .set = lg_set, + .get_status = lg_get_status, + .set_status = lg_set_status, + .find_vq = lg_find_vq, + .del_vq = lg_del_vq, +}; + +/* The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +static struct device lguest_root = { + .parent = NULL, + .bus_id = "lguest", +}; + +/*D:120 This is the core of the lguest bus: actually adding a new device. + * It's a separate function because it's neater that way, and because an + * earlier version of the code supported hotplug and unplug. They were removed + * early on because they were never used. + * + * As Andrew Tridgell says, "Untested code is buggy code". + * + * It's worth reading this carefully: we start with a pointer to the new device + * descriptor in the "lguest_devices" page. */ +static void add_lguest_device(struct lguest_device_desc *d) +{ + struct lguest_device *ldev; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) { + printk(KERN_EMERG "Cannot allocate lguest dev %u\n", + dev_index++); + return; + } + + /* This devices' parent is the lguest/ dir. */ + ldev->vdev.dev.parent = &lguest_root; + /* We have a unique device index thanks to the dev_index counter. */ + ldev->vdev.index = dev_index++; + /* The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. */ + ldev->vdev.id.device = d->type; + /* We have a simple set of routines for querying the device's + * configuration information and setting its status. */ + ldev->vdev.config = &lguest_config_ops; + /* And we remember the device's descriptor for lguest_config_ops. */ + ldev->desc = d; + + /* register_virtio_device() sets up the generic fields for the struct + * virtio_device and calls device_register(). This makes the bus + * infrastructure look for a matching driver. */ + if (register_virtio_device(&ldev->vdev) != 0) { + printk(KERN_ERR "Failed to register lguest device %u\n", + ldev->vdev.index); + kfree(ldev); + } +} + +/*D:110 scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". */ +static void scan_devices(void) +{ + unsigned int i; + struct lguest_device_desc *d; + + /* We start at the page beginning, and skip over each entry. */ + for (i = 0; i < PAGE_SIZE; i += sizeof(*d) + d->config_len) { + d = lguest_devices + i; + + /* Once we hit a zero, stop. */ + if (d->type == 0) + break; + + add_lguest_device(d); + } +} + +/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the + * lguest device infrastructure. We check that we are a Guest by checking + * pv_info.name: there are other ways of checking, but this seems most + * obvious to me. + * + * So we can access the "struct lguest_device_desc"s easily, we map that memory + * and store the pointer in the global "lguest_devices". Then we register a + * root device from which all our devices will hang (this seems to be the + * correct sysfs incantation). + * + * Finally we call scan_devices() which adds all the devices found in the + * lguest_devices page. */ +static int __init lguest_devices_init(void) +{ + if (strcmp(pv_info.name, "lguest") != 0) + return 0; + + if (device_register(&lguest_root) != 0) + panic("Could not register lguest root"); + + /* Devices are in a single page above top of "normal" mem */ + lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); + + scan_devices(); + return 0; +} +/* We do this after core stuff, but before the drivers. */ +postcore_initcall(lguest_devices_init); + +/*D:150 At this point in the journey we used to now wade through the lguest + * devices themselves: net, block and console. Since they're all now virtio + * devices rather than lguest-specific, I've decided to ignore them. Mostly, + * they're kind of boring. But this does mean you'll never experience the + * thrill of reading the forbidden love scene buried deep in the block driver. + * + * "make Launcher" beckons, where we answer questions like "Where do Guests + * come from?", and "What do you do when someone asks for optimization?". */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 80d1b58..ee405b3 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,73 +1,17 @@ /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the memory size, pagetable, entry point and kernel address offset. - * A read will run the Guest until a signal is pending (-EINTR), or the Guest - * does a DMA out to the Launcher. Writes are also used to get a DMA buffer - * registered by the Guest and to send the Guest an interrupt. :*/ + * tell us the Guest's memory layout, pagetable, entry point and kernel address + * offset. A read will run the Guest until something happens, such as a signal + * or the Guest doing a NOTIFY out to the Launcher. :*/ #include <linux/uaccess.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include "lg.h" -/*L:030 setup_regs() doesn't really belong in this file, but it gives us an - * early glimpse deeper into the Host so it's worth having here. - * - * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ -static void setup_regs(struct lguest_regs *regs, unsigned long start) -{ - /* There are four "segment" registers which the Guest needs to boot: - * The "code segment" register (cs) refers to the kernel code segment - * __KERNEL_CS, and the "data", "extra" and "stack" segment registers - * refer to the kernel data segment __KERNEL_DS. - * - * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ - regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; - regs->cs = __KERNEL_CS|GUEST_PL; - - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) - * is supposed to always be "1". Bit 9 (0x200) controls whether - * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ - regs->eflags = 0x202; - - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ - regs->eip = start; - - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ -} - -/*L:310 To send DMA into the Guest, the Launcher needs to be able to ask for a - * DMA buffer. This is done by writing LHREQ_GETDMA and the key to - * /dev/lguest. */ -static long user_get_dma(struct lguest *lg, const u32 __user *input) -{ - unsigned long key, udma, irq; - - /* Fetch the key they wrote to us. */ - if (get_user(key, input) != 0) - return -EFAULT; - /* Look for a free Guest DMA buffer bound to that key. */ - udma = get_dma_buffer(lg, key, &irq); - if (!udma) - return -ENOENT; - - /* We need to tell the Launcher what interrupt the Guest expects after - * the buffer is filled. We stash it in udma->used_len. */ - lgwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq); - - /* The (guest-physical) address of the DMA buffer is returned from - * the write(). */ - return udma; -} - /*L:315 To force the Guest to stop running and return to the Launcher, the * Waker sets writes LHREQ_BREAK and the value "1" to /dev/lguest. The * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ -static int break_guest_out(struct lguest *lg, const u32 __user *input) +static int break_guest_out(struct lguest *lg, const unsigned long __user *input) { unsigned long on; @@ -90,9 +34,9 @@ static int break_guest_out(struct lguest *lg, const u32 __user *input) /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt * number to /dev/lguest. */ -static int user_send_irq(struct lguest *lg, const u32 __user *input) +static int user_send_irq(struct lguest *lg, const unsigned long __user *input) { - u32 irq; + unsigned long irq; if (get_user(irq, input) != 0) return -EFAULT; @@ -133,17 +77,19 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent DMA, + /* If we returned from read() last time because the Guest notified, * clear the flag. */ - if (lg->dma_is_pending) - lg->dma_is_pending = 0; + if (lg->pending_notify) + lg->pending_notify = 0; /* Run the Guest until something interesting happens. */ return run_guest(lg, (unsigned long __user *)user); } -/*L:020 The initialization write supplies 4 32-bit values (in addition to the - * 32-bit LHREQ_INITIALIZE value). These are: +/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit) + * values (in addition to the LHREQ_INITIALIZE value). These are: + * + * base: The start of the Guest-physical memory inside the Launcher memory. * * pfnlimit: The highest (Guest-physical) page number the Guest should be * allowed to access. The Launcher has to live in Guest memory, so it sets @@ -153,23 +99,17 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) * pagetables (which are set up by the Launcher). * * start: The first instruction to execute ("eip" in x86-speak). - * - * page_offset: The PAGE_OFFSET constant in the Guest kernel. We should - * probably wean the code off this, but it's a very useful constant! Any - * address above this is within the Guest kernel, and any kernel address can - * quickly converted from physical to virtual by adding PAGE_OFFSET. It's - * 0xC0000000 (3G) by default, but it's configurable at kernel build time. */ -static int initialize(struct file *file, const u32 __user *input) +static int initialize(struct file *file, const unsigned long __user *input) { /* "struct lguest" contains everything we (the Host) know about a * Guest. */ struct lguest *lg; - int err, i; - u32 args[4]; + int err; + unsigned long args[4]; - /* We grab the Big Lguest lock, which protects the global array - * "lguests" and multiple simultaneous initializations. */ + /* We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -182,20 +122,15 @@ static int initialize(struct file *file, const u32 __user *input) goto unlock; } - /* Find an unused guest. */ - i = find_free_guest(); - if (i < 0) { - err = -ENOSPC; + lg = kzalloc(sizeof(*lg), GFP_KERNEL); + if (!lg) { + err = -ENOMEM; goto unlock; } - /* OK, we have an index into the "lguest" array: "lg" is a convenient - * pointer. */ - lg = &lguests[i]; /* Populate the easy fields of our "struct lguest" */ - lg->guestid = i; - lg->pfn_limit = args[0]; - lg->page_offset = args[3]; + lg->mem_base = (void __user *)(long)args[0]; + lg->pfn_limit = args[1]; /* We need a complete page for the Guest registers: they are accessible * to the Guest and we can only grant it access to whole pages. */ @@ -210,17 +145,13 @@ static int initialize(struct file *file, const u32 __user *input) /* Initialize the Guest's shadow page tables, using the toplevel * address the Launcher gave us. This allocates memory, so can * fail. */ - err = init_guest_pagetable(lg, args[1]); + err = init_guest_pagetable(lg, args[2]); if (err) goto free_regs; /* Now we initialize the Guest's registers, handing it the start * address. */ - setup_regs(lg->regs, args[2]); - - /* There are a couple of GDT entries the Guest expects when first - * booting. */ - setup_guest_gdt(lg); + lguest_arch_setup_regs(lg, args[3]); /* The timer for lguest's clock needs initialization. */ init_clockdev(lg); @@ -260,18 +191,19 @@ unlock: /*L:010 The first operation the Launcher does must be a write. All writes * start with a 32 bit number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to get DMA buffers and send interrupts. */ -static ssize_t write(struct file *file, const char __user *input, + * writes of other values to send interrupts. */ +static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { /* Once the guest is initialized, we hold the "struct lguest" in the * file private data. */ struct lguest *lg = file->private_data; - u32 req; + const unsigned long __user *input = (const unsigned long __user *)in; + unsigned long req; if (get_user(req, input) != 0) return -EFAULT; - input += sizeof(req); + input++; /* If you haven't initialized, you must do that first. */ if (req != LHREQ_INITIALIZE && !lg) @@ -287,13 +219,11 @@ static ssize_t write(struct file *file, const char __user *input, switch (req) { case LHREQ_INITIALIZE: - return initialize(file, (const u32 __user *)input); - case LHREQ_GETDMA: - return user_get_dma(lg, (const u32 __user *)input); + return initialize(file, input); case LHREQ_IRQ: - return user_send_irq(lg, (const u32 __user *)input); + return user_send_irq(lg, input); case LHREQ_BREAK: - return break_guest_out(lg, (const u32 __user *)input); + return break_guest_out(lg, input); default: return -EINVAL; } @@ -319,8 +249,6 @@ static int close(struct inode *inode, struct file *file) mutex_lock(&lguest_lock); /* Cancels the hrtimer set via LHCALL_SET_CLOCKEVENT. */ hrtimer_cancel(&lg->hrt); - /* Free any DMA buffers the Guest had bound. */ - release_all_dma(lg); /* Free up the shadow page tables for the Guest. */ free_guest_pagetable(lg); /* Now all the memory cleanups are done, it's safe to release the diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index b7a924a..2a45f06 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -13,6 +13,7 @@ #include <linux/random.h> #include <linux/percpu.h> #include <asm/tlbflush.h> +#include <asm/uaccess.h> #include "lg.h" /*M:008 We hold reference to pages, which prevents them from being swapped. @@ -44,44 +45,32 @@ * (vii) Setting up the page tables initially. :*/ -/* Pages a 4k long, and each page table entry is 4 bytes long, giving us 1024 - * (or 2^10) entries per page. */ -#define PTES_PER_PAGE_SHIFT 10 -#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is * conveniently placed at the top 4MB, so it uses a separate, complete PTE * page. */ -#define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) +#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) /* We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this * CPU's guest to see the pages of any other CPU. */ -static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); +static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) /*H:320 With our shadow and Guest types established, we need to deal with * them: the page table code is curly enough to need helper functions to keep * it clear and clean. * - * The first helper takes a virtual address, and says which entry in the top - * level page table deals with that address. Since each top level entry deals - * with 4M, this effectively divides by 4M. */ -static unsigned vaddr_to_pgd_index(unsigned long vaddr) -{ - return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); -} - -/* There are two functions which return pointers to the shadow (aka "real") + * There are two functions which return pointers to the shadow (aka "real") * page tables. * * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry for that address. Since we keep track of several page * tables, the "i" argument tells us which one we're interested in (it's * usually the current one). */ -static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) +static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) { - unsigned int index = vaddr_to_pgd_index(vaddr); + unsigned int index = pgd_index(vaddr); /* We kill any Guest trying to touch the Switcher addresses. */ if (index >= SWITCHER_PGD_INDEX) { @@ -95,28 +84,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) /* This routine then takes the PGD entry given above, which contains the * address of the PTE page. It then returns a pointer to the PTE entry for the * given address. */ -static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) +static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) { - spte_t *page = __va(spgd.pfn << PAGE_SHIFT); + pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); /* You should never call this if the PGD entry wasn't valid */ - BUG_ON(!(spgd.flags & _PAGE_PRESENT)); - return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; + BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); + return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; } /* These two functions just like the above two, except they access the Guest * page tables. Hence they return a Guest address. */ static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) { - unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); - return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); + unsigned int index = vaddr >> (PGDIR_SHIFT); + return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); } static unsigned long gpte_addr(struct lguest *lg, - gpgd_t gpgd, unsigned long vaddr) + pgd_t gpgd, unsigned long vaddr) { - unsigned long gpage = gpgd.pfn << PAGE_SHIFT; - BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); - return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); + unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; + BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); + return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); } /*H:350 This routine takes a page number given by the Guest and converts it to @@ -149,53 +138,55 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page * number. */ -static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) +static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) { - spte_t spte; - unsigned long pfn; + unsigned long pfn, base, flags; /* The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually * use the global bit, so throw it away. */ - spte.flags = (gpte.flags & ~_PAGE_GLOBAL); + flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); + + /* The Guest's pages are offset inside the Launcher. */ + base = (unsigned long)lg->mem_base / PAGE_SIZE; /* We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the * page, given the virtual number. */ - pfn = get_pfn(gpte.pfn, write); + pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { - kill_guest(lg, "failed to get page %u", gpte.pfn); + kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); /* When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think * this one is valid! */ - spte.flags = 0; + flags = 0; } - /* Now we assign the page number, and our shadow PTE is complete. */ - spte.pfn = pfn; - return spte; + /* Now we assemble our shadow PTE from the page number and flags. */ + return pfn_pte(pfn, __pgprot(flags)); } /*H:460 And to complete the chain, release_pte() looks like this: */ -static void release_pte(spte_t pte) +static void release_pte(pte_t pte) { /* Remember that get_user_pages() took a reference to the page, in * get_pfn()? We have to put it back now. */ - if (pte.flags & _PAGE_PRESENT) - put_page(pfn_to_page(pte.pfn)); + if (pte_flags(pte) & _PAGE_PRESENT) + put_page(pfn_to_page(pte_pfn(pte))); } /*:*/ -static void check_gpte(struct lguest *lg, gpte_t gpte) +static void check_gpte(struct lguest *lg, pte_t gpte) { - if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) + if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) + || pte_pfn(gpte) >= lg->pfn_limit) kill_guest(lg, "bad page table entry"); } -static void check_gpgd(struct lguest *lg, gpgd_t gpgd) +static void check_gpgd(struct lguest *lg, pgd_t gpgd) { - if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) + if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) kill_guest(lg, "bad page directory entry"); } @@ -211,21 +202,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd) * true. */ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) { - gpgd_t gpgd; - spgd_t *spgd; + pgd_t gpgd; + pgd_t *spgd; unsigned long gpte_ptr; - gpte_t gpte; - spte_t *spte; + pte_t gpte; + pte_t *spte; /* First step: get the top-level Guest page table entry. */ - gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ - if (!(gpgd.flags & _PAGE_PRESENT)) + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) return 0; /* Now look at the matching shadow entry. */ spgd = spgd_addr(lg, lg->pgdidx, vaddr); - if (!(spgd->flags & _PAGE_PRESENT)) { + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); /* This is not really the Guest's fault, but killing it is @@ -238,34 +229,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) check_gpgd(lg, gpgd); /* And we copy the flags to the shadow PGD entry. The page * number in the shadow PGD is the page we just allocated. */ - spgd->raw.val = (__pa(ptepage) | gpgd.flags); + *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); } /* OK, now we look at the lower level in the Guest page table: keep its * address, because we might update it later. */ gpte_ptr = gpte_addr(lg, gpgd, vaddr); - gpte = mkgpte(lgread_u32(lg, gpte_ptr)); + gpte = lgread(lg, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ - if (!(gpte.flags & _PAGE_PRESENT)) + if (!(pte_flags(gpte) & _PAGE_PRESENT)) return 0; /* Check they're not trying to write to a page the Guest wants * read-only (bit 2 of errcode == write). */ - if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) + if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return 0; /* User access to a kernel page? (bit 3 == user access) */ - if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) + if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return 0; /* Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). */ check_gpte(lg, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ - gpte.flags |= _PAGE_ACCESSED; + gpte = pte_mkyoung(gpte); + if (errcode & 2) - gpte.flags |= _PAGE_DIRTY; + gpte = pte_mkdirty(gpte); /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(lg, *spgd, vaddr); @@ -275,21 +267,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) /* If this is a write, we insist that the Guest page is writable (the * final arg to gpte_to_spte()). */ - if (gpte.flags & _PAGE_DIRTY) + if (pte_dirty(gpte)) *spte = gpte_to_spte(lg, gpte, 1); - else { + else /* If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we come back here when a write does actually ocur, so we can * update the Guest's _PAGE_DIRTY flag. */ - gpte_t ro_gpte = gpte; - ro_gpte.flags &= ~_PAGE_RW; - *spte = gpte_to_spte(lg, ro_gpte, 0); - } + *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); /* Finally, we write the Guest PTE entry back: we've set the * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ - lgwrite_u32(lg, gpte_ptr, gpte.raw.val); + lgwrite(lg, gpte_ptr, pte_t, gpte); /* We succeeded in mapping the page! */ return 1; @@ -305,17 +294,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) * mapped by the shadow page tables, and is it writable? */ static int page_writable(struct lguest *lg, unsigned long vaddr) { - spgd_t *spgd; + pgd_t *spgd; unsigned long flags; /* Look at the top level entry: is it present? */ spgd = spgd_addr(lg, lg->pgdidx, vaddr); - if (!(spgd->flags & _PAGE_PRESENT)) + if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) return 0; /* Check the flags on the pte entry itself: it must be present and * writable. */ - flags = spte_addr(lg, *spgd, vaddr)->flags; + flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); + return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } @@ -329,22 +319,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr) } /*H:450 If we chase down the release_pgd() code, it looks like this: */ -static void release_pgd(struct lguest *lg, spgd_t *spgd) +static void release_pgd(struct lguest *lg, pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ - if (spgd->flags & _PAGE_PRESENT) { + if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; /* Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a * virtual address (easy for kernel pages like this one). */ - spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); + pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. */ - for (i = 0; i < PTES_PER_PAGE; i++) + for (i = 0; i < PTRS_PER_PTE; i++) release_pte(ptepage[i]); /* Now we can free the page of PTEs */ free_page((long)ptepage); /* And zero out the PGD entry we we never release it twice. */ - spgd->raw.val = 0; + *spgd = __pgd(0); } } @@ -356,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; /* Release every pgd entry up to the kernel's address. */ - for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) + for (i = 0; i < pgd_index(lg->kernel_address); i++) release_pgd(lg, lg->pgdirs[idx].pgdir + i); } @@ -369,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg) } /*:*/ +/* We walk down the guest page tables to get a guest-physical address */ +unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) +{ + pgd_t gpgd; + pte_t gpte; + + /* First step: get the top-level Guest page table entry. */ + gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); + /* Toplevel not present? We can't map it in. */ + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + kill_guest(lg, "Bad address %#lx", vaddr); + + gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); + if (!(pte_flags(gpte) & _PAGE_PRESENT)) + kill_guest(lg, "Bad address %#lx", vaddr); + + return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); +} + /* We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given * us. */ @@ -376,7 +385,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) - if (lg->pgdirs[i].cr3 == pgtable) + if (lg->pgdirs[i].gpgdir == pgtable) break; return i; } @@ -385,7 +394,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) * allocate a new one (and so the kernel parts are not there), we set * blank_pgdir. */ static unsigned int new_pgdir(struct lguest *lg, - unsigned long cr3, + unsigned long gpgdir, int *blank_pgdir) { unsigned int next; @@ -395,7 +404,7 @@ static unsigned int new_pgdir(struct lguest *lg, next = random32() % ARRAY_SIZE(lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!lg->pgdirs[next].pgdir) { - lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); + lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); /* If the allocation fails, just keep using the one we have */ if (!lg->pgdirs[next].pgdir) next = lg->pgdidx; @@ -405,7 +414,7 @@ static unsigned int new_pgdir(struct lguest *lg, *blank_pgdir = 1; } /* Record which Guest toplevel this shadows. */ - lg->pgdirs[next].cr3 = cr3; + lg->pgdirs[next].gpgdir = gpgdir; /* Release all the non-kernel mappings. */ flush_user_mappings(lg, next); @@ -472,26 +481,27 @@ void guest_pagetable_clear_all(struct lguest *lg) * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */ static void do_set_pte(struct lguest *lg, int idx, - unsigned long vaddr, gpte_t gpte) + unsigned long vaddr, pte_t gpte) { /* Look up the matching shadow page directot entry. */ - spgd_t *spgd = spgd_addr(lg, idx, vaddr); + pgd_t *spgd = spgd_addr(lg, idx, vaddr); /* If the top level isn't present, there's no entry to update. */ - if (spgd->flags & _PAGE_PRESENT) { + if (pgd_flags(*spgd) & _PAGE_PRESENT) { /* Otherwise, we start by releasing the existing entry. */ - spte_t *spte = spte_addr(lg, *spgd, vaddr); + pte_t *spte = spte_addr(lg, *spgd, vaddr); release_pte(*spte); /* If they're setting this entry as dirty or accessed, we might * as well put that entry they've given us in now. This shaves * 10% off a copy-on-write micro-benchmark. */ - if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(lg, gpte); - *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); + *spte = gpte_to_spte(lg, gpte, + pte_flags(gpte) & _PAGE_DIRTY); } else /* Otherwise we can demand_page() it in later. */ - spte->raw.val = 0; + *spte = __pte(0); } } @@ -506,18 +516,18 @@ static void do_set_pte(struct lguest *lg, int idx, * The benefit is that when we have to track a new page table, we can copy keep * all the kernel mappings. This speeds up context switch immensely. */ void guest_set_pte(struct lguest *lg, - unsigned long cr3, unsigned long vaddr, gpte_t gpte) + unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { /* Kernel mappings must be changed on all top levels. Slow, but * doesn't happen often. */ - if (vaddr >= lg->page_offset) { + if (vaddr >= lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) if (lg->pgdirs[i].pgdir) do_set_pte(lg, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ - int pgdir = find_pgdir(lg, cr3); + int pgdir = find_pgdir(lg, gpgdir); if (pgdir != ARRAY_SIZE(lg->pgdirs)) /* If so, do the update. */ do_set_pte(lg, pgdir, vaddr, gpte); @@ -538,7 +548,7 @@ void guest_set_pte(struct lguest *lg, * * So with that in mind here's our code to to update a (top-level) PGD entry: */ -void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) +void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) { int pgdir; @@ -548,7 +558,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) return; /* If they're talking about a page table we have a shadow for... */ - pgdir = find_pgdir(lg, cr3); + pgdir = find_pgdir(lg, gpgdir); if (pgdir < ARRAY_SIZE(lg->pgdirs)) /* ... throw it away. */ release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); @@ -560,21 +570,34 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) * its first page table is. We set some things up here: */ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) { - /* In flush_user_mappings() we loop from 0 to - * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit - * the Switcher mappings, so check that now. */ - if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) - return -EINVAL; /* We start on the first shadow page table, and give it a blank PGD * page. */ lg->pgdidx = 0; - lg->pgdirs[lg->pgdidx].cr3 = pgtable; - lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); + lg->pgdirs[lg->pgdidx].gpgdir = pgtable; + lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[lg->pgdidx].pgdir) return -ENOMEM; return 0; } +/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +void page_table_guest_data_init(struct lguest *lg) +{ + /* We get the kernel address: above this is all kernel memory. */ + if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) + /* We tell the Guest that it can't use the top 4MB of virtual + * addresses used by the Switcher. */ + || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) + || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) + kill_guest(lg, "bad guest page %p", lg->lguest_data); + + /* In flush_user_mappings() we loop from 0 to + * "pgd_index(lg->kernel_address)". This assumes it won't hit the + * Switcher mappings, so check that now. */ + if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) + kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); +} + /* When a Guest dies, our cleanup is fairly simple. */ void free_guest_pagetable(struct lguest *lg) { @@ -594,14 +617,14 @@ void free_guest_pagetable(struct lguest *lg) * for each CPU already set up, we just need to hook them in. */ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) { - spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); - spgd_t switcher_pgd; - spte_t regs_pte; + pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); + pgd_t switcher_pgd; + pte_t regs_pte; /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ - switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; - switcher_pgd.flags = _PAGE_KERNEL; + switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); + lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; /* We also change the Switcher PTE page. When we're running the Guest, @@ -611,10 +634,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out * again. */ - regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; - regs_pte.flags = _PAGE_KERNEL; - switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] - = regs_pte; + regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); + switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; } /*:*/ @@ -635,24 +656,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu, unsigned int pages) { unsigned int i; - spte_t *pte = switcher_pte_page(cpu); + pte_t *pte = switcher_pte_page(cpu); /* The first entries are easy: they map the Switcher code. */ for (i = 0; i < pages; i++) { - pte[i].pfn = page_to_pfn(switcher_page[i]); - pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; + pte[i] = mk_pte(switcher_page[i], + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); } /* The only other thing we map is this CPU's pair of pages. */ i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ - pte[i].pfn = page_to_pfn(switcher_page[i]); - pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; + pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); + /* The second page contains the "struct lguest_ro_state", and is * read-only. */ - pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); - pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; + pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), + __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); } /*H:510 At boot or module load time, init_pagetables() allocates and populates @@ -662,7 +684,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) unsigned int i; for_each_possible_cpu(i) { - switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); + switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!switcher_pte_page(i)) { free_switcher_pte_pages(); return -ENOMEM; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 9b81119..c2434ec 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -73,14 +73,14 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end) /* Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's * running at privilege level 1. If so, we fix it here. */ - if ((lg->gdt[i].b & 0x00006000) == 0) - lg->gdt[i].b |= (GUEST_PL << 13); + if ((lg->arch.gdt[i].b & 0x00006000) == 0) + lg->arch.gdt[i].b |= (GUEST_PL << 13); /* Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't * writable by the Guest, so bad things can happen. */ - lg->gdt[i].b |= 0x00000100; + lg->arch.gdt[i].b |= 0x00000100; } } @@ -106,12 +106,12 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) void setup_guest_gdt(struct lguest *lg) { /* Start with full 0-4G segments... */ - lg->gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; - lg->gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; + lg->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; + lg->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; /* ...except the Guest is allowed to use them, so set the privilege * level appropriately in the flags. */ - lg->gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); - lg->gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); + lg->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); + lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } /* Like the IDT, we never simply use the GDT the Guest gives us. We set up the @@ -126,7 +126,7 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) unsigned int i; for (i = GDT_ENTRY_TLS_MIN; i <= GDT_ENTRY_TLS_MAX; i++) - gdt[i] = lg->gdt[i]; + gdt[i] = lg->arch.gdt[i]; } /* This is the full version */ @@ -138,7 +138,7 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) * replaced. See ignored_gdt() above. */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) - gdt[i] = lg->gdt[i]; + gdt[i] = lg->arch.gdt[i]; } /* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ @@ -146,12 +146,12 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) { /* We assume the Guest has the same number of GDT entries as the * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ - if (num > ARRAY_SIZE(lg->gdt)) + if (num > ARRAY_SIZE(lg->arch.gdt)) kill_guest(lg, "too many gdt entries %i", num); /* We read the whole thing in, then fix it up. */ - lgread(lg, lg->gdt, table, num * sizeof(lg->gdt[0])); - fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->gdt)); + __lgread(lg, lg->arch.gdt, table, num * sizeof(lg->arch.gdt[0])); + fixup_gdt_table(lg, 0, ARRAY_SIZE(lg->arch.gdt)); /* Mark that the GDT changed so the core knows it has to copy it again, * even if the Guest is run on the same CPU. */ lg->changed |= CHANGED_GDT; @@ -159,9 +159,9 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) void guest_load_tls(struct lguest *lg, unsigned long gtls) { - struct desc_struct *tls = &lg->gdt[GDT_ENTRY_TLS_MIN]; + struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; - lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); + __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); lg->changed |= CHANGED_GDT_TLS; } diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c new file mode 100644 index 0000000..9eed12d --- /dev/null +++ b/drivers/lguest/x86/core.c @@ -0,0 +1,577 @@ +/* + * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. + * Copyright (C) 2007, Jes Sorensen <jes@sgi.com> SGI. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include <linux/kernel.h> +#include <linux/start_kernel.h> +#include <linux/string.h> +#include <linux/console.h> +#include <linux/screen_info.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/cpu.h> +#include <linux/lguest.h> +#include <linux/lguest_launcher.h> +#include <asm/paravirt.h> +#include <asm/param.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/desc.h> +#include <asm/setup.h> +#include <asm/lguest.h> +#include <asm/uaccess.h> +#include <asm/i387.h> +#include "../lg.h" + +static int cpu_had_pge; + +static struct { + unsigned long offset; + unsigned short segment; +} lguest_entry; + +/* Offset from where switcher.S was compiled to where we've copied it */ +static unsigned long switcher_offset(void) +{ + return SWITCHER_ADDR - (unsigned long)start_switcher_text; +} + +/* This cpu's struct lguest_pages. */ +static struct lguest_pages *lguest_pages(unsigned int cpu) +{ + return &(((struct lguest_pages *) + (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]); +} + +static DEFINE_PER_CPU(struct lguest *, last_guest); + +/*S:010 + * We are getting close to the Switcher. + * + * Remember that each CPU has two pages which are visible to the Guest when it + * runs on that CPU. This has to contain the state for that Guest: we copy the + * state in just before we run the Guest. + * + * Each Guest has "changed" flags which indicate what has changed in the Guest + * since it last ran. We saw this set in interrupts_and_traps.c and + * segments.c. + */ +static void copy_in_guest_info(struct lguest *lg, struct lguest_pages *pages) +{ + /* Copying all this data can be quite expensive. We usually run the + * same Guest we ran last time (and that Guest hasn't run anywhere else + * meanwhile). If that's not the case, we pretend everything in the + * Guest has changed. */ + if (__get_cpu_var(last_guest) != lg || lg->last_pages != pages) { + __get_cpu_var(last_guest) = lg; + lg->last_pages = pages; + lg->changed = CHANGED_ALL; + } + + /* These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. */ + pages->state.host_cr3 = __pa(current->mm->pgd); + /* Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). */ + map_switcher_in_guest(lg, pages); + /* Set up the two "TSS" members which tell the CPU what stack to use + * for traps which do directly into the Guest (ie. traps at privilege + * level 1). */ + pages->state.guest_tss.esp1 = lg->esp1; + pages->state.guest_tss.ss1 = lg->ss1; + + /* Copy direct-to-Guest trap entries. */ + if (lg->changed & CHANGED_IDT) + copy_traps(lg, pages->state.guest_idt, default_idt_entries); + + /* Copy all GDT entries which the Guest can change. */ + if (lg->changed & CHANGED_GDT) + copy_gdt(lg, pages->state.guest_gdt); + /* If only the TLS entries have changed, copy them. */ + else if (lg->changed & CHANGED_GDT_TLS) + copy_gdt_tls(lg, pages->state.guest_gdt); + + /* Mark the Guest as unchanged for next time. */ + lg->changed = 0; +} + +/* Finally: the code to actually call into the Switcher to run the Guest. */ +static void run_guest_once(struct lguest *lg, struct lguest_pages *pages) +{ + /* This is a dummy value we need for GCC's sake. */ + unsigned int clobber; + + /* Copy the guest-specific information into this CPU's "struct + * lguest_pages". */ + copy_in_guest_info(lg, pages); + + /* Set the trap number to 256 (impossible value). If we fault while + * switching to the Guest (bad segment registers or bug), this will + * cause us to abort the Guest. */ + lg->regs->trapnum = 256; + + /* Now: we push the "eflags" register on the stack, then do an "lcall". + * This is how we change from using the kernel code segment to using + * the dedicated lguest code segment, as well as jumping into the + * Switcher. + * + * The lcall also pushes the old code segment (KERNEL_CS) onto the + * stack, then the address of this call. This stack layout happens to + * exactly match the stack of an interrupt... */ + asm volatile("pushf; lcall *lguest_entry" + /* This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. */ + : "=a"(clobber), "=b"(clobber) + /* %eax contains the pages pointer. ("0" refers to the + * 0-th argument above, ie "a"). %ebx contains the + * physical address of the Guest's top-level page + * directory. */ + : "0"(pages), "1"(__pa(lg->pgdirs[lg->pgdidx].pgdir)) + /* We tell gcc that all these registers could change, + * which means we don't have to save and restore them in + * the Switcher. */ + : "memory", "%edx", "%ecx", "%edi", "%esi"); +} +/*:*/ + +/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. */ +void lguest_arch_run_guest(struct lguest *lg) +{ + /* Remember the awfully-named TS bit? If the Guest has asked + * to set it we set it now, so we can trap and pass that trap + * to the Guest if it uses the FPU. */ + if (lg->ts) + lguest_set_ts(); + + /* SYSENTER is an optimized way of doing system calls. We + * can't allow it because it always jumps to privilege level 0. + * A normal Guest won't try it because we don't advertise it in + * CPUID, but a malicious Guest (or malicious Guest userspace + * program) could, so we tell the CPU to disable it before + * running the Guest. */ + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); + + /* Now we actually run the Guest. It will pop back out when + * something interesting happens, and we can examine its + * registers to see what it was doing. */ + run_guest_once(lg, lguest_pages(raw_smp_processor_id())); + + /* The "regs" pointer contains two extra entries which are not + * really registers: a trap number which says what interrupt or + * trap made the switcher code come back, and an error code + * which some traps set. */ + + /* If the Guest page faulted, then the cr2 register will tell + * us the bad virtual address. We have to grab this now, + * because once we re-enable interrupts an interrupt could + * fault and thus overwrite cr2, or we could even move off to a + * different CPU. */ + if (lg->regs->trapnum == 14) + lg->arch.last_pagefault = read_cr2(); + /* Similarly, if we took a trap because the Guest used the FPU, + * we have to restore the FPU it expects to see. */ + else if (lg->regs->trapnum == 7) + math_state_restore(); + + /* Restore SYSENTER if it's supposed to be on. */ + if (boot_cpu_has(X86_FEATURE_SEP)) + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); +} + +/*H:130 Our Guest is usually so well behaved; it never tries to do things it + * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't + * quite complete, because it doesn't contain replacements for the Intel I/O + * instructions. As a result, the Guest sometimes fumbles across one during + * the boot process as it probes for various things which are usually attached + * to a PC. + * + * When the Guest uses one of these instructions, we get trap #13 (General + * Protection Fault) and come here. We see if it's one of those troublesome + * instructions and skip over it. We return true if we did. */ +static int emulate_insn(struct lguest *lg) +{ + u8 insn; + unsigned int insnlen = 0, in = 0, shift = 0; + /* The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. */ + unsigned long physaddr = guest_pa(lg, lg->regs->eip); + + /* This must be the Guest kernel trying to do something, not userspace! + * The bottom two bits of the CS segment register are the privilege + * level. */ + if ((lg->regs->cs & 3) != GUEST_PL) + return 0; + + /* Decoding x86 instructions is icky. */ + insn = lgread(lg, physaddr, u8); + + /* 0x66 is an "operand prefix". It means it's using the upper 16 bits + of the eax register. */ + if (insn == 0x66) { + shift = 16; + /* The instruction is 1 byte so far, read the next byte. */ + insnlen = 1; + insn = lgread(lg, physaddr + insnlen, u8); + } + + /* We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. */ + switch (insn & 0xFE) { + case 0xE4: /* in <next byte>,%al */ + insnlen += 2; + in = 1; + break; + case 0xEC: /* in (%dx),%al */ + insnlen += 1; + in = 1; + break; + case 0xE6: /* out %al,<next byte> */ + insnlen += 2; + break; + case 0xEE: /* out %al,(%dx) */ + insnlen += 1; + break; + default: + /* OK, we don't know what this is, can't emulate. */ + return 0; + } + + /* If it was an "IN" instruction, they expect the result to be read + * into %eax, so we change %eax. We always return all-ones, which + * traditionally means "there's nothing there". */ + if (in) { + /* Lower bit tells is whether it's a 16 or 32 bit access */ + if (insn & 0x1) + lg->regs->eax = 0xFFFFFFFF; + else + lg->regs->eax |= (0xFFFF << shift); + } + /* Finally, we've "done" the instruction, so move past it. */ + lg->regs->eip += insnlen; + /* Success! */ + return 1; +} + +/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ +void lguest_arch_handle_trap(struct lguest *lg) +{ + switch (lg->regs->trapnum) { + case 13: /* We've intercepted a GPF. */ + /* Check if this was one of those annoying IN or OUT + * instructions which we need to emulate. If so, we + * just go back into the Guest after we've done it. */ + if (lg->regs->errcode == 0) { + if (emulate_insn(lg)) + return; + } + break; + case 14: /* We've intercepted a page fault. */ + /* The Guest accessed a virtual address that wasn't + * mapped. This happens a lot: we don't actually set + * up most of the page tables for the Guest at all when + * we start: as it runs it asks for more and more, and + * we set them up as required. In this case, we don't + * even tell the Guest that the fault happened. + * + * The errcode tells whether this was a read or a + * write, and whether kernel or userspace code. */ + if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) + return; + + /* OK, it's really not there (or not OK): the Guest + * needs to know. We write out the cr2 value so it + * knows where the fault occurred. + * + * Note that if the Guest were really messed up, this + * could happen before it's done the INITIALIZE + * hypercall, so lg->lguest_data will be NULL */ + if (lg->lguest_data && + put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) + kill_guest(lg, "Writing cr2"); + break; + case 7: /* We've intercepted a Device Not Available fault. */ + /* If the Guest doesn't want to know, we already + * restored the Floating Point Unit, so we just + * continue without telling it. */ + if (!lg->ts) + return; + break; + case 32 ... 255: + /* These values mean a real interrupt occurred, in which case + * the Host handler has already been run. We just do a + * friendly check if another process should now be run, then + * return to run the Guest again */ + cond_resched(); + return; + case LGUEST_TRAP_ENTRY: + /* Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. */ + lg->hcall = (struct hcall_args *)lg->regs; + return; + } + + /* We didn't handle the trap, so it needs to go to the Guest. */ + if (!deliver_trap(lg, lg->regs->trapnum)) + /* If the Guest doesn't have a handler (either it hasn't + * registered any yet, or it's one of the faults we don't let + * it handle), it dies with a cryptic error message. */ + kill_guest(lg, "unhandled trap %li at %#lx (%#lx)", + lg->regs->trapnum, lg->regs->eip, + lg->regs->trapnum == 14 ? lg->arch.last_pagefault + : lg->regs->errcode); +} + +/* Now we can look at each of the routines this calls, in increasing order of + * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), + * deliver_trap() and demand_page(). After all those, we'll be ready to + * examine the Switcher, and our philosophical understanding of the Host/Guest + * duality will be complete. :*/ +static void adjust_pge(void *on) +{ + if (on) + write_cr4(read_cr4() | X86_CR4_PGE); + else + write_cr4(read_cr4() & ~X86_CR4_PGE); +} + +/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. */ +void __init lguest_arch_host_init(void) +{ + int i; + + /* Most of the i386/switcher.S doesn't care that it's been moved; on + * Intel, jumps are relative, and it doesn't access any references to + * external code or data. + * + * The only exception is the interrupt handlers in switcher.S: their + * addresses are placed in a table (default_idt_entries), so we need to + * update the table with the new addresses. switcher_offset() is a + * convenience function which returns the distance between the builtin + * switcher code and the high-mapped copy we just made. */ + for (i = 0; i < IDT_ENTRIES; i++) + default_idt_entries[i] += switcher_offset(); + + /* + * Set up the Switcher's per-cpu areas. + * + * Each CPU gets two pages of its own within the high-mapped region + * (aka. "struct lguest_pages"). Much of this can be initialized now, + * but some depends on what Guest we are running (which is set up in + * copy_in_guest_info()). + */ + for_each_possible_cpu(i) { + /* lguest_pages() returns this CPU's two pages. */ + struct lguest_pages *pages = lguest_pages(i); + /* This is a convenience pointer to make the code fit one + * statement to a line. */ + struct lguest_ro_state *state = &pages->state; + + /* The Global Descriptor Table: the Host has a different one + * for each CPU. We keep a descriptor for the GDT which says + * where it is and how big it is (the size is actually the last + * byte, not the size, hence the "-1"). */ + state->host_gdt_desc.size = GDT_SIZE-1; + state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); + + /* All CPUs on the Host use the same Interrupt Descriptor + * Table, so we just use store_idt(), which gets this CPU's IDT + * descriptor. */ + store_idt(&state->host_idt_desc); + + /* The descriptors for the Guest's GDT and IDT can be filled + * out now, too. We copy the GDT & IDT into ->guest_gdt and + * ->guest_idt before actually running the Guest. */ + state->guest_idt_desc.size = sizeof(state->guest_idt)-1; + state->guest_idt_desc.address = (long)&state->guest_idt; + state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; + state->guest_gdt_desc.address = (long)&state->guest_gdt; + + /* We know where we want the stack to be when the Guest enters + * the switcher: in pages->regs. The stack grows upwards, so + * we start it at the end of that structure. */ + state->guest_tss.esp0 = (long)(&pages->regs + 1); + /* And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. */ + state->guest_tss.ss0 = LGUEST_DS; + + /* x86 can have a finegrained bitmap which indicates what I/O + * ports the process can use. We set it to the end of our + * structure, meaning "none". */ + state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); + + /* Some GDT entries are the same across all Guests, so we can + * set them up now. */ + setup_default_gdt_entries(state); + /* Most IDT entries are the same for all Guests, too.*/ + setup_default_idt_entries(state, default_idt_entries); + + /* The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. */ + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + } + + /* In the Switcher, we want the %cs segment register to use the + * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so + * it will be undisturbed when we switch. To change %cs and jump we + * need this structure to feed to Intel's "lcall" instruction. */ + lguest_entry.offset = (long)switch_to_guest + switcher_offset(); + lguest_entry.segment = LGUEST_CS; + + /* Finally, we need to turn off "Page Global Enable". PGE is an + * optimization where page table entries are specially marked to show + * they never change. The Host kernel marks all the kernel pages this + * way because it's always present, even when userspace is running. + * + * Lguest breaks this: unbeknownst to the rest of the Host kernel, we + * switch to the Guest kernel. If you don't disable this on all CPUs, + * you'll get really weird bugs that you'll chase for two days. + * + * I used to turn PGE off every time we switched to the Guest and back + * on when we return, but that slowed the Switcher down noticibly. */ + + /* We don't need the complexity of CPUs coming and going while we're + * doing this. */ + lock_cpu_hotplug(); + if (cpu_has_pge) { /* We have a broader idea of "global". */ + /* Remember that this was originally set (for cleanup). */ + cpu_had_pge = 1; + /* adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). */ + on_each_cpu(adjust_pge, (void *)0, 0, 1); + /* Turn off the feature in the global feature set. */ + clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + } + unlock_cpu_hotplug(); +}; +/*:*/ + +void __exit lguest_arch_host_fini(void) +{ + /* If we had PGE before we started, turn it back on now. */ + lock_cpu_hotplug(); + if (cpu_had_pge) { + set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + /* adjust_pge's argument "1" means set PGE. */ + on_each_cpu(adjust_pge, (void *)1, 0, 1); + } + unlock_cpu_hotplug(); +} + + +/*H:122 The i386-specific hypercalls simply farm out to the right functions. */ +int lguest_arch_do_hcall(struct lguest *lg, struct hcall_args *args) +{ + switch (args->arg0) { + case LHCALL_LOAD_GDT: + load_guest_gdt(lg, args->arg1, args->arg2); + break; + case LHCALL_LOAD_IDT_ENTRY: + load_guest_idt_entry(lg, args->arg1, args->arg2, args->arg3); + break; + case LHCALL_LOAD_TLS: + guest_load_tls(lg, args->arg1); + break; + default: + /* Bad Guest. Bad! */ + return -EIO; + } + return 0; +} + +/*H:126 i386-specific hypercall initialization: */ +int lguest_arch_init_hypercalls(struct lguest *lg) +{ + u32 tsc_speed; + + /* The pointer to the Guest's "struct lguest_data" is the only + * argument. We check that address now. */ + if (!lguest_address_ok(lg, lg->hcall->arg1, sizeof(*lg->lguest_data))) + return -EFAULT; + + /* Having checked it, we simply set lg->lguest_data to point straight + * into the Launcher's memory at the right place and then use + * copy_to_user/from_user from now on, instead of lgread/write. I put + * this in to show that I'm not immune to writing stupid + * optimizations. */ + lg->lguest_data = lg->mem_base + lg->hcall->arg1; + + /* We insist that the Time Stamp Counter exist and doesn't change with + * cpu frequency. Some devious chip manufacturers decided that TSC + * changes could be handled in software. I decided that time going + * backwards might be good for benchmarks, but it's bad for users. + * + * We also insist that the TSC be stable: the kernel detects unreliable + * TSCs for its own purposes, and we use that here. */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) + tsc_speed = tsc_khz; + else + tsc_speed = 0; + if (put_user(tsc_speed, &lg->lguest_data->tsc_khz)) + return -EFAULT; + + /* The interrupt code might not like the system call vector. */ + if (!check_syscall_vector(lg)) + kill_guest(lg, "bad syscall vector"); + + return 0; +} +/* Now we've examined the hypercall code; our Guest can make requests. There + * is one other way we can do things for the Guest, as we see in + * emulate_insn(). :*/ + +/*L:030 lguest_arch_setup_regs() + * + * Most of the Guest's registers are left alone: we used get_zeroed_page() to + * allocate the structure, so they will be 0. */ +void lguest_arch_setup_regs(struct lguest *lg, unsigned long start) +{ + struct lguest_regs *regs = lg->regs; + + /* There are four "segment" registers which the Guest needs to boot: + * The "code segment" register (cs) refers to the kernel code segment + * __KERNEL_CS, and the "data", "extra" and "stack" segment registers + * refer to the kernel data segment __KERNEL_DS. + * + * The privilege level is packed into the lower bits. The Guest runs + * at privilege level 1 (GUEST_PL).*/ + regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; + regs->cs = __KERNEL_CS|GUEST_PL; + + /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + * is supposed to always be "1". Bit 9 (0x200) controls whether + * interrupts are enabled. We always leave interrupts enabled while + * running the Guest. */ + regs->eflags = 0x202; + + /* The "Extended Instruction Pointer" register says where the Guest is + * running. */ + regs->eip = start; + + /* %esi points to our boot information, at physical address 0, so don't + * touch it. */ + /* There are a couple of GDT entries the Guest expects when first + * booting. */ + + setup_guest_gdt(lg); +} diff --git a/drivers/lguest/switcher.S b/drivers/lguest/x86/switcher_32.S index 7c9c230..1010b90 100644 --- a/drivers/lguest/switcher.S +++ b/drivers/lguest/x86/switcher_32.S @@ -48,7 +48,8 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/page.h> -#include "lg.h" +#include <asm/segment.h> +#include <asm/lguest.h> // We mark the start of the code to copy // It's placed in .text tho it's never run here @@ -132,6 +133,7 @@ ENTRY(switch_to_guest) // The Guest's register page has been mapped // Writable onto our %esp (stack) -- // We can simply pop off all Guest regs. + popl %eax popl %ebx popl %ecx popl %edx @@ -139,7 +141,6 @@ ENTRY(switch_to_guest) popl %edi popl %ebp popl %gs - popl %eax popl %fs popl %ds popl %es @@ -167,7 +168,6 @@ ENTRY(switch_to_guest) pushl %es; \ pushl %ds; \ pushl %fs; \ - pushl %eax; \ pushl %gs; \ pushl %ebp; \ pushl %edi; \ @@ -175,6 +175,7 @@ ENTRY(switch_to_guest) pushl %edx; \ pushl %ecx; \ pushl %ebx; \ + pushl %eax; \ /* Our stack and our code are using segments \ * Set in the TSS and IDT \ * Yet if we were to touch data we'd use \ diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 927cb34..7c426d0 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -274,7 +274,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) if (bitmap->offset < 0) { /* DATA BITMAP METADATA */ if (bitmap->offset - + page->index * (PAGE_SIZE/512) + + (long)(page->index * (PAGE_SIZE/512)) + size/512 > 0) /* bitmap runs in to metadata */ return -EINVAL; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8ee181a..80a67d7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -376,7 +376,12 @@ static unsigned long get_stripe_work(struct stripe_head *sh) ack++; sh->ops.count -= ack; - BUG_ON(sh->ops.count < 0); + if (unlikely(sh->ops.count < 0)) { + printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " + "ops.complete: %#lx\n", pending, sh->ops.pending, + sh->ops.ack, sh->ops.complete); + BUG(); + } return pending; } @@ -550,8 +555,7 @@ static void ops_complete_biofill(void *stripe_head_ref) } } } - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); return_io(return_bi); @@ -2893,6 +2897,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ + /* clean-up completed biofill operations */ + if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + } + rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; diff --git a/drivers/media/video/ivtv/ivtv-driver.h b/drivers/media/video/ivtv/ivtv-driver.h index 3bda1df..49ce14d 100644 --- a/drivers/media/video/ivtv/ivtv-driver.h +++ b/drivers/media/video/ivtv/ivtv-driver.h @@ -51,6 +51,7 @@ #include <linux/unistd.h> #include <linux/byteorder/swab.h> #include <linux/pagemap.h> +#include <linux/scatterlist.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <asm/uaccess.h> diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c index 0a18286..9ab94a7 100644 --- a/drivers/media/video/videobuf-dma-sg.c +++ b/drivers/media/video/videobuf-dma-sg.c @@ -27,6 +27,7 @@ #include <linux/pci.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> +#include <linux/scatterlist.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h index 000e6a9..0f39c49 100644 --- a/drivers/mmc/host/mmci.h +++ b/drivers/mmc/host/mmci.h @@ -169,7 +169,7 @@ static inline char *mmci_kmap_atomic(struct mmci_host *host, unsigned long *flag struct scatterlist *sg = host->sg_ptr; local_irq_save(*flags); - return kmap_atomic(sg->page, KM_BIO_SRC_IRQ) + sg->offset; + return kmap_atomic(sg_page(sg), KM_BIO_SRC_IRQ) + sg->offset; } static inline void mmci_kunmap_atomic(struct mmci_host *host, void *buffer, unsigned long *flags) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 0db837e..d7c5b94 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -13,6 +13,7 @@ #include <linux/highmem.h> #include <linux/pci.h> #include <linux/dma-mapping.h> +#include <linux/scatterlist.h> #include <linux/mmc/host.h> diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index 3aa3dca..a9eb1c5 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -85,6 +85,7 @@ static int cfi_intelext_point (struct mtd_info *mtd, loff_t from, size_t len, static void cfi_intelext_unpoint (struct mtd_info *mtd, u_char *addr, loff_t from, size_t len); +static int chip_ready (struct map_info *map, struct flchip *chip, unsigned long adr, int mode); static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr, int mode); static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr); #include "fwh_lock.h" @@ -641,73 +642,13 @@ static int cfi_intelext_partition_fixup(struct mtd_info *mtd, /* * *********** CHIP ACCESS FUNCTIONS *********** */ - -static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr, int mode) +static int chip_ready (struct map_info *map, struct flchip *chip, unsigned long adr, int mode) { DECLARE_WAITQUEUE(wait, current); struct cfi_private *cfi = map->fldrv_priv; map_word status, status_OK = CMD(0x80), status_PWS = CMD(0x01); - unsigned long timeo; struct cfi_pri_intelext *cfip = cfi->cmdset_priv; - - resettime: - timeo = jiffies + HZ; - retry: - if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING || mode == FL_OTP_WRITE || mode == FL_SHUTDOWN)) { - /* - * OK. We have possibility for contension on the write/erase - * operations which are global to the real chip and not per - * partition. So let's fight it over in the partition which - * currently has authority on the operation. - * - * The rules are as follows: - * - * - any write operation must own shared->writing. - * - * - any erase operation must own _both_ shared->writing and - * shared->erasing. - * - * - contension arbitration is handled in the owner's context. - * - * The 'shared' struct can be read and/or written only when - * its lock is taken. - */ - struct flchip_shared *shared = chip->priv; - struct flchip *contender; - spin_lock(&shared->lock); - contender = shared->writing; - if (contender && contender != chip) { - /* - * The engine to perform desired operation on this - * partition is already in use by someone else. - * Let's fight over it in the context of the chip - * currently using it. If it is possible to suspend, - * that other partition will do just that, otherwise - * it'll happily send us to sleep. In any case, when - * get_chip returns success we're clear to go ahead. - */ - int ret = spin_trylock(contender->mutex); - spin_unlock(&shared->lock); - if (!ret) - goto retry; - spin_unlock(chip->mutex); - ret = get_chip(map, contender, contender->start, mode); - spin_lock(chip->mutex); - if (ret) { - spin_unlock(contender->mutex); - return ret; - } - timeo = jiffies + HZ; - spin_lock(&shared->lock); - spin_unlock(contender->mutex); - } - - /* We now own it */ - shared->writing = chip; - if (mode == FL_ERASING) - shared->erasing = chip; - spin_unlock(&shared->lock); - } + unsigned long timeo = jiffies + HZ; switch (chip->state) { @@ -722,16 +663,11 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr if (chip->priv && map_word_andequal(map, status, status_PWS, status_PWS)) break; - if (time_after(jiffies, timeo)) { - printk(KERN_ERR "%s: Waiting for chip to be ready timed out. Status %lx\n", - map->name, status.x[0]); - return -EIO; - } spin_unlock(chip->mutex); cfi_udelay(1); spin_lock(chip->mutex); /* Someone else might have been playing with it. */ - goto retry; + return -EAGAIN; } case FL_READY: @@ -809,10 +745,82 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr schedule(); remove_wait_queue(&chip->wq, &wait); spin_lock(chip->mutex); - goto resettime; + return -EAGAIN; } } +static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr, int mode) +{ + int ret; + + retry: + if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING + || mode == FL_OTP_WRITE || mode == FL_SHUTDOWN)) { + /* + * OK. We have possibility for contention on the write/erase + * operations which are global to the real chip and not per + * partition. So let's fight it over in the partition which + * currently has authority on the operation. + * + * The rules are as follows: + * + * - any write operation must own shared->writing. + * + * - any erase operation must own _both_ shared->writing and + * shared->erasing. + * + * - contention arbitration is handled in the owner's context. + * + * The 'shared' struct can be read and/or written only when + * its lock is taken. + */ + struct flchip_shared *shared = chip->priv; + struct flchip *contender; + spin_lock(&shared->lock); + contender = shared->writing; + if (contender && contender != chip) { + /* + * The engine to perform desired operation on this + * partition is already in use by someone else. + * Let's fight over it in the context of the chip + * currently using it. If it is possible to suspend, + * that other partition will do just that, otherwise + * it'll happily send us to sleep. In any case, when + * get_chip returns success we're clear to go ahead. + */ + ret = spin_trylock(contender->mutex); + spin_unlock(&shared->lock); + if (!ret) + goto retry; + spin_unlock(chip->mutex); + ret = chip_ready(map, contender, contender->start, mode); + spin_lock(chip->mutex); + + if (ret == -EAGAIN) { + spin_unlock(contender->mutex); + goto retry; + } + if (ret) { + spin_unlock(contender->mutex); + return ret; + } + spin_lock(&shared->lock); + spin_unlock(contender->mutex); + } + + /* We now own it */ + shared->writing = chip; + if (mode == FL_ERASING) + shared->erasing = chip; + spin_unlock(&shared->lock); + } + ret = chip_ready(map, chip, adr, mode); + if (ret == -EAGAIN) + goto retry; + + return ret; +} + static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr) { struct cfi_private *cfi = map->fldrv_priv; diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index 8f9c3ba..246d451 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -300,7 +300,7 @@ config MTD_NAND_PLATFORM via platform_data. config MTD_ALAUDA - tristate "MTD driver for Olympus MAUSB-10 and Fijufilm DPC-R1" + tristate "MTD driver for Olympus MAUSB-10 and Fujifilm DPC-R1" depends on MTD_NAND && USB help These two (and possibly other) Alauda-based cardreaders for diff --git a/drivers/mtd/nand/diskonchip.c b/drivers/mtd/nand/diskonchip.c index ab9f5c5..0e72153 100644 --- a/drivers/mtd/nand/diskonchip.c +++ b/drivers/mtd/nand/diskonchip.c @@ -220,7 +220,7 @@ static int doc_ecc_decode(struct rs_control *rs, uint8_t *data, uint8_t *ecc) } } /* If the parity is wrong, no rescue possible */ - return parity ? -1 : nerr; + return parity ? -EBADMSG : nerr; } static void DoC_Delay(struct doc_priv *doc, unsigned short cycles) @@ -1034,7 +1034,7 @@ static int doc200x_correct_data(struct mtd_info *mtd, u_char *dat, WriteDOC(DOC_ECC_DIS, docptr, Mplus_ECCConf); else WriteDOC(DOC_ECC_DIS, docptr, ECCConf); - if (no_ecc_failures && (ret == -1)) { + if (no_ecc_failures && (ret == -EBADMSG)) { printk(KERN_ERR "suppressing ECC failure\n"); ret = 0; } diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index b4e0e77..e29c1da 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -789,7 +789,7 @@ static int nand_read_page_swecc(struct mtd_info *mtd, struct nand_chip *chip, int stat; stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]); - if (stat == -1) + if (stat < 0) mtd->ecc_stats.failed++; else mtd->ecc_stats.corrected += stat; @@ -833,7 +833,7 @@ static int nand_read_page_hwecc(struct mtd_info *mtd, struct nand_chip *chip, int stat; stat = chip->ecc.correct(mtd, p, &ecc_code[i], &ecc_calc[i]); - if (stat == -1) + if (stat < 0) mtd->ecc_stats.failed++; else mtd->ecc_stats.corrected += stat; @@ -874,7 +874,7 @@ static int nand_read_page_syndrome(struct mtd_info *mtd, struct nand_chip *chip, chip->read_buf(mtd, oob, eccbytes); stat = chip->ecc.correct(mtd, p, oob, NULL); - if (stat == -1) + if (stat < 0) mtd->ecc_stats.failed++; else mtd->ecc_stats.corrected += stat; diff --git a/drivers/mtd/nand/nand_ecc.c b/drivers/mtd/nand/nand_ecc.c index fde593e..9003a13 100644 --- a/drivers/mtd/nand/nand_ecc.c +++ b/drivers/mtd/nand/nand_ecc.c @@ -189,7 +189,7 @@ int nand_correct_data(struct mtd_info *mtd, u_char *dat, if(countbits(s0 | ((uint32_t)s1 << 8) | ((uint32_t)s2 <<16)) == 1) return 1; - return -1; + return -EBADMSG; } EXPORT_SYMBOL(nand_correct_data); diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index a757480..10490b4 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -511,7 +511,7 @@ static int init_nandsim(struct mtd_info *mtd) } if (ns->options & OPT_SMALLPAGE) { - if (ns->geom.totsz < (64 << 20)) { + if (ns->geom.totsz < (32 << 20)) { ns->geom.pgaddrbytes = 3; ns->geom.secaddrbytes = 2; } else { diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c index 21b921d..66f76e9 100644 --- a/drivers/mtd/nand/s3c2410.c +++ b/drivers/mtd/nand/s3c2410.c @@ -488,12 +488,24 @@ static void s3c2410_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len) readsb(this->IO_ADDR_R, buf, len); } +static void s3c2440_nand_read_buf(struct mtd_info *mtd, u_char *buf, int len) +{ + struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); + readsl(info->regs + S3C2440_NFDATA, buf, len / 4); +} + static void s3c2410_nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len) { struct nand_chip *this = mtd->priv; writesb(this->IO_ADDR_W, buf, len); } +static void s3c2440_nand_write_buf(struct mtd_info *mtd, const u_char *buf, int len) +{ + struct s3c2410_nand_info *info = s3c2410_nand_mtd_toinfo(mtd); + writesl(info->regs + S3C2440_NFDATA, buf, len / 4); +} + /* device management functions */ static int s3c2410_nand_remove(struct platform_device *pdev) @@ -604,6 +616,8 @@ static void s3c2410_nand_init_chip(struct s3c2410_nand_info *info, info->sel_bit = S3C2440_NFCONT_nFCE; chip->cmd_ctrl = s3c2440_nand_hwcontrol; chip->dev_ready = s3c2440_nand_devready; + chip->read_buf = s3c2440_nand_read_buf; + chip->write_buf = s3c2440_nand_write_buf; break; case TYPE_S3C2412: diff --git a/drivers/mtd/onenand/onenand_sim.c b/drivers/mtd/onenand/onenand_sim.c index 0d89ad5..d64200b 100644 --- a/drivers/mtd/onenand/onenand_sim.c +++ b/drivers/mtd/onenand/onenand_sim.c @@ -88,11 +88,11 @@ do { \ /** * onenand_lock_handle - Handle Lock scheme - * @param this OneNAND device structure - * @param cmd The command to be sent + * @this: OneNAND device structure + * @cmd: The command to be sent * * Send lock command to OneNAND device. - * The lock scheme is depends on chip type. + * The lock scheme depends on chip type. */ static void onenand_lock_handle(struct onenand_chip *this, int cmd) { @@ -131,8 +131,8 @@ static void onenand_lock_handle(struct onenand_chip *this, int cmd) /** * onenand_bootram_handle - Handle BootRAM area - * @param this OneNAND device structure - * @param cmd The command to be sent + * @this: OneNAND device structure + * @cmd: The command to be sent * * Emulate BootRAM area. It is possible to do basic operation using BootRAM. */ @@ -153,10 +153,10 @@ static void onenand_bootram_handle(struct onenand_chip *this, int cmd) /** * onenand_update_interrupt - Set interrupt register - * @param this OneNAND device structure - * @param cmd The command to be sent + * @this: OneNAND device structure + * @cmd: The command to be sent * - * Update interrupt register. The status is depends on command. + * Update interrupt register. The status depends on command. */ static void onenand_update_interrupt(struct onenand_chip *this, int cmd) { @@ -189,11 +189,12 @@ static void onenand_update_interrupt(struct onenand_chip *this, int cmd) } /** - * onenand_check_overwrite - Check over-write if happend - * @param dest The destination pointer - * @param src The source pointer - * @param count The length to be check - * @return 0 on same, otherwise 1 + * onenand_check_overwrite - Check if over-write happened + * @dest: The destination pointer + * @src: The source pointer + * @count: The length to be check + * + * Returns: 0 on same, otherwise 1 * * Compare the source with destination */ @@ -213,10 +214,10 @@ static int onenand_check_overwrite(void *dest, void *src, size_t count) /** * onenand_data_handle - Handle OneNAND Core and DataRAM - * @param this OneNAND device structure - * @param cmd The command to be sent - * @param dataram Which dataram used - * @param offset The offset to OneNAND Core + * @this: OneNAND device structure + * @cmd: The command to be sent + * @dataram: Which dataram used + * @offset: The offset to OneNAND Core * * Copy data from OneNAND Core to DataRAM (read) * Copy data from DataRAM to OneNAND Core (write) @@ -295,8 +296,8 @@ static void onenand_data_handle(struct onenand_chip *this, int cmd, /** * onenand_command_handle - Handle command - * @param this OneNAND device structure - * @param cmd The command to be sent + * @this: OneNAND device structure + * @cmd: The command to be sent * * Emulate OneNAND command. */ @@ -350,8 +351,8 @@ static void onenand_command_handle(struct onenand_chip *this, int cmd) /** * onenand_writew - [OneNAND Interface] Emulate write operation - * @param value value to write - * @param addr address to write + * @value: value to write + * @addr: address to write * * Write OneNAND register with value */ @@ -373,7 +374,7 @@ static void onenand_writew(unsigned short value, void __iomem * addr) /** * flash_init - Initialize OneNAND simulator - * @param flash OneNAND simulaotr data strucutres + * @flash: OneNAND simulator data strucutres * * Initialize OneNAND simulator. */ @@ -416,7 +417,7 @@ static int __init flash_init(struct onenand_flash *flash) /** * flash_exit - Clean up OneNAND simulator - * @param flash OneNAND simulaotr data strucutres + * @flash: OneNAND simulator data structures * * Clean up OneNAND simulator. */ @@ -424,7 +425,6 @@ static void flash_exit(struct onenand_flash *flash) { vfree(ONENAND_CORE(flash)); kfree(flash->base); - kfree(flash); } static int __init onenand_sim_init(void) @@ -449,7 +449,7 @@ static int __init onenand_sim_init(void) info->onenand.write_word = onenand_writew; if (flash_init(&info->flash)) { - printk(KERN_ERR "Unable to allocat flash.\n"); + printk(KERN_ERR "Unable to allocate flash.\n"); kfree(ffchars); kfree(info); return -ENOMEM; diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index ce34b53..2538816 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -3100,4 +3100,10 @@ config NETPOLL_TRAP config NET_POLL_CONTROLLER def_bool NETPOLL +config VIRTIO_NET + tristate "Virtio network driver (EXPERIMENTAL)" + depends on EXPERIMENTAL && VIRTIO + ---help--- + This is the virtual network driver for lguest. Say Y or M. + endif # NETDEVICES diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 22f78cb..5932620 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -183,7 +183,6 @@ obj-$(CONFIG_ZORRO8390) += zorro8390.o obj-$(CONFIG_HPLANCE) += hplance.o 7990.o obj-$(CONFIG_MVME147_NET) += mvme147.o 7990.o obj-$(CONFIG_EQUALIZER) += eql.o -obj-$(CONFIG_LGUEST_NET) += lguest_net.o obj-$(CONFIG_MIPS_JAZZ_SONIC) += jazzsonic.o obj-$(CONFIG_MIPS_AU1X00_ENET) += au1000_eth.o obj-$(CONFIG_MIPS_SIM_NET) += mipsnet.o @@ -243,3 +242,4 @@ obj-$(CONFIG_FS_ENET) += fs_enet/ obj-$(CONFIG_NETXEN_NIC) += netxen/ obj-$(CONFIG_NIU) += niu.o +obj-$(CONFIG_VIRTIO_NET) += virtio_net.o diff --git a/drivers/net/fec.c b/drivers/net/fec.c index 2b57820..0fbf1bb 100644 --- a/drivers/net/fec.c +++ b/drivers/net/fec.c @@ -751,13 +751,11 @@ mii_queue(struct net_device *dev, int regval, void (*func)(uint, struct net_devi if (mii_head) { mii_tail->mii_next = mip; mii_tail = mip; - } - else { + } else { mii_head = mii_tail = mip; fep->hwp->fec_mii_data = regval; } - } - else { + } else { retval = 1; } @@ -768,14 +766,11 @@ mii_queue(struct net_device *dev, int regval, void (*func)(uint, struct net_devi static void mii_do_cmd(struct net_device *dev, const phy_cmd_t *c) { - int k; - if(!c) return; - for(k = 0; (c+k)->mii_data != mk_mii_end; k++) { - mii_queue(dev, (c+k)->mii_data, (c+k)->funct); - } + for (; c->mii_data != mk_mii_end; c++) + mii_queue(dev, c->mii_data, c->funct); } static void mii_parse_sr(uint mii_reg, struct net_device *dev) @@ -792,7 +787,6 @@ static void mii_parse_sr(uint mii_reg, struct net_device *dev) status |= PHY_STAT_FAULT; if (mii_reg & 0x0020) status |= PHY_STAT_ANC; - *s = status; } @@ -1239,7 +1233,6 @@ mii_link_interrupt(int irq, void * dev_id); #endif #if defined(CONFIG_M5272) - /* * Code specific to Coldfire 5272 setup. */ @@ -2020,8 +2013,7 @@ static void mii_relink(struct work_struct *work) & (PHY_STAT_100FDX | PHY_STAT_10FDX)) duplex = 1; fec_restart(dev, duplex); - } - else + } else fec_stop(dev); #if 0 @@ -2119,8 +2111,7 @@ mii_discover_phy(uint mii_reg, struct net_device *dev) fep->phy_id = phytype << 16; mii_queue(dev, mk_mii_read(MII_REG_PHYIR2), mii_discover_phy3); - } - else { + } else { fep->phy_addr++; mii_queue(dev, mk_mii_read(MII_REG_PHYIR1), mii_discover_phy); @@ -2574,8 +2565,7 @@ fec_restart(struct net_device *dev, int duplex) if (duplex) { fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x04;/* MII enable */ fecp->fec_x_cntrl = 0x04; /* FD enable */ - } - else { + } else { /* MII enable|No Rcv on Xmit */ fecp->fec_r_cntrl = OPT_FRAME_SIZE | 0x06; fecp->fec_x_cntrl = 0x00; diff --git a/drivers/net/lguest_net.c b/drivers/net/lguest_net.c deleted file mode 100644 index abce2ee..0000000 --- a/drivers/net/lguest_net.c +++ /dev/null @@ -1,555 +0,0 @@ -/*D:500 - * The Guest network driver. - * - * This is very simple a virtual network driver, and our last Guest driver. - * The only trick is that it can talk directly to multiple other recipients - * (ie. other Guests on the same network). It can also be used with only the - * Host on the network. - :*/ - -/* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -//#define DEBUG -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/module.h> -#include <linux/mm_types.h> -#include <linux/io.h> -#include <linux/lguest_bus.h> - -#define SHARED_SIZE PAGE_SIZE -#define MAX_LANS 4 -#define NUM_SKBS 8 - -/*M:011 Network code master Jeff Garzik points out numerous shortcomings in - * this driver if it aspires to greatness. - * - * Firstly, it doesn't use "NAPI": the networking's New API, and is poorer for - * it. As he says "NAPI means system-wide load leveling, across multiple - * network interfaces. Lack of NAPI can mean competition at higher loads." - * - * He also points out that we don't implement set_mac_address, so users cannot - * change the devices hardware address. When I asked why one would want to: - * "Bonding, and situations where you /do/ want the MAC address to "leak" out - * of the host onto the wider net." - * - * Finally, he would like module unloading: "It is not unrealistic to think of - * [un|re|]loading the net support module in an lguest guest. And, adding - * module support makes the programmer more responsible, because they now have - * to learn to clean up after themselves. Any driver that cannot clean up - * after itself is an incomplete driver in my book." - :*/ - -/*D:530 The "struct lguestnet_info" contains all the information we need to - * know about the network device. */ -struct lguestnet_info -{ - /* The mapped device page(s) (an array of "struct lguest_net"). */ - struct lguest_net *peer; - /* The physical address of the device page(s) */ - unsigned long peer_phys; - /* The size of the device page(s). */ - unsigned long mapsize; - - /* The lguest_device I come from */ - struct lguest_device *lgdev; - - /* My peerid (ie. my slot in the array). */ - unsigned int me; - - /* Receive queue: the network packets waiting to be filled. */ - struct sk_buff *skb[NUM_SKBS]; - struct lguest_dma dma[NUM_SKBS]; -}; -/*:*/ - -/* How many bytes left in this page. */ -static unsigned int rest_of_page(void *data) -{ - return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); -} - -/*D:570 Each peer (ie. Guest or Host) on the network binds their receive - * buffers to a different key: we simply use the physical address of the - * device's memory page plus the peer number. The Host insists that all keys - * be a multiple of 4, so we multiply the peer number by 4. */ -static unsigned long peer_key(struct lguestnet_info *info, unsigned peernum) -{ - return info->peer_phys + 4 * peernum; -} - -/* This is the routine which sets up a "struct lguest_dma" to point to a - * network packet, similar to req_to_dma() in lguest_blk.c. The structure of a - * "struct sk_buff" has grown complex over the years: it consists of a "head" - * linear section pointed to by "skb->data", and possibly an array of - * "fragments" in the case of a non-linear packet. - * - * Our receive buffers don't use fragments at all but outgoing skbs might, so - * we handle it. */ -static void skb_to_dma(const struct sk_buff *skb, unsigned int headlen, - struct lguest_dma *dma) -{ - unsigned int i, seg; - - /* First, we put the linear region into the "struct lguest_dma". Each - * entry can't go over a page boundary, so even though all our packets - * are 1514 bytes or less, we might need to use two entries here: */ - for (i = seg = 0; i < headlen; seg++, i += rest_of_page(skb->data+i)) { - dma->addr[seg] = virt_to_phys(skb->data + i); - dma->len[seg] = min((unsigned)(headlen - i), - rest_of_page(skb->data + i)); - } - - /* Now we handle the fragments: at least they're guaranteed not to go - * over a page. skb_shinfo(skb) returns a pointer to the structure - * which tells us about the number of fragments and the fragment - * array. */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, seg++) { - const skb_frag_t *f = &skb_shinfo(skb)->frags[i]; - /* Should not happen with MTU less than 64k - 2 * PAGE_SIZE. */ - if (seg == LGUEST_MAX_DMA_SECTIONS) { - /* We will end up sending a truncated packet should - * this ever happen. Plus, a cool log message! */ - printk("Woah dude! Megapacket!\n"); - break; - } - dma->addr[seg] = page_to_phys(f->page) + f->page_offset; - dma->len[seg] = f->size; - } - - /* If after all that we didn't use the entire "struct lguest_dma" - * array, we terminate it with a 0 length. */ - if (seg < LGUEST_MAX_DMA_SECTIONS) - dma->len[seg] = 0; -} - -/* - * Packet transmission. - * - * Our packet transmission is a little unusual. A real network card would just - * send out the packet and leave the receivers to decide if they're interested. - * Instead, we look through the network device memory page and see if any of - * the ethernet addresses match the packet destination, and if so we send it to - * that Guest. - * - * This is made a little more complicated in two cases. The first case is - * broadcast packets: for that we send the packet to all Guests on the network, - * one at a time. The second case is "promiscuous" mode, where a Guest wants - * to see all the packets on the network. We need a way for the Guest to tell - * us it wants to see all packets, so it sets the "multicast" bit on its - * published MAC address, which is never valid in a real ethernet address. - */ -#define PROMISC_BIT 0x01 - -/* This is the callback which is summoned whenever the network device's - * multicast or promiscuous state changes. If the card is in promiscuous mode, - * we advertise that in our ethernet address in the device's memory. We do the - * same if Linux wants any or all multicast traffic. */ -static void lguestnet_set_multicast(struct net_device *dev) -{ - struct lguestnet_info *info = netdev_priv(dev); - - if ((dev->flags & (IFF_PROMISC|IFF_ALLMULTI)) || dev->mc_count) - info->peer[info->me].mac[0] |= PROMISC_BIT; - else - info->peer[info->me].mac[0] &= ~PROMISC_BIT; -} - -/* A simple test function to see if a peer wants to see all packets.*/ -static int promisc(struct lguestnet_info *info, unsigned int peer) -{ - return info->peer[peer].mac[0] & PROMISC_BIT; -} - -/* Another simple function to see if a peer's advertised ethernet address - * matches a packet's destination ethernet address. */ -static int mac_eq(const unsigned char mac[ETH_ALEN], - struct lguestnet_info *info, unsigned int peer) -{ - /* Ignore multicast bit, which peer turns on to mean promisc. */ - if ((info->peer[peer].mac[0] & (~PROMISC_BIT)) != mac[0]) - return 0; - return memcmp(mac+1, info->peer[peer].mac+1, ETH_ALEN-1) == 0; -} - -/* This is the function which actually sends a packet once we've decided a - * peer wants it: */ -static void transfer_packet(struct net_device *dev, - struct sk_buff *skb, - unsigned int peernum) -{ - struct lguestnet_info *info = netdev_priv(dev); - struct lguest_dma dma; - - /* We use our handy "struct lguest_dma" packing function to prepare - * the skb for sending. */ - skb_to_dma(skb, skb_headlen(skb), &dma); - pr_debug("xfer length %04x (%u)\n", htons(skb->len), skb->len); - - /* This is the actual send call which copies the packet. */ - lguest_send_dma(peer_key(info, peernum), &dma); - - /* Check that the entire packet was transmitted. If not, it could mean - * that the other Guest registered a short receive buffer, but this - * driver should never do that. More likely, the peer is dead. */ - if (dma.used_len != skb->len) { - dev->stats.tx_carrier_errors++; - pr_debug("Bad xfer to peer %i: %i of %i (dma %p/%i)\n", - peernum, dma.used_len, skb->len, - (void *)dma.addr[0], dma.len[0]); - } else { - /* On success we update the stats. */ - dev->stats.tx_bytes += skb->len; - dev->stats.tx_packets++; - } -} - -/* Another helper function to tell is if a slot in the device memory is unused. - * Since we always set the Local Assignment bit in the ethernet address, the - * first byte can never be 0. */ -static int unused_peer(const struct lguest_net peer[], unsigned int num) -{ - return peer[num].mac[0] == 0; -} - -/* Finally, here is the routine which handles an outgoing packet. It's called - * "start_xmit" for traditional reasons. */ -static int lguestnet_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - unsigned int i; - int broadcast; - struct lguestnet_info *info = netdev_priv(dev); - /* Extract the destination ethernet address from the packet. */ - const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; - DECLARE_MAC_BUF(mac); - - pr_debug("%s: xmit %s\n", dev->name, print_mac(mac, dest)); - - /* If it's a multicast packet, we broadcast to everyone. That's not - * very efficient, but there are very few applications which actually - * use multicast, which is a shame really. - * - * As etherdevice.h points out: "By definition the broadcast address is - * also a multicast address." So we don't have to test for broadcast - * packets separately. */ - broadcast = is_multicast_ether_addr(dest); - - /* Look through all the published ethernet addresses to see if we - * should send this packet. */ - for (i = 0; i < info->mapsize/sizeof(struct lguest_net); i++) { - /* We don't send to ourselves (we actually can't SEND_DMA to - * ourselves anyway), and don't send to unused slots.*/ - if (i == info->me || unused_peer(info->peer, i)) - continue; - - /* If it's broadcast we send it. If they want every packet we - * send it. If the destination matches their address we send - * it. Otherwise we go to the next peer. */ - if (!broadcast && !promisc(info, i) && !mac_eq(dest, info, i)) - continue; - - pr_debug("lguestnet %s: sending from %i to %i\n", - dev->name, info->me, i); - /* Our routine which actually does the transfer. */ - transfer_packet(dev, skb, i); - } - - /* An xmit routine is expected to dispose of the packet, so we do. */ - dev_kfree_skb(skb); - - /* As per kernel convention, 0 means success. This is why I love - * networking: even if we never sent to anyone, that's still - * success! */ - return 0; -} - -/*D:560 - * Packet receiving. - * - * First, here's a helper routine which fills one of our array of receive - * buffers: */ -static int fill_slot(struct net_device *dev, unsigned int slot) -{ - struct lguestnet_info *info = netdev_priv(dev); - - /* We can receive ETH_DATA_LEN (1500) byte packets, plus a standard - * ethernet header of ETH_HLEN (14) bytes. */ - info->skb[slot] = netdev_alloc_skb(dev, ETH_HLEN + ETH_DATA_LEN); - if (!info->skb[slot]) { - printk("%s: could not fill slot %i\n", dev->name, slot); - return -ENOMEM; - } - - /* skb_to_dma() is a helper which sets up the "struct lguest_dma" to - * point to the data in the skb: we also use it for sending out a - * packet. */ - skb_to_dma(info->skb[slot], ETH_HLEN + ETH_DATA_LEN, &info->dma[slot]); - - /* This is a Write Memory Barrier: it ensures that the entry in the - * receive buffer array is written *before* we set the "used_len" entry - * to 0. If the Host were looking at the receive buffer array from a - * different CPU, it could potentially see "used_len = 0" and not see - * the updated receive buffer information. This would be a horribly - * nasty bug, so make sure the compiler and CPU know this has to happen - * first. */ - wmb(); - /* Writing 0 to "used_len" tells the Host it can use this receive - * buffer now. */ - info->dma[slot].used_len = 0; - return 0; -} - -/* This is the actual receive routine. When we receive an interrupt from the - * Host to tell us a packet has been delivered, we arrive here: */ -static irqreturn_t lguestnet_rcv(int irq, void *dev_id) -{ - struct net_device *dev = dev_id; - struct lguestnet_info *info = netdev_priv(dev); - unsigned int i, done = 0; - - /* Look through our entire receive array for an entry which has data - * in it. */ - for (i = 0; i < ARRAY_SIZE(info->dma); i++) { - unsigned int length; - struct sk_buff *skb; - - length = info->dma[i].used_len; - if (length == 0) - continue; - - /* We've found one! Remember the skb (we grabbed the length - * above), and immediately refill the slot we've taken it - * from. */ - done++; - skb = info->skb[i]; - fill_slot(dev, i); - - /* This shouldn't happen: micropackets could be sent by a - * badly-behaved Guest on the network, but the Host will never - * stuff more data in the buffer than the buffer length. */ - if (length < ETH_HLEN || length > ETH_HLEN + ETH_DATA_LEN) { - pr_debug(KERN_WARNING "%s: unbelievable skb len: %i\n", - dev->name, length); - dev_kfree_skb(skb); - continue; - } - - /* skb_put(), what a great function! I've ranted about this - * function before (http://lkml.org/lkml/1999/9/26/24). You - * call it after you've added data to the end of an skb (in - * this case, it was the Host which wrote the data). */ - skb_put(skb, length); - - /* The ethernet header contains a protocol field: we use the - * standard helper to extract it, and place the result in - * skb->protocol. The helper also sets up skb->pkt_type and - * eats up the ethernet header from the front of the packet. */ - skb->protocol = eth_type_trans(skb, dev); - - /* If this device doesn't need checksums for sending, we also - * don't need to check the packets when they come in. */ - if (dev->features & NETIF_F_NO_CSUM) - skb->ip_summed = CHECKSUM_UNNECESSARY; - - /* As a last resort for debugging the driver or the lguest I/O - * subsystem, you can uncomment the "#define DEBUG" at the top - * of this file, which turns all the pr_debug() into printk() - * and floods the logs. */ - pr_debug("Receiving skb proto 0x%04x len %i type %i\n", - ntohs(skb->protocol), skb->len, skb->pkt_type); - - /* Update the packet and byte counts (visible from ifconfig, - * and good for debugging). */ - dev->stats.rx_bytes += skb->len; - dev->stats.rx_packets++; - - /* Hand our fresh network packet into the stack's "network - * interface receive" routine. That will free the packet - * itself when it's finished. */ - netif_rx(skb); - } - - /* If we found any packets, we assume the interrupt was for us. */ - return done ? IRQ_HANDLED : IRQ_NONE; -} - -/*D:550 This is where we start: when the device is brought up by dhcpd or - * ifconfig. At this point we advertise our MAC address to the rest of the - * network, and register receive buffers ready for incoming packets. */ -static int lguestnet_open(struct net_device *dev) -{ - int i; - struct lguestnet_info *info = netdev_priv(dev); - - /* Copy our MAC address into the device page, so others on the network - * can find us. */ - memcpy(info->peer[info->me].mac, dev->dev_addr, ETH_ALEN); - - /* We might already be in promisc mode (dev->flags & IFF_PROMISC). Our - * set_multicast callback handles this already, so we call it now. */ - lguestnet_set_multicast(dev); - - /* Allocate packets and put them into our "struct lguest_dma" array. - * If we fail to allocate all the packets we could still limp along, - * but it's a sign of real stress so we should probably give up now. */ - for (i = 0; i < ARRAY_SIZE(info->dma); i++) { - if (fill_slot(dev, i) != 0) - goto cleanup; - } - - /* Finally we tell the Host where our array of "struct lguest_dma" - * receive buffers is, binding it to the key corresponding to the - * device's physical memory plus our peerid. */ - if (lguest_bind_dma(peer_key(info,info->me), info->dma, - NUM_SKBS, lgdev_irq(info->lgdev)) != 0) - goto cleanup; - return 0; - -cleanup: - while (--i >= 0) - dev_kfree_skb(info->skb[i]); - return -ENOMEM; -} -/*:*/ - -/* The close routine is called when the device is no longer in use: we clean up - * elegantly. */ -static int lguestnet_close(struct net_device *dev) -{ - unsigned int i; - struct lguestnet_info *info = netdev_priv(dev); - - /* Clear all trace of our existence out of the device memory by setting - * the slot which held our MAC address to 0 (unused). */ - memset(&info->peer[info->me], 0, sizeof(info->peer[info->me])); - - /* Unregister our array of receive buffers */ - lguest_unbind_dma(peer_key(info, info->me), info->dma); - for (i = 0; i < ARRAY_SIZE(info->dma); i++) - dev_kfree_skb(info->skb[i]); - return 0; -} - -/*D:510 The network device probe function is basically a standard ethernet - * device setup. It reads the "struct lguest_device_desc" and sets the "struct - * net_device". Oh, the line-by-line excitement! Let's skip over it. :*/ -static int lguestnet_probe(struct lguest_device *lgdev) -{ - int err, irqf = IRQF_SHARED; - struct net_device *dev; - struct lguestnet_info *info; - struct lguest_device_desc *desc = &lguest_devices[lgdev->index]; - - pr_debug("lguest_net: probing for device %i\n", lgdev->index); - - dev = alloc_etherdev(sizeof(struct lguestnet_info)); - if (!dev) - return -ENOMEM; - - /* Ethernet defaults with some changes */ - ether_setup(dev); - dev->set_mac_address = NULL; - - dev->dev_addr[0] = 0x02; /* set local assignment bit (IEEE802) */ - dev->dev_addr[1] = 0x00; - memcpy(&dev->dev_addr[2], &lguest_data.guestid, 2); - dev->dev_addr[4] = 0x00; - dev->dev_addr[5] = 0x00; - - dev->open = lguestnet_open; - dev->stop = lguestnet_close; - dev->hard_start_xmit = lguestnet_start_xmit; - - /* We don't actually support multicast yet, but turning on/off - * promisc also calls dev->set_multicast_list. */ - dev->set_multicast_list = lguestnet_set_multicast; - SET_NETDEV_DEV(dev, &lgdev->dev); - - /* The network code complains if you have "scatter-gather" capability - * if you don't also handle checksums (it seem that would be - * "illogical"). So we use a lie of omission and don't tell it that we - * can handle scattered packets unless we also don't want checksums, - * even though to us they're completely independent. */ - if (desc->features & LGUEST_NET_F_NOCSUM) - dev->features = NETIF_F_SG|NETIF_F_NO_CSUM; - - info = netdev_priv(dev); - info->mapsize = PAGE_SIZE * desc->num_pages; - info->peer_phys = ((unsigned long)desc->pfn << PAGE_SHIFT); - info->lgdev = lgdev; - info->peer = lguest_map(info->peer_phys, desc->num_pages); - if (!info->peer) { - err = -ENOMEM; - goto free; - } - - /* This stores our peerid (upper bits reserved for future). */ - info->me = (desc->features & (info->mapsize-1)); - - err = register_netdev(dev); - if (err) { - pr_debug("lguestnet: registering device failed\n"); - goto unmap; - } - - if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS) - irqf |= IRQF_SAMPLE_RANDOM; - if (request_irq(lgdev_irq(lgdev), lguestnet_rcv, irqf, "lguestnet", - dev) != 0) { - pr_debug("lguestnet: cannot get irq %i\n", lgdev_irq(lgdev)); - goto unregister; - } - - pr_debug("lguestnet: registered device %s\n", dev->name); - /* Finally, we put the "struct net_device" in the generic "struct - * lguest_device"s private pointer. Again, it's not necessary, but - * makes sure the cool kernel kids don't tease us. */ - lgdev->private = dev; - return 0; - -unregister: - unregister_netdev(dev); -unmap: - lguest_unmap(info->peer); -free: - free_netdev(dev); - return err; -} - -static struct lguest_driver lguestnet_drv = { - .name = "lguestnet", - .owner = THIS_MODULE, - .device_type = LGUEST_DEVICE_T_NET, - .probe = lguestnet_probe, -}; - -static __init int lguestnet_init(void) -{ - return register_lguest_driver(&lguestnet_drv); -} -module_init(lguestnet_init); - -MODULE_DESCRIPTION("Lguest network driver"); -MODULE_LICENSE("GPL"); - -/*D:580 - * This is the last of the Drivers, and with this we have covered the many and - * wonderous and fine (and boring) details of the Guest. - * - * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?" - */ diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c index 6471d33..5064873 100644 --- a/drivers/net/mlx4/fw.c +++ b/drivers/net/mlx4/fw.c @@ -736,7 +736,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, (u8) (PAGE_SHIFT - 12), INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET); - err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 1000); + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c new file mode 100644 index 0000000..e396c9d --- /dev/null +++ b/drivers/net/virtio_net.c @@ -0,0 +1,435 @@ +/* A simple network driver using virtio. + * + * Copyright 2007 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +//#define DEBUG +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/virtio_net.h> +#include <linux/scatterlist.h> + +/* FIXME: MTU in config. */ +#define MAX_PACKET_LEN (ETH_HLEN+ETH_DATA_LEN) + +struct virtnet_info +{ + struct virtio_device *vdev; + struct virtqueue *rvq, *svq; + struct net_device *dev; + struct napi_struct napi; + + /* Number of input buffers, and max we've ever had. */ + unsigned int num, max; + + /* Receive & send queues. */ + struct sk_buff_head recv; + struct sk_buff_head send; +}; + +static inline struct virtio_net_hdr *skb_vnet_hdr(struct sk_buff *skb) +{ + return (struct virtio_net_hdr *)skb->cb; +} + +static inline void vnet_hdr_to_sg(struct scatterlist *sg, struct sk_buff *skb) +{ + sg_init_one(sg, skb_vnet_hdr(skb), sizeof(struct virtio_net_hdr)); +} + +static bool skb_xmit_done(struct virtqueue *rvq) +{ + struct virtnet_info *vi = rvq->vdev->priv; + + /* In case we were waiting for output buffers. */ + netif_wake_queue(vi->dev); + return true; +} + +static void receive_skb(struct net_device *dev, struct sk_buff *skb, + unsigned len) +{ + struct virtio_net_hdr *hdr = skb_vnet_hdr(skb); + + if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { + pr_debug("%s: short packet %i\n", dev->name, len); + dev->stats.rx_length_errors++; + goto drop; + } + len -= sizeof(struct virtio_net_hdr); + BUG_ON(len > MAX_PACKET_LEN); + + skb_trim(skb, len); + skb->protocol = eth_type_trans(skb, dev); + pr_debug("Receiving skb proto 0x%04x len %i type %i\n", + ntohs(skb->protocol), skb->len, skb->pkt_type); + dev->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + pr_debug("Needs csum!\n"); + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = hdr->csum_start; + skb->csum_offset = hdr->csum_offset; + if (skb->csum_start > skb->len - 2 + || skb->csum_offset > skb->len - 2) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: csum=%u/%u len=%u\n", + dev->name, skb->csum_start, + skb->csum_offset, skb->len); + goto frame_err; + } + } + + if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { + pr_debug("GSO!\n"); + switch (hdr->gso_type) { + case VIRTIO_NET_HDR_GSO_TCPV4: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + break; + case VIRTIO_NET_HDR_GSO_TCPV4_ECN: + skb_shinfo(skb)->gso_type = SKB_GSO_TCP_ECN; + break; + case VIRTIO_NET_HDR_GSO_UDP: + skb_shinfo(skb)->gso_type = SKB_GSO_UDP; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + break; + default: + if (net_ratelimit()) + printk(KERN_WARNING "%s: bad gso type %u.\n", + dev->name, hdr->gso_type); + goto frame_err; + } + + skb_shinfo(skb)->gso_size = hdr->gso_size; + if (skb_shinfo(skb)->gso_size == 0) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: zero gso size.\n", + dev->name); + goto frame_err; + } + + /* Header must be checked, and gso_segs computed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + netif_receive_skb(skb); + return; + +frame_err: + dev->stats.rx_frame_errors++; +drop: + dev_kfree_skb(skb); +} + +static void try_fill_recv(struct virtnet_info *vi) +{ + struct sk_buff *skb; + struct scatterlist sg[1+MAX_SKB_FRAGS]; + int num, err; + + for (;;) { + skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN); + if (unlikely(!skb)) + break; + + skb_put(skb, MAX_PACKET_LEN); + vnet_hdr_to_sg(sg, skb); + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; + skb_queue_head(&vi->recv, skb); + + err = vi->rvq->vq_ops->add_buf(vi->rvq, sg, 0, num, skb); + if (err) { + skb_unlink(skb, &vi->recv); + kfree_skb(skb); + break; + } + vi->num++; + } + if (unlikely(vi->num > vi->max)) + vi->max = vi->num; + vi->rvq->vq_ops->kick(vi->rvq); +} + +static bool skb_recv_done(struct virtqueue *rvq) +{ + struct virtnet_info *vi = rvq->vdev->priv; + netif_rx_schedule(vi->dev, &vi->napi); + /* Suppress further interrupts. */ + return false; +} + +static int virtnet_poll(struct napi_struct *napi, int budget) +{ + struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi); + struct sk_buff *skb = NULL; + unsigned int len, received = 0; + +again: + while (received < budget && + (skb = vi->rvq->vq_ops->get_buf(vi->rvq, &len)) != NULL) { + __skb_unlink(skb, &vi->recv); + receive_skb(vi->dev, skb, len); + vi->num--; + received++; + } + + /* FIXME: If we oom and completely run out of inbufs, we need + * to start a timer trying to fill more. */ + if (vi->num < vi->max / 2) + try_fill_recv(vi); + + /* All done? */ + if (!skb) { + netif_rx_complete(vi->dev, napi); + if (unlikely(!vi->rvq->vq_ops->restart(vi->rvq)) + && netif_rx_reschedule(vi->dev, napi)) + goto again; + } + + return received; +} + +static void free_old_xmit_skbs(struct virtnet_info *vi) +{ + struct sk_buff *skb; + unsigned int len; + + while ((skb = vi->svq->vq_ops->get_buf(vi->svq, &len)) != NULL) { + pr_debug("Sent skb %p\n", skb); + __skb_unlink(skb, &vi->send); + vi->dev->stats.tx_bytes += len; + vi->dev->stats.tx_packets++; + kfree_skb(skb); + } +} + +static int start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + int num, err; + struct scatterlist sg[1+MAX_SKB_FRAGS]; + struct virtio_net_hdr *hdr; + const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; + DECLARE_MAC_BUF(mac); + + pr_debug("%s: xmit %p %s\n", dev->name, skb, print_mac(mac, dest)); + + free_old_xmit_skbs(vi); + + /* Encode metadata header at front. */ + hdr = skb_vnet_hdr(skb); + if (skb->ip_summed == CHECKSUM_PARTIAL) { + hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + hdr->csum_start = skb->csum_start - skb_headroom(skb); + hdr->csum_offset = skb->csum_offset; + } else { + hdr->flags = 0; + hdr->csum_offset = hdr->csum_start = 0; + } + + if (skb_is_gso(skb)) { + hdr->gso_size = skb_shinfo(skb)->gso_size; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4_ECN; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) + hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; + else + BUG(); + } else { + hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; + hdr->gso_size = 0; + } + + vnet_hdr_to_sg(sg, skb); + num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; + __skb_queue_head(&vi->send, skb); + err = vi->svq->vq_ops->add_buf(vi->svq, sg, num, 0, skb); + if (err) { + pr_debug("%s: virtio not prepared to send\n", dev->name); + skb_unlink(skb, &vi->send); + netif_stop_queue(dev); + return NETDEV_TX_BUSY; + } + vi->svq->vq_ops->kick(vi->svq); + + return 0; +} + +static int virtnet_open(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + + try_fill_recv(vi); + + /* If we didn't even get one input buffer, we're useless. */ + if (vi->num == 0) + return -ENOMEM; + + napi_enable(&vi->napi); + return 0; +} + +static int virtnet_close(struct net_device *dev) +{ + struct virtnet_info *vi = netdev_priv(dev); + struct sk_buff *skb; + + napi_disable(&vi->napi); + + /* networking core has neutered skb_xmit_done/skb_recv_done, so don't + * worry about races vs. get(). */ + vi->rvq->vq_ops->shutdown(vi->rvq); + while ((skb = __skb_dequeue(&vi->recv)) != NULL) { + kfree_skb(skb); + vi->num--; + } + vi->svq->vq_ops->shutdown(vi->svq); + while ((skb = __skb_dequeue(&vi->send)) != NULL) + kfree_skb(skb); + + BUG_ON(vi->num != 0); + return 0; +} + +static int virtnet_probe(struct virtio_device *vdev) +{ + int err; + unsigned int len; + struct net_device *dev; + struct virtnet_info *vi; + void *token; + + /* Allocate ourselves a network device with room for our info */ + dev = alloc_etherdev(sizeof(struct virtnet_info)); + if (!dev) + return -ENOMEM; + + /* Set up network device as normal. */ + ether_setup(dev); + dev->open = virtnet_open; + dev->stop = virtnet_close; + dev->hard_start_xmit = start_xmit; + dev->features = NETIF_F_HIGHDMA; + SET_NETDEV_DEV(dev, &vdev->dev); + + /* Do we support "hardware" checksums? */ + token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_F, &len); + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_NO_CSUM)) { + /* This opens up the world of extra features. */ + dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4)) + dev->features |= NETIF_F_TSO; + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_UFO)) + dev->features |= NETIF_F_UFO; + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO4_ECN)) + dev->features |= NETIF_F_TSO_ECN; + if (virtio_use_bit(vdev, token, len, VIRTIO_NET_F_TSO6)) + dev->features |= NETIF_F_TSO6; + } + + /* Configuration may specify what MAC to use. Otherwise random. */ + token = vdev->config->find(vdev, VIRTIO_CONFIG_NET_MAC_F, &len); + if (token) { + dev->addr_len = len; + vdev->config->get(vdev, token, dev->dev_addr, len); + } else + random_ether_addr(dev->dev_addr); + + /* Set up our device-specific information */ + vi = netdev_priv(dev); + netif_napi_add(dev, &vi->napi, virtnet_poll, 16); + vi->dev = dev; + vi->vdev = vdev; + + /* We expect two virtqueues, receive then send. */ + vi->rvq = vdev->config->find_vq(vdev, skb_recv_done); + if (IS_ERR(vi->rvq)) { + err = PTR_ERR(vi->rvq); + goto free; + } + + vi->svq = vdev->config->find_vq(vdev, skb_xmit_done); + if (IS_ERR(vi->svq)) { + err = PTR_ERR(vi->svq); + goto free_recv; + } + + /* Initialize our empty receive and send queues. */ + skb_queue_head_init(&vi->recv); + skb_queue_head_init(&vi->send); + + err = register_netdev(dev); + if (err) { + pr_debug("virtio_net: registering device failed\n"); + goto free_send; + } + pr_debug("virtnet: registered device %s\n", dev->name); + vdev->priv = vi; + return 0; + +free_send: + vdev->config->del_vq(vi->svq); +free_recv: + vdev->config->del_vq(vi->rvq); +free: + free_netdev(dev); + return err; +} + +static void virtnet_remove(struct virtio_device *vdev) +{ + unregister_netdev(vdev->priv); + free_netdev(vdev->priv); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct virtio_driver virtio_net = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtnet_probe, + .remove = __devexit_p(virtnet_remove), +}; + +static int __init init(void) +{ + return register_virtio_driver(&virtio_net); +} + +static void __exit fini(void) +{ + unregister_virtio_driver(&virtio_net); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio network driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index b3c4dbf..7c60cbd 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -42,6 +42,7 @@ #include <linux/reboot.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/scatterlist.h> #include <asm/byteorder.h> #include <asm/cache.h> /* for L1_CACHE_BYTES */ diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index e5c3239..e527a0e 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -28,6 +28,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/pci.h> +#include <linux/scatterlist.h> #include <asm/byteorder.h> #include <asm/io.h> diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index b3d7031..0c4ab3b 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1962,7 +1962,7 @@ static void intel_free_coherent(struct device *hwdev, size_t size, free_pages((unsigned long)vaddr, order); } -#define SG_ENT_VIRT_ADDRESS(sg) (page_address((sg)->page) + (sg)->offset) +#define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg))) static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, int nelems, int dir) { @@ -2010,7 +2010,7 @@ static int intel_nontranslate_map_sg(struct device *hddev, struct scatterlist *sg; for_each_sg(sglist, sg, nelems, i) { - BUG_ON(!sg->page); + BUG_ON(!sg_page(sg)); sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg)); sg->dma_length = sg->length; } diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c index 7507067..fd5d0c1 100644 --- a/drivers/s390/scsi/zfcp_aux.c +++ b/drivers/s390/scsi/zfcp_aux.c @@ -559,6 +559,7 @@ zfcp_sg_list_alloc(struct zfcp_sg_list *sg_list, size_t size) retval = -ENOMEM; goto out; } + sg_init_table(sg_list->sg, sg_list->count); for (i = 0, sg = sg_list->sg; i < sg_list->count; i++, sg++) { sg->length = min(size, PAGE_SIZE); diff --git a/drivers/s390/scsi/zfcp_def.h b/drivers/s390/scsi/zfcp_def.h index 57cac70..326e7ee 100644 --- a/drivers/s390/scsi/zfcp_def.h +++ b/drivers/s390/scsi/zfcp_def.h @@ -63,7 +63,7 @@ static inline void * zfcp_sg_to_address(struct scatterlist *list) { - return (void *) (page_address(list->page) + list->offset); + return sg_virt(list); } /** @@ -74,7 +74,7 @@ zfcp_sg_to_address(struct scatterlist *list) static inline void zfcp_address_to_sg(void *address, struct scatterlist *list) { - list->page = virt_to_page(address); + sg_set_page(list, virt_to_page(address)); list->offset = ((unsigned long) address) & (PAGE_SIZE - 1); } diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index a6475a2..9438d0b 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -308,13 +308,15 @@ zfcp_erp_adisc(struct zfcp_port *port) if (send_els == NULL) goto nomem; - send_els->req = kzalloc(sizeof(struct scatterlist), GFP_ATOMIC); + send_els->req = kmalloc(sizeof(struct scatterlist), GFP_ATOMIC); if (send_els->req == NULL) goto nomem; + sg_init_table(send_els->req, 1); - send_els->resp = kzalloc(sizeof(struct scatterlist), GFP_ATOMIC); + send_els->resp = kmalloc(sizeof(struct scatterlist), GFP_ATOMIC); if (send_els->resp == NULL) goto nomem; + sg_init_table(send_els->resp, 1); address = (void *) get_zeroed_page(GFP_ATOMIC); if (address == NULL) @@ -363,7 +365,7 @@ zfcp_erp_adisc(struct zfcp_port *port) retval = -ENOMEM; freemem: if (address != NULL) - __free_pages(send_els->req->page, 0); + __free_pages(sg_page(send_els->req), 0); if (send_els != NULL) { kfree(send_els->req); kfree(send_els->resp); @@ -437,7 +439,7 @@ zfcp_erp_adisc_handler(unsigned long data) out: zfcp_port_put(port); - __free_pages(send_els->req->page, 0); + __free_pages(sg_page(send_els->req), 0); kfree(send_els->req); kfree(send_els->resp); kfree(send_els); diff --git a/drivers/scsi/ps3rom.c b/drivers/scsi/ps3rom.c index 03f19b8..17b4a7c 100644 --- a/drivers/scsi/ps3rom.c +++ b/drivers/scsi/ps3rom.c @@ -147,7 +147,7 @@ static int fetch_to_dev_buffer(struct scsi_cmnd *cmd, void *buf) req_len = fin = 0; scsi_for_each_sg(cmd, sgpnt, scsi_sg_count(cmd), k) { - kaddr = kmap_atomic(sg_page(sgpnt->page), KM_IRQ0); + kaddr = kmap_atomic(sg_page(sgpnt), KM_IRQ0); len = sgpnt->length; if ((req_len + len) > buflen) { len = buflen - req_len; diff --git a/drivers/serial/mcf.c b/drivers/serial/mcf.c new file mode 100644 index 0000000..a7d4360 --- /dev/null +++ b/drivers/serial/mcf.c @@ -0,0 +1,653 @@ +/****************************************************************************/ + +/* + * mcf.c -- Freescale ColdFire UART driver + * + * (C) Copyright 2003-2007, Greg Ungerer <gerg@snapgear.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +/****************************************************************************/ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/console.h> +#include <linux/tty.h> +#include <linux/tty_flip.h> +#include <linux/serial.h> +#include <linux/serial_core.h> +#include <linux/io.h> +#include <asm/coldfire.h> +#include <asm/mcfsim.h> +#include <asm/mcfuart.h> +#include <asm/nettel.h> + +/****************************************************************************/ + +/* + * Some boards implement the DTR/DCD lines using GPIO lines, most + * don't. Dummy out the access macros for those that don't. Those + * that do should define these macros somewhere in there board + * specific inlude files. + */ +#if !defined(mcf_getppdcd) +#define mcf_getppdcd(p) (1) +#endif +#if !defined(mcf_getppdtr) +#define mcf_getppdtr(p) (1) +#endif +#if !defined(mcf_setppdtr) +#define mcf_setppdtr(p, v) do { } while (0) +#endif + +/****************************************************************************/ + +/* + * Local per-uart structure. + */ +struct mcf_uart { + struct uart_port port; + unsigned int sigs; /* Local copy of line sigs */ + unsigned char imr; /* Local IMR mirror */ +}; + +/****************************************************************************/ + +static unsigned int mcf_tx_empty(struct uart_port *port) +{ + return (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXEMPTY) ? + TIOCSER_TEMT : 0; +} + +/****************************************************************************/ + +static unsigned int mcf_get_mctrl(struct uart_port *port) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + unsigned int sigs; + + spin_lock_irqsave(&port->lock, flags); + sigs = (readb(port->membase + MCFUART_UIPR) & MCFUART_UIPR_CTS) ? + 0 : TIOCM_CTS; + sigs |= (pp->sigs & TIOCM_RTS); + sigs |= (mcf_getppdcd(port->line) ? TIOCM_CD : 0); + sigs |= (mcf_getppdtr(port->line) ? TIOCM_DTR : 0); + spin_unlock_irqrestore(&port->lock, flags); + return sigs; +} + +/****************************************************************************/ + +static void mcf_set_mctrl(struct uart_port *port, unsigned int sigs) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + pp->sigs = sigs; + mcf_setppdtr(port->line, (sigs & TIOCM_DTR)); + if (sigs & TIOCM_RTS) + writeb(MCFUART_UOP_RTS, port->membase + MCFUART_UOP1); + else + writeb(MCFUART_UOP_RTS, port->membase + MCFUART_UOP0); + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_start_tx(struct uart_port *port) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + pp->imr |= MCFUART_UIR_TXREADY; + writeb(pp->imr, port->membase + MCFUART_UIMR); + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_stop_tx(struct uart_port *port) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + pp->imr &= ~MCFUART_UIR_TXREADY; + writeb(pp->imr, port->membase + MCFUART_UIMR); + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_stop_rx(struct uart_port *port) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + pp->imr &= ~MCFUART_UIR_RXREADY; + writeb(pp->imr, port->membase + MCFUART_UIMR); + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_break_ctl(struct uart_port *port, int break_state) +{ + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + if (break_state == -1) + writeb(MCFUART_UCR_CMDBREAKSTART, port->membase + MCFUART_UCR); + else + writeb(MCFUART_UCR_CMDBREAKSTOP, port->membase + MCFUART_UCR); + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_enable_ms(struct uart_port *port) +{ +} + +/****************************************************************************/ + +static int mcf_startup(struct uart_port *port) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + + /* Reset UART, get it into known state... */ + writeb(MCFUART_UCR_CMDRESETRX, port->membase + MCFUART_UCR); + writeb(MCFUART_UCR_CMDRESETTX, port->membase + MCFUART_UCR); + + /* Enable the UART transmitter and receiver */ + writeb(MCFUART_UCR_RXENABLE | MCFUART_UCR_TXENABLE, + port->membase + MCFUART_UCR); + + /* Enable RX interrupts now */ + pp->imr = MCFUART_UIR_RXREADY; + writeb(pp->imr, port->membase + MCFUART_UIMR); + + spin_unlock_irqrestore(&port->lock, flags); + + return 0; +} + +/****************************************************************************/ + +static void mcf_shutdown(struct uart_port *port) +{ + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + + /* Disable all interrupts now */ + pp->imr = 0; + writeb(pp->imr, port->membase + MCFUART_UIMR); + + /* Disable UART transmitter and receiver */ + writeb(MCFUART_UCR_CMDRESETRX, port->membase + MCFUART_UCR); + writeb(MCFUART_UCR_CMDRESETTX, port->membase + MCFUART_UCR); + + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_set_termios(struct uart_port *port, struct ktermios *termios, + struct ktermios *old) +{ + unsigned long flags; + unsigned int baud, baudclk; + unsigned char mr1, mr2; + + baud = uart_get_baud_rate(port, termios, old, 0, 230400); + baudclk = ((MCF_BUSCLK / baud) + 16) / 32; + + mr1 = MCFUART_MR1_RXIRQRDY | MCFUART_MR1_RXERRCHAR; + mr2 = 0; + + switch (termios->c_cflag & CSIZE) { + case CS5: mr1 |= MCFUART_MR1_CS5; break; + case CS6: mr1 |= MCFUART_MR1_CS6; break; + case CS7: mr1 |= MCFUART_MR1_CS7; break; + case CS8: + default: mr1 |= MCFUART_MR1_CS8; break; + } + + if (termios->c_cflag & PARENB) { + if (termios->c_cflag & CMSPAR) { + if (termios->c_cflag & PARODD) + mr1 |= MCFUART_MR1_PARITYMARK; + else + mr1 |= MCFUART_MR1_PARITYSPACE; + } else { + if (termios->c_cflag & PARODD) + mr1 |= MCFUART_MR1_PARITYODD; + else + mr1 |= MCFUART_MR1_PARITYEVEN; + } + } else { + mr1 |= MCFUART_MR1_PARITYNONE; + } + + if (termios->c_cflag & CSTOPB) + mr2 |= MCFUART_MR2_STOP2; + else + mr2 |= MCFUART_MR2_STOP1; + + if (termios->c_cflag & CRTSCTS) { + mr1 |= MCFUART_MR1_RXRTS; + mr2 |= MCFUART_MR2_TXCTS; + } + + spin_lock_irqsave(&port->lock, flags); + writeb(MCFUART_UCR_CMDRESETRX, port->membase + MCFUART_UCR); + writeb(MCFUART_UCR_CMDRESETTX, port->membase + MCFUART_UCR); + writeb(MCFUART_UCR_CMDRESETMRPTR, port->membase + MCFUART_UCR); + writeb(mr1, port->membase + MCFUART_UMR); + writeb(mr2, port->membase + MCFUART_UMR); + writeb((baudclk & 0xff00) >> 8, port->membase + MCFUART_UBG1); + writeb((baudclk & 0xff), port->membase + MCFUART_UBG2); + writeb(MCFUART_UCSR_RXCLKTIMER | MCFUART_UCSR_TXCLKTIMER, + port->membase + MCFUART_UCSR); + writeb(MCFUART_UCR_RXENABLE | MCFUART_UCR_TXENABLE, + port->membase + MCFUART_UCR); + spin_unlock_irqrestore(&port->lock, flags); +} + +/****************************************************************************/ + +static void mcf_rx_chars(struct mcf_uart *pp) +{ + struct uart_port *port = (struct uart_port *) pp; + unsigned char status, ch, flag; + + while ((status = readb(port->membase + MCFUART_USR)) & MCFUART_USR_RXREADY) { + ch = readb(port->membase + MCFUART_URB); + flag = TTY_NORMAL; + port->icount.rx++; + + if (status & MCFUART_USR_RXERR) { + writeb(MCFUART_UCR_CMDRESETERR, + port->membase + MCFUART_UCR); + + if (status & MCFUART_USR_RXBREAK) { + port->icount.brk++; + if (uart_handle_break(port)) + continue; + } else if (status & MCFUART_USR_RXPARITY) { + port->icount.parity++; + } else if (status & MCFUART_USR_RXOVERRUN) { + port->icount.overrun++; + } else if (status & MCFUART_USR_RXFRAMING) { + port->icount.frame++; + } + + status &= port->read_status_mask; + + if (status & MCFUART_USR_RXBREAK) + flag = TTY_BREAK; + else if (status & MCFUART_USR_RXPARITY) + flag = TTY_PARITY; + else if (status & MCFUART_USR_RXFRAMING) + flag = TTY_FRAME; + } + + if (uart_handle_sysrq_char(port, ch)) + continue; + uart_insert_char(port, status, MCFUART_USR_RXOVERRUN, ch, flag); + } + + tty_flip_buffer_push(port->info->tty); +} + +/****************************************************************************/ + +static void mcf_tx_chars(struct mcf_uart *pp) +{ + struct uart_port *port = (struct uart_port *) pp; + struct circ_buf *xmit = &port->info->xmit; + + if (port->x_char) { + /* Send special char - probably flow control */ + writeb(port->x_char, port->membase + MCFUART_UTB); + port->x_char = 0; + port->icount.tx++; + return; + } + + while (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXREADY) { + if (xmit->head == xmit->tail) + break; + writeb(xmit->buf[xmit->tail], port->membase + MCFUART_UTB); + xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE -1); + port->icount.tx++; + } + + if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS) + uart_write_wakeup(port); + + if (xmit->head == xmit->tail) { + pp->imr &= ~MCFUART_UIR_TXREADY; + writeb(pp->imr, port->membase + MCFUART_UIMR); + } +} + +/****************************************************************************/ + +static irqreturn_t mcf_interrupt(int irq, void *data) +{ + struct uart_port *port = data; + struct mcf_uart *pp = (struct mcf_uart *) port; + unsigned int isr; + + isr = readb(port->membase + MCFUART_UISR) & pp->imr; + if (isr & MCFUART_UIR_RXREADY) + mcf_rx_chars(pp); + if (isr & MCFUART_UIR_TXREADY) + mcf_tx_chars(pp); + return IRQ_HANDLED; +} + +/****************************************************************************/ + +static void mcf_config_port(struct uart_port *port, int flags) +{ + port->type = PORT_MCF; + + /* Clear mask, so no surprise interrupts. */ + writeb(0, port->membase + MCFUART_UIMR); + + if (request_irq(port->irq, mcf_interrupt, IRQF_DISABLED, "UART", port)) + printk(KERN_ERR "MCF: unable to attach ColdFire UART %d " + "interrupt vector=%d\n", port->line, port->irq); +} + +/****************************************************************************/ + +static const char *mcf_type(struct uart_port *port) +{ + return (port->type == PORT_MCF) ? "ColdFire UART" : NULL; +} + +/****************************************************************************/ + +static int mcf_request_port(struct uart_port *port) +{ + /* UARTs always present */ + return 0; +} + +/****************************************************************************/ + +static void mcf_release_port(struct uart_port *port) +{ + /* Nothing to release... */ +} + +/****************************************************************************/ + +static int mcf_verify_port(struct uart_port *port, struct serial_struct *ser) +{ + if ((ser->type != PORT_UNKNOWN) && (ser->type != PORT_MCF)) + return -EINVAL; + return 0; +} + +/****************************************************************************/ + +/* + * Define the basic serial functions we support. + */ +static struct uart_ops mcf_uart_ops = { + .tx_empty = mcf_tx_empty, + .get_mctrl = mcf_get_mctrl, + .set_mctrl = mcf_set_mctrl, + .start_tx = mcf_start_tx, + .stop_tx = mcf_stop_tx, + .stop_rx = mcf_stop_rx, + .enable_ms = mcf_enable_ms, + .break_ctl = mcf_break_ctl, + .startup = mcf_startup, + .shutdown = mcf_shutdown, + .set_termios = mcf_set_termios, + .type = mcf_type, + .request_port = mcf_request_port, + .release_port = mcf_release_port, + .config_port = mcf_config_port, + .verify_port = mcf_verify_port, +}; + +static struct mcf_uart mcf_ports[3]; + +#define MCF_MAXPORTS (sizeof(mcf_ports) / sizeof(struct mcf_uart)) + +/****************************************************************************/ +#if defined(CONFIG_SERIAL_MCF_CONSOLE) +/****************************************************************************/ + +int __init early_mcf_setup(struct mcf_platform_uart *platp) +{ + struct uart_port *port; + int i; + + for (i = 0; ((i < MCF_MAXPORTS) && (platp[i].mapbase)); i++) { + port = &mcf_ports[i].port; + + port->line = i; + port->type = PORT_MCF; + port->mapbase = platp[i].mapbase; + port->membase = (platp[i].membase) ? platp[i].membase : + (unsigned char __iomem *) port->mapbase; + port->iotype = SERIAL_IO_MEM; + port->irq = platp[i].irq; + port->uartclk = MCF_BUSCLK; + port->flags = ASYNC_BOOT_AUTOCONF; + port->ops = &mcf_uart_ops; + } + + return 0; +} + +/****************************************************************************/ + +static void mcf_console_putc(struct console *co, const char c) +{ + struct uart_port *port = &(mcf_ports + co->index)->port; + int i; + + for (i = 0; (i < 0x10000); i++) { + if (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXREADY) + break; + } + writeb(c, port->membase + MCFUART_UTB); + for (i = 0; (i < 0x10000); i++) { + if (readb(port->membase + MCFUART_USR) & MCFUART_USR_TXREADY) + break; + } +} + +/****************************************************************************/ + +static void mcf_console_write(struct console *co, const char *s, unsigned int count) +{ + for (; (count); count--, s++) { + mcf_console_putc(co, *s); + if (*s == '\n') + mcf_console_putc(co, '\r'); + } +} + +/****************************************************************************/ + +static int __init mcf_console_setup(struct console *co, char *options) +{ + struct uart_port *port; + int baud = CONFIG_SERIAL_MCF_BAUDRATE; + int bits = 8; + int parity = 'n'; + int flow = 'n'; + + if ((co->index >= 0) && (co->index <= MCF_MAXPORTS)) + co->index = 0; + port = &mcf_ports[co->index].port; + if (port->membase == 0) + return -ENODEV; + + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + + return uart_set_options(port, co, baud, parity, bits, flow); +} + +/****************************************************************************/ + +static struct uart_driver mcf_driver; + +static struct console mcf_console = { + .name = "ttyS", + .write = mcf_console_write, + .device = uart_console_device, + .setup = mcf_console_setup, + .flags = CON_PRINTBUFFER, + .index = -1, + .data = &mcf_driver, +}; + +static int __init mcf_console_init(void) +{ + register_console(&mcf_console); + return 0; +} + +console_initcall(mcf_console_init); + +#define MCF_CONSOLE &mcf_console + +/****************************************************************************/ +#else +/****************************************************************************/ + +#define MCF_CONSOLE NULL + +/****************************************************************************/ +#endif /* CONFIG_MCF_CONSOLE */ +/****************************************************************************/ + +/* + * Define the mcf UART driver structure. + */ +static struct uart_driver mcf_driver = { + .owner = THIS_MODULE, + .driver_name = "mcf", + .dev_name = "ttyS", + .major = TTY_MAJOR, + .minor = 64, + .nr = MCF_MAXPORTS, + .cons = MCF_CONSOLE, +}; + +/****************************************************************************/ + +static int __devinit mcf_probe(struct platform_device *pdev) +{ + struct mcf_platform_uart *platp = pdev->dev.platform_data; + struct uart_port *port; + int i; + + for (i = 0; ((i < MCF_MAXPORTS) && (platp[i].mapbase)); i++) { + port = &mcf_ports[i].port; + + port->line = i; + port->type = PORT_MCF; + port->mapbase = platp[i].mapbase; + port->membase = (platp[i].membase) ? platp[i].membase : + (unsigned char __iomem *) platp[i].mapbase; + port->iotype = SERIAL_IO_MEM; + port->irq = platp[i].irq; + port->uartclk = MCF_BUSCLK; + port->ops = &mcf_uart_ops; + port->flags = ASYNC_BOOT_AUTOCONF; + + uart_add_one_port(&mcf_driver, port); + } + + return 0; +} + +/****************************************************************************/ + +static int mcf_remove(struct platform_device *pdev) +{ + struct uart_port *port; + int i; + + for (i = 0; (i < MCF_MAXPORTS); i++) { + port = &mcf_ports[i].port; + if (port) + uart_remove_one_port(&mcf_driver, port); + } + + return 0; +} + +/****************************************************************************/ + +static struct platform_driver mcf_platform_driver = { + .probe = mcf_probe, + .remove = __devexit_p(mcf_remove), + .driver = { + .name = "mcfuart", + .owner = THIS_MODULE, + }, +}; + +/****************************************************************************/ + +static int __init mcf_init(void) +{ + int rc; + + printk("ColdFire internal UART serial driver\n"); + + rc = uart_register_driver(&mcf_driver); + if (rc) + return rc; + rc = platform_driver_register(&mcf_platform_driver); + if (rc) + return rc; + return 0; +} + +/****************************************************************************/ + +static void __exit mcf_exit(void) +{ + platform_driver_unregister(&mcf_platform_driver); + uart_unregister_driver(&mcf_driver); +} + +/****************************************************************************/ + +module_init(mcf_init); +module_exit(mcf_exit); + +MODULE_AUTHOR("Greg Ungerer <gerg@snapgear.com>"); +MODULE_DESCRIPTION("Freescale ColdFire UART driver"); +MODULE_LICENSE("GPL"); + +/****************************************************************************/ diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 90d64a8..8bdaa15 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -11,9 +11,9 @@ #include <linux/timer.h> #include <linux/ctype.h> #include <linux/device.h> +#include <linux/scatterlist.h> #include <linux/usb/quirks.h> #include <asm/byteorder.h> -#include <asm/scatterlist.h> #include "hcd.h" /* for usbcore internals */ #include "usb.h" diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig new file mode 100644 index 0000000..9e33fc4 --- /dev/null +++ b/drivers/virtio/Kconfig @@ -0,0 +1,8 @@ +# Virtio always gets selected by whoever wants it. +config VIRTIO + bool + +# Similarly the virtio ring implementation. +config VIRTIO_RING + bool + depends on VIRTIO diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile new file mode 100644 index 0000000..f70e409 --- /dev/null +++ b/drivers/virtio/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_VIRTIO) += virtio.o +obj-$(CONFIG_VIRTIO_RING) += virtio_ring.o diff --git a/drivers/virtio/config.c b/drivers/virtio/config.c new file mode 100644 index 0000000..983d482 --- /dev/null +++ b/drivers/virtio/config.c @@ -0,0 +1,13 @@ +/* Configuration space parsing helpers for virtio. + * + * The configuration is [type][len][... len bytes ...] fields. + * + * Copyright 2007 Rusty Russell, IBM Corporation. + * GPL v2 or later. + */ +#include <linux/err.h> +#include <linux/virtio.h> +#include <linux/virtio_config.h> +#include <linux/bug.h> +#include <asm/system.h> + diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c new file mode 100644 index 0000000..15d7787 --- /dev/null +++ b/drivers/virtio/virtio.c @@ -0,0 +1,189 @@ +#include <linux/virtio.h> +#include <linux/spinlock.h> +#include <linux/virtio_config.h> + +static ssize_t device_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + return sprintf(buf, "%hu", dev->id.device); +} +static ssize_t vendor_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + return sprintf(buf, "%hu", dev->id.vendor); +} +static ssize_t status_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + return sprintf(buf, "0x%08x", dev->config->get_status(dev)); +} +static ssize_t modalias_show(struct device *_d, + struct device_attribute *attr, char *buf) +{ + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + + return sprintf(buf, "virtio:d%08Xv%08X\n", + dev->id.device, dev->id.vendor); +} +static struct device_attribute virtio_dev_attrs[] = { + __ATTR_RO(device), + __ATTR_RO(vendor), + __ATTR_RO(status), + __ATTR_RO(modalias), + __ATTR_NULL +}; + +static inline int virtio_id_match(const struct virtio_device *dev, + const struct virtio_device_id *id) +{ + if (id->device != dev->id.device) + return 0; + + return id->vendor == VIRTIO_DEV_ANY_ID || id->vendor != dev->id.vendor; +} + +/* This looks through all the IDs a driver claims to support. If any of them + * match, we return 1 and the kernel will call virtio_dev_probe(). */ +static int virtio_dev_match(struct device *_dv, struct device_driver *_dr) +{ + unsigned int i; + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev); + const struct virtio_device_id *ids; + + ids = container_of(_dr, struct virtio_driver, driver)->id_table; + for (i = 0; ids[i].device; i++) + if (virtio_id_match(dev, &ids[i])) + return 1; + return 0; +} + +static int virtio_uevent(struct device *_dv, struct kobj_uevent_env *env) +{ + struct virtio_device *dev = container_of(_dv,struct virtio_device,dev); + + return add_uevent_var(env, "MODALIAS=virtio:d%08Xv%08X", + dev->id.device, dev->id.vendor); +} + +static struct bus_type virtio_bus = { + .name = "virtio", + .match = virtio_dev_match, + .dev_attrs = virtio_dev_attrs, + .uevent = virtio_uevent, +}; + +static void add_status(struct virtio_device *dev, unsigned status) +{ + dev->config->set_status(dev, dev->config->get_status(dev) | status); +} + +static int virtio_dev_probe(struct device *_d) +{ + int err; + struct virtio_device *dev = container_of(_d,struct virtio_device,dev); + struct virtio_driver *drv = container_of(dev->dev.driver, + struct virtio_driver, driver); + + add_status(dev, VIRTIO_CONFIG_S_DRIVER); + err = drv->probe(dev); + if (err) + add_status(dev, VIRTIO_CONFIG_S_FAILED); + else + add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK); + return err; +} + +int register_virtio_driver(struct virtio_driver *driver) +{ + driver->driver.bus = &virtio_bus; + driver->driver.probe = virtio_dev_probe; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL_GPL(register_virtio_driver); + +void unregister_virtio_driver(struct virtio_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(unregister_virtio_driver); + +int register_virtio_device(struct virtio_device *dev) +{ + int err; + + dev->dev.bus = &virtio_bus; + sprintf(dev->dev.bus_id, "%u", dev->index); + + /* Acknowledge that we've seen the device. */ + add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + /* device_register() causes the bus infrastructure to look for a + * matching driver. */ + err = device_register(&dev->dev); + if (err) + add_status(dev, VIRTIO_CONFIG_S_FAILED); + return err; +} +EXPORT_SYMBOL_GPL(register_virtio_device); + +void unregister_virtio_device(struct virtio_device *dev) +{ + device_unregister(&dev->dev); +} +EXPORT_SYMBOL_GPL(unregister_virtio_device); + +int __virtio_config_val(struct virtio_device *vdev, + u8 type, void *val, size_t size) +{ + void *token; + unsigned int len; + + token = vdev->config->find(vdev, type, &len); + if (!token) + return -ENOENT; + + if (len != size) + return -EIO; + + vdev->config->get(vdev, token, val, size); + return 0; +} +EXPORT_SYMBOL_GPL(__virtio_config_val); + +int virtio_use_bit(struct virtio_device *vdev, + void *token, unsigned int len, unsigned int bitnum) +{ + unsigned long bits[16]; + + /* This makes it convenient to pass-through find() results. */ + if (!token) + return 0; + + /* bit not in range of this bitfield? */ + if (bitnum * 8 >= len / 2) + return 0; + + /* Giant feature bitfields are silly. */ + BUG_ON(len > sizeof(bits)); + vdev->config->get(vdev, token, bits, len); + + if (!test_bit(bitnum, bits)) + return 0; + + /* Set acknowledge bit, and write it back. */ + set_bit(bitnum + len * 8 / 2, bits); + vdev->config->set(vdev, token, bits, len); + return 1; +} +EXPORT_SYMBOL_GPL(virtio_use_bit); + +static int virtio_init(void) +{ + if (bus_register(&virtio_bus) != 0) + panic("virtio bus registration failed"); + return 0; +} +core_initcall(virtio_init); diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c new file mode 100644 index 0000000..0e4baca --- /dev/null +++ b/drivers/virtio/virtio_ring.c @@ -0,0 +1,313 @@ +/* Virtio ring implementation. + * + * Copyright 2007 Rusty Russell IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/virtio.h> +#include <linux/virtio_ring.h> +#include <linux/device.h> + +#ifdef DEBUG +/* For development, we want to crash whenever the ring is screwed. */ +#define BAD_RING(vq, fmt...) \ + do { dev_err(&vq->vq.vdev->dev, fmt); BUG(); } while(0) +#define START_USE(vq) \ + do { if ((vq)->in_use) panic("in_use = %i\n", (vq)->in_use); (vq)->in_use = __LINE__; mb(); } while(0) +#define END_USE(vq) \ + do { BUG_ON(!(vq)->in_use); (vq)->in_use = 0; mb(); } while(0) +#else +#define BAD_RING(vq, fmt...) \ + do { dev_err(&vq->vq.vdev->dev, fmt); (vq)->broken = true; } while(0) +#define START_USE(vq) +#define END_USE(vq) +#endif + +struct vring_virtqueue +{ + struct virtqueue vq; + + /* Actual memory layout for this queue */ + struct vring vring; + + /* Other side has made a mess, don't try any more. */ + bool broken; + + /* Number of free buffers */ + unsigned int num_free; + /* Head of free buffer list. */ + unsigned int free_head; + /* Number we've added since last sync. */ + unsigned int num_added; + + /* Last used index we've seen. */ + unsigned int last_used_idx; + + /* How to notify other side. FIXME: commonalize hcalls! */ + void (*notify)(struct virtqueue *vq); + +#ifdef DEBUG + /* They're supposed to lock for us. */ + unsigned int in_use; +#endif + + /* Tokens for callbacks. */ + void *data[]; +}; + +#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) + +static int vring_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i, avail, head, uninitialized_var(prev); + + BUG_ON(data == NULL); + BUG_ON(out + in > vq->vring.num); + BUG_ON(out + in == 0); + + START_USE(vq); + + if (vq->num_free < out + in) { + pr_debug("Can't add buf len %i - avail = %i\n", + out + in, vq->num_free); + END_USE(vq); + return -ENOSPC; + } + + /* We're about to use some buffers from the free list. */ + vq->num_free -= out + in; + + head = vq->free_head; + for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) + + sg->offset; + vq->vring.desc[i].len = sg->length; + prev = i; + sg++; + } + for (; in; i = vq->vring.desc[i].next, in--) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + vq->vring.desc[i].addr = (page_to_pfn(sg_page(sg))<<PAGE_SHIFT) + + sg->offset; + vq->vring.desc[i].len = sg->length; + prev = i; + sg++; + } + /* Last one doesn't continue. */ + vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; + + /* Update free pointer */ + vq->free_head = i; + + /* Set token. */ + vq->data[head] = data; + + /* Put entry in available array (but don't update avail->idx until they + * do sync). FIXME: avoid modulus here? */ + avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num; + vq->vring.avail->ring[avail] = head; + + pr_debug("Added buffer head %i to %p\n", head, vq); + END_USE(vq); + return 0; +} + +static void vring_kick(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + START_USE(vq); + /* Descriptors and available array need to be set before we expose the + * new available array entries. */ + wmb(); + + vq->vring.avail->idx += vq->num_added; + vq->num_added = 0; + + /* Need to update avail index before checking if we should notify */ + mb(); + + if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY)) + /* Prod other side to tell it about changes. */ + vq->notify(&vq->vq); + + END_USE(vq); +} + +static void detach_buf(struct vring_virtqueue *vq, unsigned int head) +{ + unsigned int i; + + /* Clear data ptr. */ + vq->data[head] = NULL; + + /* Put back on free list: find end */ + i = head; + while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) { + i = vq->vring.desc[i].next; + vq->num_free++; + } + + vq->vring.desc[i].next = vq->free_head; + vq->free_head = head; + /* Plus final descriptor */ + vq->num_free++; +} + +/* FIXME: We need to tell other side about removal, to synchronize. */ +static void vring_shutdown(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + unsigned int i; + + for (i = 0; i < vq->vring.num; i++) + detach_buf(vq, i); +} + +static inline bool more_used(const struct vring_virtqueue *vq) +{ + return vq->last_used_idx != vq->vring.used->idx; +} + +static void *vring_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + void *ret; + unsigned int i; + + START_USE(vq); + + if (!more_used(vq)) { + pr_debug("No more buffers in queue\n"); + END_USE(vq); + return NULL; + } + + i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id; + *len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len; + + if (unlikely(i >= vq->vring.num)) { + BAD_RING(vq, "id %u out of range\n", i); + return NULL; + } + if (unlikely(!vq->data[i])) { + BAD_RING(vq, "id %u is not a head!\n", i); + return NULL; + } + + /* detach_buf clears data, so grab it now. */ + ret = vq->data[i]; + detach_buf(vq, i); + vq->last_used_idx++; + END_USE(vq); + return ret; +} + +static bool vring_restart(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + START_USE(vq); + BUG_ON(!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; + mb(); + if (unlikely(more_used(vq))) { + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} + +irqreturn_t vring_interrupt(int irq, void *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + if (!more_used(vq)) { + pr_debug("virtqueue interrupt with no work for %p\n", vq); + return IRQ_NONE; + } + + if (unlikely(vq->broken)) + return IRQ_HANDLED; + + pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); + if (vq->vq.callback && !vq->vq.callback(&vq->vq)) + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + + return IRQ_HANDLED; +} + +static struct virtqueue_ops vring_vq_ops = { + .add_buf = vring_add_buf, + .get_buf = vring_get_buf, + .kick = vring_kick, + .restart = vring_restart, + .shutdown = vring_shutdown, +}; + +struct virtqueue *vring_new_virtqueue(unsigned int num, + struct virtio_device *vdev, + void *pages, + void (*notify)(struct virtqueue *), + bool (*callback)(struct virtqueue *)) +{ + struct vring_virtqueue *vq; + unsigned int i; + + vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); + if (!vq) + return NULL; + + vring_init(&vq->vring, num, pages); + vq->vq.callback = callback; + vq->vq.vdev = vdev; + vq->vq.vq_ops = &vring_vq_ops; + vq->notify = notify; + vq->broken = false; + vq->last_used_idx = 0; + vq->num_added = 0; +#ifdef DEBUG + vq->in_use = false; +#endif + + /* No callback? Tell other side not to bother us. */ + if (!callback) + vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; + + /* Put everything in free lists. */ + vq->num_free = num; + vq->free_head = 0; + for (i = 0; i < num-1; i++) + vq->vring.desc[i].next = i+1; + + return &vq->vq; +} + +void vring_del_virtqueue(struct virtqueue *vq) +{ + kfree(to_vvq(vq)); +} + diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 873802d..756f7e9 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -162,6 +162,7 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses) if (*e != '\0') v9ses->uid = ~0; } + kfree(s); break; default: diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 175b4d9..23581bc 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -687,10 +687,10 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, retval = p9_client_wstat(oldfid, &wstat); clunk_newdir: - p9_client_clunk(olddirfid); + p9_client_clunk(newdirfid); clunk_olddir: - p9_client_clunk(newdirfid); + p9_client_clunk(olddirfid); done: return retval; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8ec9323..9728614 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -228,11 +228,28 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type) return acl; } +static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *acl) +{ + char *value = NULL; + size_t size = 0; + int rc; + + if (acl) { + value = jffs2_acl_to_medium(acl, &size); + if (IS_ERR(value)) + return PTR_ERR(value); + } + rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0); + if (!value && rc == -ENODATA) + rc = 0; + kfree(value); + + return rc; +} + static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) { struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - size_t size = 0; - char *value = NULL; int rc, xprefix; if (S_ISLNK(inode->i_mode)) @@ -267,17 +284,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) default: return -EINVAL; } - if (acl) { - value = jffs2_acl_to_medium(acl, &size); - if (IS_ERR(value)) - return PTR_ERR(value); - } - - rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0); - if (!value && rc == -ENODATA) - rc = 0; - if (value) - kfree(value); + rc = __jffs2_set_acl(inode, xprefix, acl); if (!rc) { switch(type) { case ACL_TYPE_ACCESS: @@ -312,37 +319,59 @@ int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd) return generic_permission(inode, mask, jffs2_check_acl); } -int jffs2_init_acl(struct inode *inode, struct posix_acl *acl) +int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) { struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); - struct posix_acl *clone; - mode_t mode; - int rc = 0; + struct posix_acl *acl, *clone; + int rc; - f->i_acl_access = JFFS2_ACL_NOT_CACHED; - f->i_acl_default = JFFS2_ACL_NOT_CACHED; + f->i_acl_default = NULL; + f->i_acl_access = NULL; + + if (S_ISLNK(*i_mode)) + return 0; /* Symlink always has no-ACL */ + + acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) { + *i_mode &= ~current->fs->umask; + } else { + if (S_ISDIR(*i_mode)) + jffs2_iset_acl(inode, &f->i_acl_default, acl); - if (acl) { - if (S_ISDIR(inode->i_mode)) { - rc = jffs2_set_acl(inode, ACL_TYPE_DEFAULT, acl); - if (rc) - goto cleanup; - } clone = posix_acl_clone(acl, GFP_KERNEL); - rc = -ENOMEM; if (!clone) - goto cleanup; - mode = inode->i_mode; - rc = posix_acl_create_masq(clone, &mode); - if (rc >= 0) { - inode->i_mode = mode; - if (rc > 0) - rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, clone); - } + return -ENOMEM; + rc = posix_acl_create_masq(clone, (mode_t *)i_mode); + if (rc < 0) + return rc; + if (rc > 0) + jffs2_iset_acl(inode, &f->i_acl_access, clone); + posix_acl_release(clone); } - cleanup: - posix_acl_release(acl); + return 0; +} + +int jffs2_init_acl_post(struct inode *inode) +{ + struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); + int rc; + + if (f->i_acl_default) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_DEFAULT, f->i_acl_default); + if (rc) + return rc; + } + + if (f->i_acl_access) { + rc = __jffs2_set_acl(inode, JFFS2_XPREFIX_ACL_ACCESS, f->i_acl_access); + if (rc) + return rc; + } + return rc; } diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 90a2dbf..76c6ebd1 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -31,7 +31,8 @@ struct jffs2_acl_header { extern struct posix_acl *jffs2_get_acl(struct inode *inode, int type); extern int jffs2_permission(struct inode *, int, struct nameidata *); extern int jffs2_acl_chmod(struct inode *); -extern int jffs2_init_acl(struct inode *, struct posix_acl *); +extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); +extern int jffs2_init_acl_post(struct inode *); extern void jffs2_clear_acl(struct jffs2_inode_info *); extern struct xattr_handler jffs2_acl_access_xattr_handler; @@ -39,10 +40,11 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_get_acl(inode, type) (NULL) -#define jffs2_permission NULL -#define jffs2_acl_chmod(inode) (0) -#define jffs2_init_acl(inode,dir) (0) +#define jffs2_get_acl(inode, type) (NULL) +#define jffs2_permission (NULL) +#define jffs2_acl_chmod(inode) (0) +#define jffs2_init_acl_pre(dir_i,inode,mode) (0) +#define jffs2_init_acl_post(inode) (0) #define jffs2_clear_acl(f) #endif /* CONFIG_JFFS2_FS_POSIX_ACL */ diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 8353eb9..787e392 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -182,7 +182,6 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; struct inode *inode; - struct posix_acl *acl; int ret; ri = jffs2_alloc_raw_inode(); @@ -193,7 +192,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, D1(printk(KERN_DEBUG "jffs2_create()\n")); - inode = jffs2_new_inode(dir_i, mode, ri, &acl); + inode = jffs2_new_inode(dir_i, mode, ri); if (IS_ERR(inode)) { D1(printk(KERN_DEBUG "jffs2_new_inode() failed\n")); @@ -211,14 +210,6 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, ret = jffs2_do_create(c, dir_f, f, ri, dentry->d_name.name, dentry->d_name.len); - - if (ret) - goto fail_acl; - - ret = jffs2_init_security(inode, dir_i); - if (ret) - goto fail_acl; - ret = jffs2_init_acl(inode, acl); if (ret) goto fail; @@ -231,8 +222,6 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages)); return 0; - fail_acl: - posix_acl_release(acl); fail: make_bad_inode(inode); iput(inode); @@ -309,7 +298,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char struct jffs2_full_dirent *fd; int namelen; uint32_t alloclen; - struct posix_acl *acl; int ret, targetlen = strlen(target); /* FIXME: If you care. We'd need to use frags for the target @@ -336,7 +324,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char return ret; } - inode = jffs2_new_inode(dir_i, S_IFLNK | S_IRWXUGO, ri, &acl); + inode = jffs2_new_inode(dir_i, S_IFLNK | S_IRWXUGO, ri); if (IS_ERR(inode)) { jffs2_free_raw_inode(ri); @@ -366,7 +354,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char up(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); - posix_acl_release(acl); return PTR_ERR(fn); } @@ -377,7 +364,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char up(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); - posix_acl_release(acl); return -ENOMEM; } @@ -395,10 +381,9 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char ret = jffs2_init_security(inode, dir_i); if (ret) { jffs2_clear_inode(inode); - posix_acl_release(acl); return ret; } - ret = jffs2_init_acl(inode, acl); + ret = jffs2_init_acl_post(inode); if (ret) { jffs2_clear_inode(inode); return ret; @@ -476,7 +461,6 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) struct jffs2_full_dirent *fd; int namelen; uint32_t alloclen; - struct posix_acl *acl; int ret; mode |= S_IFDIR; @@ -499,7 +483,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) return ret; } - inode = jffs2_new_inode(dir_i, mode, ri, &acl); + inode = jffs2_new_inode(dir_i, mode, ri); if (IS_ERR(inode)) { jffs2_free_raw_inode(ri); @@ -526,7 +510,6 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) up(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); - posix_acl_release(acl); return PTR_ERR(fn); } /* No data here. Only a metadata node, which will be @@ -540,10 +523,9 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) ret = jffs2_init_security(inode, dir_i); if (ret) { jffs2_clear_inode(inode); - posix_acl_release(acl); return ret; } - ret = jffs2_init_acl(inode, acl); + ret = jffs2_init_acl_post(inode); if (ret) { jffs2_clear_inode(inode); return ret; @@ -639,7 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de union jffs2_device_node dev; int devlen = 0; uint32_t alloclen; - struct posix_acl *acl; int ret; if (!new_valid_dev(rdev)) @@ -666,7 +647,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de return ret; } - inode = jffs2_new_inode(dir_i, mode, ri, &acl); + inode = jffs2_new_inode(dir_i, mode, ri); if (IS_ERR(inode)) { jffs2_free_raw_inode(ri); @@ -695,7 +676,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de up(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); - posix_acl_release(acl); return PTR_ERR(fn); } /* No data here. Only a metadata node, which will be @@ -709,10 +689,9 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de ret = jffs2_init_security(inode, dir_i); if (ret) { jffs2_clear_inode(inode); - posix_acl_release(acl); return ret; } - ret = jffs2_init_acl(inode, acl); + ret = jffs2_init_acl_post(inode); if (ret) { jffs2_clear_inode(inode); return ret; diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 023a175..f9c5dd6 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -255,7 +255,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, _whole_ page. This helps to reduce the number of nodes in files which have many short writes, like syslog files. */ - start = aligned_start = 0; + aligned_start = 0; } ri = jffs2_alloc_raw_inode(); @@ -291,14 +291,11 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, } /* Adjust writtenlen for the padding we did, so we don't confuse our caller */ - if (writtenlen < (start&3)) - writtenlen = 0; - else - writtenlen -= (start&3); + writtenlen -= min(writtenlen, (start - aligned_start)); if (writtenlen) { - if (inode->i_size < (pg->index << PAGE_CACHE_SHIFT) + start + writtenlen) { - inode->i_size = (pg->index << PAGE_CACHE_SHIFT) + start + writtenlen; + if (inode->i_size < pos + writtenlen) { + inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index ed85f9a..d2e06f7 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -402,8 +402,7 @@ void jffs2_write_super (struct super_block *sb) /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, fill in the raw_inode while you're at it. */ -struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri, - struct posix_acl **acl) +struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) { struct inode *inode; struct super_block *sb = dir_i->i_sb; @@ -438,19 +437,11 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i /* POSIX ACLs have to be processed now, at least partly. The umask is only applied if there's no default ACL */ - if (!S_ISLNK(mode)) { - *acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT); - if (IS_ERR(*acl)) { - make_bad_inode(inode); - iput(inode); - inode = (void *)*acl; - *acl = NULL; - return inode; - } - if (!(*acl)) - mode &= ~current->fs->umask; - } else { - *acl = NULL; + ret = jffs2_init_acl_pre(dir_i, inode, &mode); + if (ret) { + make_bad_inode(inode); + iput(inode); + return ERR_PTR(ret); } ret = jffs2_do_new_inode (c, f, mode, ri); if (ret) { diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index f6743a9..bf64686 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -173,15 +173,13 @@ int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long); extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ -struct posix_acl; - int jffs2_setattr (struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); void jffs2_read_inode (struct inode *); void jffs2_clear_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode); struct inode *jffs2_new_inode (struct inode *dir_i, int mode, - struct jffs2_raw_inode *ri, struct posix_acl **acl); + struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); void jffs2_write_super (struct super_block *); int jffs2_remount_fs (struct super_block *, int *, char *); diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index 2f56954..147e2cb 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -465,6 +465,14 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str up(&f->sem); jffs2_complete_reservation(c); + + ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); + if (ret) + return ret; + ret = jffs2_init_acl_post(&f->vfs_inode); + if (ret) + return ret; + ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); diff --git a/include/asm-arm/dma-mapping.h b/include/asm-arm/dma-mapping.h index 1eb8aac..e99406a 100644 --- a/include/asm-arm/dma-mapping.h +++ b/include/asm-arm/dma-mapping.h @@ -5,7 +5,7 @@ #include <linux/mm.h> /* need struct page */ -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> /* * DMA-consistent mapping functions. These allocate/free a region of @@ -274,8 +274,8 @@ dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, for (i = 0; i < nents; i++, sg++) { char *virt; - sg->dma_address = page_to_dma(dev, sg->page) + sg->offset; - virt = page_address(sg->page) + sg->offset; + sg->dma_address = page_to_dma(dev, sg_page(sg)) + sg->offset; + virt = sg_virt(sg); if (!arch_is_coherent()) dma_cache_maint(virt, sg->length, dir); @@ -371,7 +371,7 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents, int i; for (i = 0; i < nents; i++, sg++) { - char *virt = page_address(sg->page) + sg->offset; + char *virt = sg_virt(sg); if (!arch_is_coherent()) dma_cache_maint(virt, sg->length, dir); } @@ -384,7 +384,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents, int i; for (i = 0; i < nents; i++, sg++) { - char *virt = page_address(sg->page) + sg->offset; + char *virt = sg_virt(sg); if (!arch_is_coherent()) dma_cache_maint(virt, sg->length, dir); } diff --git a/include/asm-avr32/arch-at32ap/board.h b/include/asm-avr32/arch-at32ap/board.h index 7dbd603..d6993a6 100644 --- a/include/asm-avr32/arch-at32ap/board.h +++ b/include/asm-avr32/arch-at32ap/board.h @@ -44,6 +44,13 @@ struct usba_platform_data { struct platform_device * at32_add_device_usba(unsigned int id, struct usba_platform_data *data); +struct ide_platform_data { + u8 cs; +}; +struct platform_device * +at32_add_device_ide(unsigned int id, unsigned int extint, + struct ide_platform_data *data); + /* depending on what's hooked up, not all SSC pins will be used */ #define ATMEL_SSC_TK 0x01 #define ATMEL_SSC_TF 0x02 @@ -58,4 +65,20 @@ at32_add_device_usba(unsigned int id, struct usba_platform_data *data); struct platform_device * at32_add_device_ssc(unsigned int id, unsigned int flags); +struct platform_device *at32_add_device_twi(unsigned int id); +struct platform_device *at32_add_device_mci(unsigned int id); +struct platform_device *at32_add_device_ac97c(unsigned int id); +struct platform_device *at32_add_device_abdac(unsigned int id); + +struct cf_platform_data { + int detect_pin; + int reset_pin; + int vcc_pin; + int ready_pin; + u8 cs; +}; +struct platform_device * +at32_add_device_cf(unsigned int id, unsigned int extint, + struct cf_platform_data *data); + #endif /* __ASM_ARCH_BOARD_H */ diff --git a/include/asm-avr32/dma-mapping.h b/include/asm-avr32/dma-mapping.h index 81e3426..a713163 100644 --- a/include/asm-avr32/dma-mapping.h +++ b/include/asm-avr32/dma-mapping.h @@ -217,8 +217,8 @@ dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, for (i = 0; i < nents; i++) { char *virt; - sg[i].dma_address = page_to_bus(sg[i].page) + sg[i].offset; - virt = page_address(sg[i].page) + sg[i].offset; + sg[i].dma_address = page_to_bus(sg_page(&sg[i])) + sg[i].offset; + virt = sg_virt(&sg[i]); dma_cache_sync(dev, virt, sg[i].length, direction); } @@ -327,8 +327,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int i; for (i = 0; i < nents; i++) { - dma_cache_sync(dev, page_address(sg[i].page) + sg[i].offset, - sg[i].length, direction); + dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, direction); } } diff --git a/include/asm-blackfin/scatterlist.h b/include/asm-blackfin/scatterlist.h index 32128d5..04f4487 100644 --- a/include/asm-blackfin/scatterlist.h +++ b/include/asm-blackfin/scatterlist.h @@ -20,7 +20,6 @@ struct scatterlist { * returns, or alternatively stop on the first sg_dma_len(sg) which * is 0. */ -#define sg_address(sg) (page_address((sg)->page) + (sg)->offset) #define sg_dma_address(sg) ((sg)->dma_address) #define sg_dma_len(sg) ((sg)->length) diff --git a/include/asm-frv/scatterlist.h b/include/asm-frv/scatterlist.h index f7da007..99ba76e 100644 --- a/include/asm-frv/scatterlist.h +++ b/include/asm-frv/scatterlist.h @@ -4,19 +4,19 @@ #include <asm/types.h> /* - * Drivers must set either ->address or (preferred) ->page and ->offset + * Drivers must set either ->address or (preferred) page and ->offset * to indicate where data must be transferred to/from. * - * Using ->page is recommended since it handles highmem data as well as + * Using page is recommended since it handles highmem data as well as * low mem. ->address is restricted to data which has a virtual mapping, and - * it will go away in the future. Updating to ->page can be automated very + * it will go away in the future. Updating to page can be automated very * easily -- something like * * sg->address = some_ptr; * * can be rewritten as * - * sg->page = virt_to_page(some_ptr); + * sg_set_page(virt_to_page(some_ptr)); * sg->offset = (unsigned long) some_ptr & ~PAGE_MASK; * * and that's it. There's no excuse for not highmem enabling YOUR driver. /jens diff --git a/include/asm-m68knommu/module.h b/include/asm-m68knommu/module.h index 57e95cc..2e45ab5 100644 --- a/include/asm-m68knommu/module.h +++ b/include/asm-m68knommu/module.h @@ -1 +1,11 @@ -#include <asm-m68k/module.h> +#ifndef ASM_M68KNOMMU_MODULE_H +#define ASM_M68KNOMMU_MODULE_H + +struct mod_arch_specific { +}; + +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr + +#endif /* ASM_M68KNOMMU_MODULE_H */ diff --git a/include/asm-m68knommu/scatterlist.h b/include/asm-m68knommu/scatterlist.h index 1094284..afc4788 100644 --- a/include/asm-m68knommu/scatterlist.h +++ b/include/asm-m68knommu/scatterlist.h @@ -14,7 +14,6 @@ struct scatterlist { unsigned int length; }; -#define sg_address(sg) (page_address((sg)->page) + (sg)->offset) #define sg_dma_address(sg) ((sg)->dma_address) #define sg_dma_len(sg) ((sg)->length) diff --git a/include/asm-m68knommu/uaccess.h b/include/asm-m68knommu/uaccess.h index 9ed9169..68bbe9b 100644 --- a/include/asm-m68knommu/uaccess.h +++ b/include/asm-m68knommu/uaccess.h @@ -170,10 +170,12 @@ static inline long strnlen_user(const char *src, long n) */ static inline unsigned long -clear_user(void *to, unsigned long n) +__clear_user(void *to, unsigned long n) { memset(to, 0, n); return 0; } +#define clear_user(to,n) __clear_user(to,n) + #endif /* _M68KNOMMU_UACCESS_H */ diff --git a/include/asm-parisc/scatterlist.h b/include/asm-parisc/scatterlist.h index cd3cfdf..62269b3 100644 --- a/include/asm-parisc/scatterlist.h +++ b/include/asm-parisc/scatterlist.h @@ -18,7 +18,7 @@ struct scatterlist { __u32 iova_length; /* bytes mapped */ }; -#define sg_virt_addr(sg) ((unsigned long)(page_address(sg->page) + sg->offset)) +#define sg_virt_addr(sg) ((unsigned long)sg_virt(sg)) #define sg_dma_address(sg) ((sg)->iova) #define sg_dma_len(sg) ((sg)->iova_length) diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h index 65be95d..ff52013 100644 --- a/include/asm-powerpc/dma-mapping.h +++ b/include/asm-powerpc/dma-mapping.h @@ -285,9 +285,9 @@ dma_map_sg(struct device *dev, struct scatterlist *sgl, int nents, BUG_ON(direction == DMA_NONE); for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg->page); - __dma_sync_page(sg->page, sg->offset, sg->length, direction); - sg->dma_address = page_to_bus(sg->page) + sg->offset; + BUG_ON(!sg_page(sg)); + __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); + sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; } return nents; @@ -328,7 +328,7 @@ static inline void dma_sync_sg_for_cpu(struct device *dev, BUG_ON(direction == DMA_NONE); for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg->page, sg->offset, sg->length, direction); + __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); } static inline void dma_sync_sg_for_device(struct device *dev, @@ -341,7 +341,7 @@ static inline void dma_sync_sg_for_device(struct device *dev, BUG_ON(direction == DMA_NONE); for_each_sg(sgl, sg, nents, i) - __dma_sync_page(sg->page, sg->offset, sg->length, direction); + __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); } static inline int dma_mapping_error(dma_addr_t dma_addr) diff --git a/include/asm-sh/dma-mapping.h b/include/asm-sh/dma-mapping.h index 84fefda..fcea067 100644 --- a/include/asm-sh/dma-mapping.h +++ b/include/asm-sh/dma-mapping.h @@ -2,7 +2,7 @@ #define __ASM_SH_DMA_MAPPING_H #include <linux/mm.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include <asm/cacheflush.h> #include <asm/io.h> @@ -85,10 +85,9 @@ static inline int dma_map_sg(struct device *dev, struct scatterlist *sg, for (i = 0; i < nents; i++) { #if !defined(CONFIG_PCI) || defined(CONFIG_SH_PCIDMA_NONCOHERENT) - dma_cache_sync(dev, page_address(sg[i].page) + sg[i].offset, - sg[i].length, dir); + dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, dir); #endif - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; + sg[i].dma_address = sg_phys(&sg[i]); } return nents; @@ -138,10 +137,9 @@ static inline void dma_sync_sg(struct device *dev, struct scatterlist *sg, for (i = 0; i < nelems; i++) { #if !defined(CONFIG_PCI) || defined(CONFIG_SH_PCIDMA_NONCOHERENT) - dma_cache_sync(dev, page_address(sg[i].page) + sg[i].offset, - sg[i].length, dir); + dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, dir); #endif - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; + sg[i].dma_address = sg_phys(&sg[i]); } } diff --git a/include/asm-sh64/dma-mapping.h b/include/asm-sh64/dma-mapping.h index e661857..1438b76 100644 --- a/include/asm-sh64/dma-mapping.h +++ b/include/asm-sh64/dma-mapping.h @@ -2,7 +2,7 @@ #define __ASM_SH_DMA_MAPPING_H #include <linux/mm.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include <asm/io.h> struct pci_dev; @@ -71,10 +71,9 @@ static inline int dma_map_sg(struct device *dev, struct scatterlist *sg, for (i = 0; i < nents; i++) { #if !defined(CONFIG_PCI) || defined(CONFIG_SH_PCIDMA_NONCOHERENT) - dma_cache_sync(dev, page_address(sg[i].page) + sg[i].offset, - sg[i].length, dir); + dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, dir); #endif - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; + sg[i].dma_address = sg_phys(&sg[i]); } return nents; @@ -124,10 +123,9 @@ static inline void dma_sync_sg(struct device *dev, struct scatterlist *sg, for (i = 0; i < nelems; i++) { #if !defined(CONFIG_PCI) || defined(CONFIG_SH_PCIDMA_NONCOHERENT) - dma_cache_sync(dev, page_address(sg[i].page) + sg[i].offset, - sg[i].length, dir); + dma_cache_sync(dev, sg_virt(&sg[i]), sg[i].length, dir); #endif - sg[i].dma_address = page_to_phys(sg[i].page) + sg[i].offset; + sg[i].dma_address = sg_phys(&sg[i]); } } diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild index 559830e..5e3539c 100644 --- a/include/asm-x86/Kbuild +++ b/include/asm-x86/Kbuild @@ -1,6 +1,7 @@ include include/asm-generic/Kbuild.asm header-y += boot.h +header-y += bootparam.h header-y += debugreg.h header-y += ldt.h header-y += msr-index.h @@ -14,8 +15,10 @@ unifdef-y += a.out_32.h unifdef-y += a.out_64.h unifdef-y += byteorder_32.h unifdef-y += byteorder_64.h +unifdef-y += e820.h unifdef-y += elf_32.h unifdef-y += elf_64.h +unifdef-y += ist.h unifdef-y += mce.h unifdef-y += msgbuf_32.h unifdef-y += msgbuf_64.h diff --git a/include/asm-x86/bootparam.h b/include/asm-x86/bootparam.h index dc031cf..19f3ddf 100644 --- a/include/asm-x86/bootparam.h +++ b/include/asm-x86/bootparam.h @@ -10,85 +10,85 @@ #include <video/edid.h> struct setup_header { - u8 setup_sects; - u16 root_flags; - u32 syssize; - u16 ram_size; + __u8 setup_sects; + __u16 root_flags; + __u32 syssize; + __u16 ram_size; #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 #define RAMDISK_LOAD_FLAG 0x4000 - u16 vid_mode; - u16 root_dev; - u16 boot_flag; - u16 jump; - u32 header; - u16 version; - u32 realmode_swtch; - u16 start_sys; - u16 kernel_version; - u8 type_of_loader; - u8 loadflags; + __u16 vid_mode; + __u16 root_dev; + __u16 boot_flag; + __u16 jump; + __u32 header; + __u16 version; + __u32 realmode_swtch; + __u16 start_sys; + __u16 kernel_version; + __u8 type_of_loader; + __u8 loadflags; #define LOADED_HIGH (1<<0) #define KEEP_SEGMENTS (1<<6) #define CAN_USE_HEAP (1<<7) - u16 setup_move_size; - u32 code32_start; - u32 ramdisk_image; - u32 ramdisk_size; - u32 bootsect_kludge; - u16 heap_end_ptr; - u16 _pad1; - u32 cmd_line_ptr; - u32 initrd_addr_max; - u32 kernel_alignment; - u8 relocatable_kernel; - u8 _pad2[3]; - u32 cmdline_size; - u32 hardware_subarch; - u64 hardware_subarch_data; + __u16 setup_move_size; + __u32 code32_start; + __u32 ramdisk_image; + __u32 ramdisk_size; + __u32 bootsect_kludge; + __u16 heap_end_ptr; + __u16 _pad1; + __u32 cmd_line_ptr; + __u32 initrd_addr_max; + __u32 kernel_alignment; + __u8 relocatable_kernel; + __u8 _pad2[3]; + __u32 cmdline_size; + __u32 hardware_subarch; + __u64 hardware_subarch_data; } __attribute__((packed)); struct sys_desc_table { - u16 length; - u8 table[14]; + __u16 length; + __u8 table[14]; }; struct efi_info { - u32 _pad1; - u32 efi_systab; - u32 efi_memdesc_size; - u32 efi_memdesc_version; - u32 efi_memmap; - u32 efi_memmap_size; - u32 _pad2[2]; + __u32 _pad1; + __u32 efi_systab; + __u32 efi_memdesc_size; + __u32 efi_memdesc_version; + __u32 efi_memmap; + __u32 efi_memmap_size; + __u32 _pad2[2]; }; /* The so-called "zeropage" */ struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[12]; /* 0x054 */ struct ist_info ist_info; /* 0x060 */ - u8 _pad3[16]; /* 0x070 */ - u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ - u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ + __u8 _pad3[16]; /* 0x070 */ + __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ + __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */ struct sys_desc_table sys_desc_table; /* 0x0a0 */ - u8 _pad4[144]; /* 0x0b0 */ + __u8 _pad4[144]; /* 0x0b0 */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ - u32 alt_mem_k; /* 0x1e0 */ - u32 scratch; /* Scratch field! */ /* 0x1e4 */ - u8 e820_entries; /* 0x1e8 */ - u8 eddbuf_entries; /* 0x1e9 */ - u8 edd_mbr_sig_buf_entries; /* 0x1ea */ - u8 _pad6[6]; /* 0x1eb */ + __u32 alt_mem_k; /* 0x1e0 */ + __u32 scratch; /* Scratch field! */ /* 0x1e4 */ + __u8 e820_entries; /* 0x1e8 */ + __u8 eddbuf_entries; /* 0x1e9 */ + __u8 edd_mbr_sig_buf_entries; /* 0x1ea */ + __u8 _pad6[6]; /* 0x1eb */ struct setup_header hdr; /* setup header */ /* 0x1f1 */ - u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; - u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ + __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)]; + __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */ struct e820entry e820_map[E820MAX]; /* 0x2d0 */ - u8 _pad8[48]; /* 0xcd0 */ + __u8 _pad8[48]; /* 0xcd0 */ struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */ - u8 _pad9[276]; /* 0xeec */ + __u8 _pad9[276]; /* 0xeec */ } __attribute__((packed)); #endif /* _ASM_BOOTPARAM_H */ diff --git a/include/asm-x86/e820.h b/include/asm-x86/e820.h index 5d4d218..3e214f3 100644 --- a/include/asm-x86/e820.h +++ b/include/asm-x86/e820.h @@ -1,5 +1,33 @@ +#ifndef __ASM_E820_H +#define __ASM_E820_H +#define E820MAP 0x2d0 /* our map */ +#define E820MAX 128 /* number of entries in E820MAP */ +#define E820NR 0x1e8 /* # entries in E820MAP */ + +#define E820_RAM 1 +#define E820_RESERVED 2 +#define E820_ACPI 3 +#define E820_NVS 4 + +#ifndef __ASSEMBLY__ +struct e820entry { + __u64 addr; /* start of memory segment */ + __u64 size; /* size of memory segment */ + __u32 type; /* type of memory segment */ +} __attribute__((packed)); + +struct e820map { + __u32 nr_map; + struct e820entry map[E820MAX]; +}; +#endif /* __ASSEMBLY__ */ + +#ifdef __KERNEL__ #ifdef CONFIG_X86_32 # include "e820_32.h" #else # include "e820_64.h" #endif +#endif /* __KERNEL__ */ + +#endif /* __ASM_E820_H */ diff --git a/include/asm-x86/e820_32.h b/include/asm-x86/e820_32.h index cf67dbb..03f60c6 100644 --- a/include/asm-x86/e820_32.h +++ b/include/asm-x86/e820_32.h @@ -12,30 +12,10 @@ #ifndef __E820_HEADER #define __E820_HEADER -#define E820MAP 0x2d0 /* our map */ -#define E820MAX 128 /* number of entries in E820MAP */ -#define E820NR 0x1e8 /* # entries in E820MAP */ - -#define E820_RAM 1 -#define E820_RESERVED 2 -#define E820_ACPI 3 -#define E820_NVS 4 - #define HIGH_MEMORY (1024*1024) #ifndef __ASSEMBLY__ -struct e820entry { - u64 addr; /* start of memory segment */ - u64 size; /* size of memory segment */ - u32 type; /* type of memory segment */ -} __attribute__((packed)); - -struct e820map { - u32 nr_map; - struct e820entry map[E820MAX]; -}; - extern struct e820map e820; extern int e820_all_mapped(unsigned long start, unsigned long end, @@ -56,5 +36,4 @@ static inline void e820_mark_nosave_regions(void) #endif #endif/*!__ASSEMBLY__*/ - #endif/*__E820_HEADER*/ diff --git a/include/asm-x86/e820_64.h b/include/asm-x86/e820_64.h index 3486e70..0bd4787 100644 --- a/include/asm-x86/e820_64.h +++ b/include/asm-x86/e820_64.h @@ -11,27 +11,7 @@ #ifndef __E820_HEADER #define __E820_HEADER -#define E820MAP 0x2d0 /* our map */ -#define E820MAX 128 /* number of entries in E820MAP */ -#define E820NR 0x1e8 /* # entries in E820MAP */ - -#define E820_RAM 1 -#define E820_RESERVED 2 -#define E820_ACPI 3 -#define E820_NVS 4 - #ifndef __ASSEMBLY__ -struct e820entry { - u64 addr; /* start of memory segment */ - u64 size; /* size of memory segment */ - u32 type; /* type of memory segment */ -} __attribute__((packed)); - -struct e820map { - u32 nr_map; - struct e820entry map[E820MAX]; -}; - extern unsigned long find_e820_area(unsigned long start, unsigned long end, unsigned size); extern void add_memory_region(unsigned long start, unsigned long size, diff --git a/include/asm-x86/ist.h b/include/asm-x86/ist.h index ef2003e..6ec6cee 100644 --- a/include/asm-x86/ist.h +++ b/include/asm-x86/ist.h @@ -17,17 +17,17 @@ */ -#ifdef __KERNEL__ - #include <linux/types.h> struct ist_info { - u32 signature; - u32 command; - u32 event; - u32 perf_level; + __u32 signature; + __u32 command; + __u32 event; + __u32 perf_level; }; +#ifdef __KERNEL__ + extern struct ist_info ist_info; #endif /* __KERNEL__ */ diff --git a/include/asm-x86/lguest.h b/include/asm-x86/lguest.h new file mode 100644 index 0000000..ccd3384 --- /dev/null +++ b/include/asm-x86/lguest.h @@ -0,0 +1,86 @@ +#ifndef _X86_LGUEST_H +#define _X86_LGUEST_H + +#define GDT_ENTRY_LGUEST_CS 10 +#define GDT_ENTRY_LGUEST_DS 11 +#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8) +#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8) + +#ifndef __ASSEMBLY__ +#include <asm/desc.h> + +#define GUEST_PL 1 + +/* Every guest maps the core switcher code. */ +#define SHARED_SWITCHER_PAGES \ + DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE) +/* Pages for switcher itself, then two pages per cpu */ +#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS) + +/* We map at -4M for ease of mapping into the guest (one PTE page). */ +#define SWITCHER_ADDR 0xFFC00000 + +/* Found in switcher.S */ +extern unsigned long default_idt_entries[]; + +struct lguest_regs +{ + /* Manually saved part. */ + unsigned long eax, ebx, ecx, edx; + unsigned long esi, edi, ebp; + unsigned long gs; + unsigned long fs, ds, es; + unsigned long trapnum, errcode; + /* Trap pushed part */ + unsigned long eip; + unsigned long cs; + unsigned long eflags; + unsigned long esp; + unsigned long ss; +}; + +/* This is a guest-specific page (mapped ro) into the guest. */ +struct lguest_ro_state +{ + /* Host information we need to restore when we switch back. */ + u32 host_cr3; + struct Xgt_desc_struct host_idt_desc; + struct Xgt_desc_struct host_gdt_desc; + u32 host_sp; + + /* Fields which are used when guest is running. */ + struct Xgt_desc_struct guest_idt_desc; + struct Xgt_desc_struct guest_gdt_desc; + struct i386_hw_tss guest_tss; + struct desc_struct guest_idt[IDT_ENTRIES]; + struct desc_struct guest_gdt[GDT_ENTRIES]; +}; + +struct lguest_arch +{ + /* The GDT entries copied into lguest_ro_state when running. */ + struct desc_struct gdt[GDT_ENTRIES]; + + /* The IDT entries: some copied into lguest_ro_state when running. */ + struct desc_struct idt[IDT_ENTRIES]; + + /* The address of the last guest-visible pagefault (ie. cr2). */ + unsigned long last_pagefault; +}; + +static inline void lguest_set_ts(void) +{ + u32 cr0; + + cr0 = read_cr0(); + if (!(cr0 & 8)) + write_cr0(cr0|8); +} + +/* Full 4G segment descriptors, suitable for CS and DS. */ +#define FULL_EXEC_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9b00}) +#define FULL_SEGMENT ((struct desc_struct){0x0000ffff, 0x00cf9300}) + +#endif /* __ASSEMBLY__ */ + +#endif diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h new file mode 100644 index 0000000..f948491 --- /dev/null +++ b/include/asm-x86/lguest_hcall.h @@ -0,0 +1,71 @@ +/* Architecture specific portion of the lguest hypercalls */ +#ifndef _X86_LGUEST_HCALL_H +#define _X86_LGUEST_HCALL_H + +#define LHCALL_FLUSH_ASYNC 0 +#define LHCALL_LGUEST_INIT 1 +#define LHCALL_CRASH 2 +#define LHCALL_LOAD_GDT 3 +#define LHCALL_NEW_PGTABLE 4 +#define LHCALL_FLUSH_TLB 5 +#define LHCALL_LOAD_IDT_ENTRY 6 +#define LHCALL_SET_STACK 7 +#define LHCALL_TS 8 +#define LHCALL_SET_CLOCKEVENT 9 +#define LHCALL_HALT 10 +#define LHCALL_SET_PTE 14 +#define LHCALL_SET_PMD 15 +#define LHCALL_LOAD_TLS 16 +#define LHCALL_NOTIFY 17 + +/*G:031 First, how does our Guest contact the Host to ask for privileged + * operations? There are two ways: the direct way is to make a "hypercall", + * to make requests of the Host Itself. + * + * Our hypercall mechanism uses the highest unused trap code (traps 32 and + * above are used by real hardware interrupts). Seventeen hypercalls are + * available: the hypercall number is put in the %eax register, and the + * arguments (when required) are placed in %edx, %ebx and %ecx. If a return + * value makes sense, it's returned in %eax. + * + * Grossly invalid calls result in Sudden Death at the hands of the vengeful + * Host, rather than returning failure. This reflects Winston Churchill's + * definition of a gentleman: "someone who is only rude intentionally". */ +#define LGUEST_TRAP_ENTRY 0x1F + +#ifndef __ASSEMBLY__ +#include <asm/hw_irq.h> + +static inline unsigned long +hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + /* "int" is the Intel instruction to trigger a trap. */ + asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) + /* The call is in %eax (aka "a"), and can be replaced */ + : "=a"(call) + /* The other arguments are in %eax, %edx, %ebx & %ecx */ + : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) + /* "memory" means this might write somewhere in memory. + * This isn't true for all calls, but it's safe to tell + * gcc that it might happen so it doesn't get clever. */ + : "memory"); + return call; +} +/*:*/ + +void async_hcall(unsigned long call, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +/* Can't use our min() macro here: needs to be a constant */ +#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) + +#define LHCALL_RING_SIZE 64 +struct hcall_args +{ + /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ + unsigned long arg0, arg2, arg3, arg1; +}; + +#endif /* !__ASSEMBLY__ */ +#endif /* _I386_LGUEST_HCALL_H */ diff --git a/include/asm-xtensa/dma-mapping.h b/include/asm-xtensa/dma-mapping.h index 82b03b3..8bd9d2c 100644 --- a/include/asm-xtensa/dma-mapping.h +++ b/include/asm-xtensa/dma-mapping.h @@ -58,11 +58,10 @@ dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, BUG_ON(direction == DMA_NONE); for (i = 0; i < nents; i++, sg++ ) { - BUG_ON(!sg->page); + BUG_ON(!sg_page(sg)); - sg->dma_address = page_to_phys(sg->page) + sg->offset; - consistent_sync(page_address(sg->page) + sg->offset, - sg->length, direction); + sg->dma_address = sg_phys(sg); + consistent_sync(sg_virt(sg), sg->length, direction); } return nents; @@ -128,8 +127,7 @@ dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, { int i; for (i = 0; i < nelems; i++, sg++) - consistent_sync(page_address(sg->page) + sg->offset, - sg->length, dir); + consistent_sync(sg_virt(sg), sg->length, dir); } static inline void @@ -138,8 +136,7 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, { int i; for (i = 0; i < nelems; i++, sg++) - consistent_sync(page_address(sg->page) + sg->offset, - sg->length, dir); + consistent_sync(sg_virt(sg), sg->length, dir); } static inline int dma_mapping_error(dma_addr_t dma_addr) diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e3ffd14..6a65231 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -186,6 +186,7 @@ unifdef-y += cyclades.h unifdef-y += dccp.h unifdef-y += dirent.h unifdef-y += dlm.h +unifdef-y += edd.h unifdef-y += elfcore.h unifdef-y += errno.h unifdef-y += errqueue.h @@ -306,6 +307,7 @@ unifdef-y += rtc.h unifdef-y += rtnetlink.h unifdef-y += scc.h unifdef-y += sched.h +unifdef-y += screen_info.h unifdef-y += sdla.h unifdef-y += selinux_netlink.h unifdef-y += sem.h @@ -341,6 +343,9 @@ unifdef-y += user.h unifdef-y += utsname.h unifdef-y += videodev2.h unifdef-y += videodev.h +unifdef-y += virtio_config.h +unifdef-y += virtio_blk.h +unifdef-y += virtio_net.h unifdef-y += wait.h unifdef-y += wanrouter.h unifdef-y += watchdog.h diff --git a/include/linux/apm_bios.h b/include/linux/apm_bios.h index 5f921c8..9754baa 100644 --- a/include/linux/apm_bios.h +++ b/include/linux/apm_bios.h @@ -16,29 +16,29 @@ * General Public License for more details. */ -typedef unsigned short apm_event_t; -typedef unsigned short apm_eventinfo_t; +#include <linux/types.h> + +struct apm_bios_info { + __u16 version; + __u16 cseg; + __u32 offset; + __u16 cseg_16; + __u16 dseg; + __u16 flags; + __u16 cseg_len; + __u16 cseg_16_len; + __u16 dseg_len; +}; #ifdef __KERNEL__ -#include <linux/types.h> +typedef unsigned short apm_event_t; +typedef unsigned short apm_eventinfo_t; #define APM_CS (GDT_ENTRY_APMBIOS_BASE * 8) #define APM_CS_16 (APM_CS + 8) #define APM_DS (APM_CS_16 + 8) -struct apm_bios_info { - u16 version; - u16 cseg; - u32 offset; - u16 cseg_16; - u16 dseg; - u16 flags; - u16 cseg_len; - u16 cseg_16_len; - u16 dseg_len; -}; - /* Results of APM Installation Check */ #define APM_16_BIT_SUPPORT 0x0001 #define APM_32_BIT_SUPPORT 0x0002 diff --git a/include/linux/edd.h b/include/linux/edd.h index 7b64782..5d747c5 100644 --- a/include/linux/edd.h +++ b/include/linux/edd.h @@ -67,113 +67,113 @@ #define EDD_INFO_USE_INT13_FN50 (1 << 7) struct edd_device_params { - u16 length; - u16 info_flags; - u32 num_default_cylinders; - u32 num_default_heads; - u32 sectors_per_track; - u64 number_of_sectors; - u16 bytes_per_sector; - u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ - u16 key; /* = 0xBEDD */ - u8 device_path_info_length; /* = 44 */ - u8 reserved2; - u16 reserved3; - u8 host_bus_type[4]; - u8 interface_type[8]; + __u16 length; + __u16 info_flags; + __u32 num_default_cylinders; + __u32 num_default_heads; + __u32 sectors_per_track; + __u64 number_of_sectors; + __u16 bytes_per_sector; + __u32 dpte_ptr; /* 0xFFFFFFFF for our purposes */ + __u16 key; /* = 0xBEDD */ + __u8 device_path_info_length; /* = 44 */ + __u8 reserved2; + __u16 reserved3; + __u8 host_bus_type[4]; + __u8 interface_type[8]; union { struct { - u16 base_address; - u16 reserved1; - u32 reserved2; + __u16 base_address; + __u16 reserved1; + __u32 reserved2; } __attribute__ ((packed)) isa; struct { - u8 bus; - u8 slot; - u8 function; - u8 channel; - u32 reserved; + __u8 bus; + __u8 slot; + __u8 function; + __u8 channel; + __u32 reserved; } __attribute__ ((packed)) pci; /* pcix is same as pci */ struct { - u64 reserved; + __u64 reserved; } __attribute__ ((packed)) ibnd; struct { - u64 reserved; + __u64 reserved; } __attribute__ ((packed)) xprs; struct { - u64 reserved; + __u64 reserved; } __attribute__ ((packed)) htpt; struct { - u64 reserved; + __u64 reserved; } __attribute__ ((packed)) unknown; } interface_path; union { struct { - u8 device; - u8 reserved1; - u16 reserved2; - u32 reserved3; - u64 reserved4; + __u8 device; + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + __u64 reserved4; } __attribute__ ((packed)) ata; struct { - u8 device; - u8 lun; - u8 reserved1; - u8 reserved2; - u32 reserved3; - u64 reserved4; + __u8 device; + __u8 lun; + __u8 reserved1; + __u8 reserved2; + __u32 reserved3; + __u64 reserved4; } __attribute__ ((packed)) atapi; struct { - u16 id; - u64 lun; - u16 reserved1; - u32 reserved2; + __u16 id; + __u64 lun; + __u16 reserved1; + __u32 reserved2; } __attribute__ ((packed)) scsi; struct { - u64 serial_number; - u64 reserved; + __u64 serial_number; + __u64 reserved; } __attribute__ ((packed)) usb; struct { - u64 eui; - u64 reserved; + __u64 eui; + __u64 reserved; } __attribute__ ((packed)) i1394; struct { - u64 wwid; - u64 lun; + __u64 wwid; + __u64 lun; } __attribute__ ((packed)) fibre; struct { - u64 identity_tag; - u64 reserved; + __u64 identity_tag; + __u64 reserved; } __attribute__ ((packed)) i2o; struct { - u32 array_number; - u32 reserved1; - u64 reserved2; + __u32 array_number; + __u32 reserved1; + __u64 reserved2; } __attribute__ ((packed)) raid; struct { - u8 device; - u8 reserved1; - u16 reserved2; - u32 reserved3; - u64 reserved4; + __u8 device; + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + __u64 reserved4; } __attribute__ ((packed)) sata; struct { - u64 reserved1; - u64 reserved2; + __u64 reserved1; + __u64 reserved2; } __attribute__ ((packed)) unknown; } device_path; - u8 reserved4; - u8 checksum; + __u8 reserved4; + __u8 checksum; } __attribute__ ((packed)); struct edd_info { - u8 device; - u8 version; - u16 interface_support; - u16 legacy_max_cylinder; - u8 legacy_max_head; - u8 legacy_sectors_per_track; + __u8 device; + __u8 version; + __u16 interface_support; + __u16 legacy_max_cylinder; + __u8 legacy_max_head; + __u8 legacy_sectors_per_track; struct edd_device_params params; } __attribute__ ((packed)); @@ -184,8 +184,9 @@ struct edd { unsigned char edd_info_nr; }; +#ifdef __KERNEL__ extern struct edd edd; - +#endif /* __KERNEL__ */ #endif /*!__ASSEMBLY__ */ #endif /* _LINUX_EDD_H */ diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 157ad64..8beb291 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -1,76 +1,16 @@ /* Things the lguest guest needs to know. Note: like all lguest interfaces, * this is subject to wild and random change between versions. */ -#ifndef _ASM_LGUEST_H -#define _ASM_LGUEST_H +#ifndef _LINUX_LGUEST_H +#define _LINUX_LGUEST_H #ifndef __ASSEMBLY__ +#include <linux/time.h> #include <asm/irq.h> - -#define LHCALL_FLUSH_ASYNC 0 -#define LHCALL_LGUEST_INIT 1 -#define LHCALL_CRASH 2 -#define LHCALL_LOAD_GDT 3 -#define LHCALL_NEW_PGTABLE 4 -#define LHCALL_FLUSH_TLB 5 -#define LHCALL_LOAD_IDT_ENTRY 6 -#define LHCALL_SET_STACK 7 -#define LHCALL_TS 8 -#define LHCALL_SET_CLOCKEVENT 9 -#define LHCALL_HALT 10 -#define LHCALL_BIND_DMA 12 -#define LHCALL_SEND_DMA 13 -#define LHCALL_SET_PTE 14 -#define LHCALL_SET_PMD 15 -#define LHCALL_LOAD_TLS 16 +#include <asm/lguest_hcall.h> #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:031 First, how does our Guest contact the Host to ask for privileged - * operations? There are two ways: the direct way is to make a "hypercall", - * to make requests of the Host Itself. - * - * Our hypercall mechanism uses the highest unused trap code (traps 32 and - * above are used by real hardware interrupts). Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %edx, %ebx and %ecx. If a return - * value makes sense, it's returned in %eax. - * - * Grossly invalid calls result in Sudden Death at the hands of the vengeful - * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -#define LGUEST_TRAP_ENTRY 0x1F - -static inline unsigned long -hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - /* "int" is the Intel instruction to trigger a trap. */ - asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) - /* The call is in %eax (aka "a"), and can be replaced */ - : "=a"(call) - /* The other arguments are in %eax, %edx, %ebx & %ecx */ - : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) - /* "memory" means this might write somewhere in memory. - * This isn't true for all calls, but it's safe to tell - * gcc that it might happen so it doesn't get clever. */ - : "memory"); - return call; -} -/*:*/ - -void async_hcall(unsigned long call, - unsigned long arg1, unsigned long arg2, unsigned long arg3); - -/* Can't use our min() macro here: needs to be a constant */ -#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) - -#define LHCALL_RING_SIZE 64 -struct hcall_ring -{ - u32 eax, edx, ebx, ecx; -}; - /*G:032 The second method of communicating with the Host is to via "struct * lguest_data". The Guest's very first hypercall is to tell the Host where * this is, and then the Guest and Host both publish information in it. :*/ @@ -97,20 +37,24 @@ struct lguest_data /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ u8 hcall_status[LHCALL_RING_SIZE]; /* The actual registers for the hypercalls. */ - struct hcall_ring hcalls[LHCALL_RING_SIZE]; + struct hcall_args hcalls[LHCALL_RING_SIZE]; /* Fields initialized by the Host at boot: */ /* Memory not to try to access */ unsigned long reserve_mem; - /* ID of this Guest (used by network driver to set ethernet address) */ - u16 guestid; /* KHz for the TSC clock. */ u32 tsc_khz; + /* Page where the top-level pagetable is */ + unsigned long pgdir; /* Fields initialized by the Guest at boot: */ /* Instruction range to suppress interrupts even if enabled */ unsigned long noirq_start, noirq_end; + /* Address above which page tables are all identical. */ + unsigned long kernel_address; + /* The vector to try to use for system calls (0x40 or 0x80). */ + unsigned int syscall_vec; }; extern struct lguest_data lguest_data; #endif /* __ASSEMBLY__ */ -#endif /* _ASM_LGUEST_H */ +#endif /* _LINUX_LGUEST_H */ diff --git a/include/linux/lguest_bus.h b/include/linux/lguest_bus.h deleted file mode 100644 index d27853d..0000000 --- a/include/linux/lguest_bus.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef _ASM_LGUEST_DEVICE_H -#define _ASM_LGUEST_DEVICE_H -/* Everything you need to know about lguest devices. */ -#include <linux/device.h> -#include <linux/lguest.h> -#include <linux/lguest_launcher.h> - -struct lguest_device { - /* Unique busid, and index into lguest_page->devices[] */ - unsigned int index; - - struct device dev; - - /* Driver can hang data off here. */ - void *private; -}; - -/*D:380 Since interrupt numbers are arbitrary, we use a convention: each device - * can use the interrupt number corresponding to its index. The +1 is because - * interrupt 0 is not usable (it's actually the timer interrupt). */ -static inline int lgdev_irq(const struct lguest_device *dev) -{ - return dev->index + 1; -} -/*:*/ - -/* dma args must not be vmalloced! */ -void lguest_send_dma(unsigned long key, struct lguest_dma *dma); -int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas, - unsigned int num, u8 irq); -void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas); - -/* Map the virtual device space */ -void *lguest_map(unsigned long phys_addr, unsigned long pages); -void lguest_unmap(void *); - -struct lguest_driver { - const char *name; - struct module *owner; - u16 device_type; - int (*probe)(struct lguest_device *dev); - void (*remove)(struct lguest_device *dev); - - struct device_driver drv; -}; - -extern int register_lguest_driver(struct lguest_driver *drv); -extern void unregister_lguest_driver(struct lguest_driver *drv); - -extern struct lguest_device_desc *lguest_devices; /* Just past max_pfn */ -#endif /* _ASM_LGUEST_DEVICE_H */ diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index 6416705..61e1e3e 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -1,6 +1,7 @@ #ifndef _ASM_LGUEST_USER #define _ASM_LGUEST_USER /* Everything the "lguest" userspace program needs to know. */ +#include <linux/types.h> /* They can register up to 32 arrays of lguest_dma. */ #define LGUEST_MAX_DMA 32 /* At most we can dma 16 lguest_dma in one op. */ @@ -9,66 +10,6 @@ /* How many devices? Assume each one wants up to two dma arrays per device. */ #define LGUEST_MAX_DEVICES (LGUEST_MAX_DMA/2) -/*D:200 - * Lguest I/O - * - * The lguest I/O mechanism is the only way Guests can talk to devices. There - * are two hypercalls involved: SEND_DMA for output and BIND_DMA for input. In - * each case, "struct lguest_dma" describes the buffer: this contains 16 - * addr/len pairs, and if there are fewer buffer elements the len array is - * terminated with a 0. - * - * I/O is organized by keys: BIND_DMA attaches buffers to a particular key, and - * SEND_DMA transfers to buffers bound to particular key. By convention, keys - * correspond to a physical address within the device's page. This means that - * devices will never accidentally end up with the same keys, and allows the - * Host use The Futex Trick (as we'll see later in our journey). - * - * SEND_DMA simply indicates a key to send to, and the physical address of the - * "struct lguest_dma" to send. The Host will write the number of bytes - * transferred into the "struct lguest_dma"'s used_len member. - * - * BIND_DMA indicates a key to bind to, a pointer to an array of "struct - * lguest_dma"s ready for receiving, the size of that array, and an interrupt - * to trigger when data is received. The Host will only allow transfers into - * buffers with a used_len of zero: it then sets used_len to the number of - * bytes transferred and triggers the interrupt for the Guest to process the - * new input. */ -struct lguest_dma -{ - /* 0 if free to be used, filled by the Host. */ - u32 used_len; - unsigned long addr[LGUEST_MAX_DMA_SECTIONS]; - u16 len[LGUEST_MAX_DMA_SECTIONS]; -}; -/*:*/ - -/*D:460 This is the layout of a block device memory page. The Launcher sets up - * the num_sectors initially to tell the Guest the size of the disk. The Guest - * puts the type, sector and length of the request in the first three fields, - * then DMAs to the Host. The Host processes the request, sets up the result, - * then DMAs back to the Guest. */ -struct lguest_block_page -{ - /* 0 is a read, 1 is a write. */ - int type; - u32 sector; /* Offset in device = sector * 512. */ - u32 bytes; /* Length expected to be read/written in bytes */ - /* 0 = pending, 1 = done, 2 = done, error */ - int result; - u32 num_sectors; /* Disk length = num_sectors * 512 */ -}; - -/*D:520 The network device is basically a memory page where all the Guests on - * the network publish their MAC (ethernet) addresses: it's an array of "struct - * lguest_net": */ -struct lguest_net -{ - /* Simply the mac address (with multicast bit meaning promisc). */ - unsigned char mac[6]; -}; -/*:*/ - /* Where the Host expects the Guest to SEND_DMA console output to. */ #define LGUEST_CONSOLE_DMA_KEY 0 @@ -81,38 +22,29 @@ struct lguest_net * complex burden for the Host and suboptimal for the Guest, so we have our own * "lguest" bus and simple drivers. * - * Devices are described by an array of LGUEST_MAX_DEVICES of these structs, - * placed by the Launcher just above the top of physical memory: + * Devices are described by a simplified ID, a status byte, and some "config" + * bytes which describe this device's configuration. This is placed by the + * Launcher just above the top of physical memory: */ struct lguest_device_desc { - /* The device type: console, network, disk etc. */ - u16 type; -#define LGUEST_DEVICE_T_CONSOLE 1 -#define LGUEST_DEVICE_T_NET 2 -#define LGUEST_DEVICE_T_BLOCK 3 - - /* The specific features of this device: these depends on device type - * except for LGUEST_DEVICE_F_RANDOMNESS. */ - u16 features; -#define LGUEST_NET_F_NOCSUM 0x4000 /* Don't bother checksumming */ -#define LGUEST_DEVICE_F_RANDOMNESS 0x8000 /* IRQ is fairly random */ - - /* This is how the Guest reports status of the device: the Host can set - * LGUEST_DEVICE_S_REMOVED to indicate removal, but the rest are only - * ever manipulated by the Guest, and only ever set. */ - u16 status; -/* 256 and above are device specific. */ -#define LGUEST_DEVICE_S_ACKNOWLEDGE 1 /* We have seen device. */ -#define LGUEST_DEVICE_S_DRIVER 2 /* We have found a driver */ -#define LGUEST_DEVICE_S_DRIVER_OK 4 /* Driver says OK! */ -#define LGUEST_DEVICE_S_REMOVED 8 /* Device has gone away. */ -#define LGUEST_DEVICE_S_REMOVED_ACK 16 /* Driver has been told. */ -#define LGUEST_DEVICE_S_FAILED 128 /* Something actually failed */ + /* The device type: console, network, disk etc. Type 0 terminates. */ + __u8 type; + /* The number of bytes of the config array. */ + __u8 config_len; + /* A status byte, written by the Guest. */ + __u8 status; + __u8 config[0]; +}; - /* Each device exists somewhere in Guest physical memory, over some - * number of pages. */ - u16 num_pages; - u32 pfn; +/*D:135 This is how we expect the device configuration field for a virtqueue + * (type VIRTIO_CONFIG_F_VIRTQUEUE) to be laid out: */ +struct lguest_vqconfig { + /* The number of entries in the virtio_ring */ + __u16 num; + /* The interrupt we get when something happens. */ + __u16 irq; + /* The page number of the virtio ring for this device. */ + __u32 pfn; }; /*:*/ @@ -120,7 +52,7 @@ struct lguest_device_desc { enum lguest_req { LHREQ_INITIALIZE, /* + pfnlimit, pgdir, start, pageoffset */ - LHREQ_GETDMA, /* + addr (returns &lguest_dma, irq in ->used_len) */ + LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ }; diff --git a/include/linux/mlx4/doorbell.h b/include/linux/mlx4/doorbell.h index 3f2da44..f31bba2 100644 --- a/include/linux/mlx4/doorbell.h +++ b/include/linux/mlx4/doorbell.h @@ -52,11 +52,6 @@ #define MLX4_INIT_DOORBELL_LOCK(ptr) do { } while (0) #define MLX4_GET_DOORBELL_LOCK(ptr) (NULL) -static inline void mlx4_write64_raw(__be64 val, void __iomem *dest) -{ - __raw_writeq((__force u64) val, dest); -} - static inline void mlx4_write64(__be32 val[2], void __iomem *dest, spinlock_t *doorbell_lock) { @@ -75,12 +70,6 @@ static inline void mlx4_write64(__be32 val[2], void __iomem *dest, #define MLX4_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) #define MLX4_GET_DOORBELL_LOCK(ptr) (ptr) -static inline void mlx4_write64_raw(__be64 val, void __iomem *dest) -{ - __raw_writel(((__force u32 *) &val)[0], dest); - __raw_writel(((__force u32 *) &val)[1], dest + 4); -} - static inline void mlx4_write64(__be32 val[2], void __iomem *dest, spinlock_t *doorbell_lock) { diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 522b0dd..e9fddb4 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -361,4 +361,10 @@ struct ssb_device_id { #define SSB_ANY_ID 0xFFFF #define SSB_ANY_REV 0xFF +struct virtio_device_id { + __u32 device; + __u32 vendor; +}; +#define VIRTIO_DEV_ANY_ID 0xffffffff + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 42daf5e..df7ddce 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -41,6 +41,11 @@ static inline void sg_set_page(struct scatterlist *sg, struct page *page) { unsigned long page_link = sg->page_link & 0x3; + /* + * In order for the low bit stealing approach to work, pages + * must be aligned at a 32-bit boundary as a minimum. + */ + BUG_ON((unsigned long) page & 0x03); #ifdef CONFIG_DEBUG_SG BUG_ON(sg->sg_magic != SG_MAGIC); #endif diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index ba81ffe..827b85b 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h @@ -8,45 +8,43 @@ */ struct screen_info { - u8 orig_x; /* 0x00 */ - u8 orig_y; /* 0x01 */ - u16 ext_mem_k; /* 0x02 */ - u16 orig_video_page; /* 0x04 */ - u8 orig_video_mode; /* 0x06 */ - u8 orig_video_cols; /* 0x07 */ - u16 unused2; /* 0x08 */ - u16 orig_video_ega_bx; /* 0x0a */ - u16 unused3; /* 0x0c */ - u8 orig_video_lines; /* 0x0e */ - u8 orig_video_isVGA; /* 0x0f */ - u16 orig_video_points; /* 0x10 */ + __u8 orig_x; /* 0x00 */ + __u8 orig_y; /* 0x01 */ + __u16 ext_mem_k; /* 0x02 */ + __u16 orig_video_page; /* 0x04 */ + __u8 orig_video_mode; /* 0x06 */ + __u8 orig_video_cols; /* 0x07 */ + __u16 unused2; /* 0x08 */ + __u16 orig_video_ega_bx;/* 0x0a */ + __u16 unused3; /* 0x0c */ + __u8 orig_video_lines; /* 0x0e */ + __u8 orig_video_isVGA; /* 0x0f */ + __u16 orig_video_points;/* 0x10 */ /* VESA graphic mode -- linear frame buffer */ - u16 lfb_width; /* 0x12 */ - u16 lfb_height; /* 0x14 */ - u16 lfb_depth; /* 0x16 */ - u32 lfb_base; /* 0x18 */ - u32 lfb_size; /* 0x1c */ - u16 cl_magic, cl_offset; /* 0x20 */ - u16 lfb_linelength; /* 0x24 */ - u8 red_size; /* 0x26 */ - u8 red_pos; /* 0x27 */ - u8 green_size; /* 0x28 */ - u8 green_pos; /* 0x29 */ - u8 blue_size; /* 0x2a */ - u8 blue_pos; /* 0x2b */ - u8 rsvd_size; /* 0x2c */ - u8 rsvd_pos; /* 0x2d */ - u16 vesapm_seg; /* 0x2e */ - u16 vesapm_off; /* 0x30 */ - u16 pages; /* 0x32 */ - u16 vesa_attributes; /* 0x34 */ - u32 capabilities; /* 0x36 */ - u8 _reserved[6]; /* 0x3a */ + __u16 lfb_width; /* 0x12 */ + __u16 lfb_height; /* 0x14 */ + __u16 lfb_depth; /* 0x16 */ + __u32 lfb_base; /* 0x18 */ + __u32 lfb_size; /* 0x1c */ + __u16 cl_magic, cl_offset; /* 0x20 */ + __u16 lfb_linelength; /* 0x24 */ + __u8 red_size; /* 0x26 */ + __u8 red_pos; /* 0x27 */ + __u8 green_size; /* 0x28 */ + __u8 green_pos; /* 0x29 */ + __u8 blue_size; /* 0x2a */ + __u8 blue_pos; /* 0x2b */ + __u8 rsvd_size; /* 0x2c */ + __u8 rsvd_pos; /* 0x2d */ + __u16 vesapm_seg; /* 0x2e */ + __u16 vesapm_off; /* 0x30 */ + __u16 pages; /* 0x32 */ + __u16 vesa_attributes; /* 0x34 */ + __u32 capabilities; /* 0x36 */ + __u8 _reserved[6]; /* 0x3a */ } __attribute__((packed)); -extern struct screen_info screen_info; - #define VIDEO_TYPE_MDA 0x10 /* Monochrome Text Display */ #define VIDEO_TYPE_CGA 0x11 /* CGA Display */ #define VIDEO_TYPE_EGAM 0x20 /* EGA/VGA in Monochrome Mode */ @@ -65,4 +63,17 @@ extern struct screen_info screen_info; #define VIDEO_TYPE_PMAC 0x60 /* PowerMacintosh frame buffer. */ +#ifdef __KERNEL__ +extern struct screen_info screen_info; + +#define ORIG_X (screen_info.orig_x) +#define ORIG_Y (screen_info.orig_y) +#define ORIG_VIDEO_MODE (screen_info.orig_video_mode) +#define ORIG_VIDEO_COLS (screen_info.orig_video_cols) +#define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx) +#define ORIG_VIDEO_LINES (screen_info.orig_video_lines) +#define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA) +#define ORIG_VIDEO_POINTS (screen_info.orig_video_points) +#endif /* __KERNEL__ */ + #endif /* _SCREEN_INFO_H */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h new file mode 100644 index 0000000..14e1379 --- /dev/null +++ b/include/linux/virtio.h @@ -0,0 +1,110 @@ +#ifndef _LINUX_VIRTIO_H +#define _LINUX_VIRTIO_H +/* Everything a virtio driver needs to work with any particular virtio + * implementation. */ +#include <linux/types.h> +#include <linux/scatterlist.h> +#include <linux/spinlock.h> +#include <linux/device.h> +#include <linux/mod_devicetable.h> + +/** + * virtqueue - a queue to register buffers for sending or receiving. + * @callback: the function to call when buffers are consumed (can be NULL). + * If this returns false, callbacks are suppressed until vq_ops->restart + * is called. + * @vdev: the virtio device this queue was created for. + * @vq_ops: the operations for this virtqueue (see below). + * @priv: a pointer for the virtqueue implementation to use. + */ +struct virtqueue +{ + bool (*callback)(struct virtqueue *vq); + struct virtio_device *vdev; + struct virtqueue_ops *vq_ops; + void *priv; +}; + +/** + * virtqueue_ops - operations for virtqueue abstraction layer + * @add_buf: expose buffer to other end + * vq: the struct virtqueue we're talking about. + * sg: the description of the buffer(s). + * out_num: the number of sg readable by other side + * in_num: the number of sg which are writable (after readable ones) + * data: the token identifying the buffer. + * Returns 0 or an error. + * @kick: update after add_buf + * vq: the struct virtqueue + * After one or more add_buf calls, invoke this to kick the other side. + * @get_buf: get the next used buffer + * vq: the struct virtqueue we're talking about. + * len: the length written into the buffer + * Returns NULL or the "data" token handed to add_buf. + * @restart: restart callbacks after callback returned false. + * vq: the struct virtqueue we're talking about. + * This returns "false" (and doesn't re-enable) if there are pending + * buffers in the queue, to avoid a race. + * @shutdown: "unadd" all buffers. + * vq: the struct virtqueue we're talking about. + * Remove everything from the queue. + * + * Locking rules are straightforward: the driver is responsible for + * locking. No two operations may be invoked simultaneously. + * + * All operations can be called in any context. + */ +struct virtqueue_ops { + int (*add_buf)(struct virtqueue *vq, + struct scatterlist sg[], + unsigned int out_num, + unsigned int in_num, + void *data); + + void (*kick)(struct virtqueue *vq); + + void *(*get_buf)(struct virtqueue *vq, unsigned int *len); + + bool (*restart)(struct virtqueue *vq); + + void (*shutdown)(struct virtqueue *vq); +}; + +/** + * virtio_device - representation of a device using virtio + * @index: unique position on the virtio bus + * @dev: underlying device. + * @id: the device type identification (used to match it with a driver). + * @config: the configuration ops for this device. + * @priv: private pointer for the driver's use. + */ +struct virtio_device +{ + int index; + struct device dev; + struct virtio_device_id id; + struct virtio_config_ops *config; + void *priv; +}; + +int register_virtio_device(struct virtio_device *dev); +void unregister_virtio_device(struct virtio_device *dev); + +/** + * virtio_driver - operations for a virtio I/O driver + * @driver: underlying device driver (populate name and owner). + * @id_table: the ids serviced by this driver. + * @probe: the function to call when a device is found. Returns a token for + * remove, or PTR_ERR(). + * @remove: the function when a device is removed. + */ +struct virtio_driver { + struct device_driver driver; + const struct virtio_device_id *id_table; + int (*probe)(struct virtio_device *dev); + void (*remove)(struct virtio_device *dev); +}; + +int register_virtio_driver(struct virtio_driver *drv); +void unregister_virtio_driver(struct virtio_driver *drv); +#endif /* _LINUX_VIRTIO_H */ diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h new file mode 100644 index 0000000..8eff0b5 --- /dev/null +++ b/include/linux/virtio_9p.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_VIRTIO_9P_H +#define _LINUX_VIRTIO_9P_H +#include <linux/virtio_config.h> + +/* The ID for virtio console */ +#define VIRTIO_ID_9P 9 +/* Maximum number of virtio channels per partition (1 for now) */ +#define MAX_9P_CHAN 1 + +#endif /* _LINUX_VIRTIO_9P_H */ diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h new file mode 100644 index 0000000..7bd2bce --- /dev/null +++ b/include/linux/virtio_blk.h @@ -0,0 +1,51 @@ +#ifndef _LINUX_VIRTIO_BLK_H +#define _LINUX_VIRTIO_BLK_H +#include <linux/virtio_config.h> + +/* The ID for virtio_block */ +#define VIRTIO_ID_BLOCK 2 + +/* Feature bits */ +#define VIRTIO_CONFIG_BLK_F 0x40 +#define VIRTIO_BLK_F_BARRIER 1 /* Does host support barriers? */ + +/* The capacity (in 512-byte sectors). */ +#define VIRTIO_CONFIG_BLK_F_CAPACITY 0x41 +/* The maximum segment size. */ +#define VIRTIO_CONFIG_BLK_F_SIZE_MAX 0x42 +/* The maximum number of segments. */ +#define VIRTIO_CONFIG_BLK_F_SEG_MAX 0x43 + +/* These two define direction. */ +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 + +/* This bit says it's a scsi command, not an actual read or write. */ +#define VIRTIO_BLK_T_SCSI_CMD 2 + +/* Barrier before this op. */ +#define VIRTIO_BLK_T_BARRIER 0x80000000 + +/* This is the first element of the read scatter-gather list. */ +struct virtio_blk_outhdr +{ + /* VIRTIO_BLK_T* */ + __u32 type; + /* io priority. */ + __u32 ioprio; + /* Sector (ie. 512 byte offset) */ + __u64 sector; + /* Where to put reply. */ + __u64 id; +}; + +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +/* This is the first element of the write scatter-gather list */ +struct virtio_blk_inhdr +{ + unsigned char status; +}; +#endif /* _LINUX_VIRTIO_BLK_H */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h new file mode 100644 index 0000000..bcc0188 --- /dev/null +++ b/include/linux/virtio_config.h @@ -0,0 +1,111 @@ +#ifndef _LINUX_VIRTIO_CONFIG_H +#define _LINUX_VIRTIO_CONFIG_H +/* Virtio devices use a standardized configuration space to define their + * features and pass configuration information, but each implementation can + * store and access that space differently. */ +#include <linux/types.h> + +/* Status byte for guest to report progress, and synchronize config. */ +/* We have seen device and processed generic fields (VIRTIO_CONFIG_F_VIRTIO) */ +#define VIRTIO_CONFIG_S_ACKNOWLEDGE 1 +/* We have found a driver for the device. */ +#define VIRTIO_CONFIG_S_DRIVER 2 +/* Driver has used its parts of the config, and is happy */ +#define VIRTIO_CONFIG_S_DRIVER_OK 4 +/* We've given up on this device. */ +#define VIRTIO_CONFIG_S_FAILED 0x80 + +/* Feature byte (actually 7 bits availabe): */ +/* Requirements/features of the virtio implementation. */ +#define VIRTIO_CONFIG_F_VIRTIO 1 +/* Requirements/features of the virtqueue (may have more than one). */ +#define VIRTIO_CONFIG_F_VIRTQUEUE 2 + +#ifdef __KERNEL__ +struct virtio_device; + +/** + * virtio_config_ops - operations for configuring a virtio device + * @find: search for the next configuration field of the given type. + * vdev: the virtio_device + * type: the feature type + * len: the (returned) length of the field if found. + * Returns a token if found, or NULL. Never returnes the same field twice + * (ie. it's used up). + * @get: read the value of a configuration field after find(). + * vdev: the virtio_device + * token: the token returned from find(). + * buf: the buffer to write the field value into. + * len: the length of the buffer (given by find()). + * Note that contents are conventionally little-endian. + * @set: write the value of a configuration field after find(). + * vdev: the virtio_device + * token: the token returned from find(). + * buf: the buffer to read the field value from. + * len: the length of the buffer (given by find()). + * Note that contents are conventionally little-endian. + * @get_status: read the status byte + * vdev: the virtio_device + * Returns the status byte + * @set_status: write the status byte + * vdev: the virtio_device + * status: the new status byte + * @find_vq: find the first VIRTIO_CONFIG_F_VIRTQUEUE and create a virtqueue. + * vdev: the virtio_device + * callback: the virqtueue callback + * Returns the new virtqueue or ERR_PTR(). + * @del_vq: free a virtqueue found by find_vq(). + */ +struct virtio_config_ops +{ + void *(*find)(struct virtio_device *vdev, u8 type, unsigned *len); + void (*get)(struct virtio_device *vdev, void *token, + void *buf, unsigned len); + void (*set)(struct virtio_device *vdev, void *token, + const void *buf, unsigned len); + u8 (*get_status)(struct virtio_device *vdev); + void (*set_status)(struct virtio_device *vdev, u8 status); + struct virtqueue *(*find_vq)(struct virtio_device *vdev, + bool (*callback)(struct virtqueue *)); + void (*del_vq)(struct virtqueue *vq); +}; + +/** + * virtio_config_val - get a single virtio config and mark it used. + * @config: the virtio config space + * @type: the type to search for. + * @val: a pointer to the value to fill in. + * + * Once used, the config type is marked with VIRTIO_CONFIG_F_USED so it can't + * be found again. This version does endian conversion. */ +#define virtio_config_val(vdev, type, v) ({ \ + int _err = __virtio_config_val((vdev),(type),(v),sizeof(*(v))); \ + \ + BUILD_BUG_ON(sizeof(*(v)) != 1 && sizeof(*(v)) != 2 \ + && sizeof(*(v)) != 4 && sizeof(*(v)) != 8); \ + if (!_err) { \ + switch (sizeof(*(v))) { \ + case 2: le16_to_cpus((__u16 *) v); break; \ + case 4: le32_to_cpus((__u32 *) v); break; \ + case 8: le64_to_cpus((__u64 *) v); break; \ + } \ + } \ + _err; \ +}) + +int __virtio_config_val(struct virtio_device *dev, + u8 type, void *val, size_t size); + +/** + * virtio_use_bit - helper to use a feature bit in a bitfield value. + * @dev: the virtio device + * @token: the token as returned from vdev->config->find(). + * @len: the length of the field. + * @bitnum: the bit to test. + * + * If handed a NULL token, it returns false, otherwise returns bit status. + * If it's one, it sets the mirroring acknowledgement bit. */ +int virtio_use_bit(struct virtio_device *vdev, + void *token, unsigned int len, unsigned int bitnum); +#endif /* __KERNEL__ */ +#endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h new file mode 100644 index 0000000..ed2d4ea --- /dev/null +++ b/include/linux/virtio_console.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_VIRTIO_CONSOLE_H +#define _LINUX_VIRTIO_CONSOLE_H +#include <linux/virtio_config.h> + +/* The ID for virtio console */ +#define VIRTIO_ID_CONSOLE 3 + +#ifdef __KERNEL__ +int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)); +#endif /* __KERNEL__ */ + +#endif /* _LINUX_VIRTIO_CONSOLE_H */ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h new file mode 100644 index 0000000..ae469ae --- /dev/null +++ b/include/linux/virtio_net.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_VIRTIO_NET_H +#define _LINUX_VIRTIO_NET_H +#include <linux/virtio_config.h> + +/* The ID for virtio_net */ +#define VIRTIO_ID_NET 1 + +/* The bitmap of config for virtio net */ +#define VIRTIO_CONFIG_NET_F 0x40 +#define VIRTIO_NET_F_NO_CSUM 0 +#define VIRTIO_NET_F_TSO4 1 +#define VIRTIO_NET_F_UFO 2 +#define VIRTIO_NET_F_TSO4_ECN 3 +#define VIRTIO_NET_F_TSO6 4 + +/* The config defining mac address. */ +#define VIRTIO_CONFIG_NET_MAC_F 0x41 + +/* This is the first element of the scatter-gather list. If you don't + * specify GSO or CSUM features, you can simply ignore the header. */ +struct virtio_net_hdr +{ +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset + __u8 flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame +#define VIRTIO_NET_HDR_GSO_TCPV4 1 // GSO frame, IPv4 TCP (TSO) +/* FIXME: Do we need this? If they said they can handle ECN, do they care? */ +#define VIRTIO_NET_HDR_GSO_TCPV4_ECN 2 // GSO frame, IPv4 TCP w/ ECN +#define VIRTIO_NET_HDR_GSO_UDP 3 // GSO frame, IPv4 UDP (UFO) +#define VIRTIO_NET_HDR_GSO_TCPV6 4 // GSO frame, IPv6 TCP + __u8 gso_type; + __u16 gso_size; + __u16 csum_start; + __u16 csum_offset; +}; +#endif /* _LINUX_VIRTIO_NET_H */ diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h new file mode 100644 index 0000000..ac69e7b --- /dev/null +++ b/include/linux/virtio_ring.h @@ -0,0 +1,119 @@ +#ifndef _LINUX_VIRTIO_RING_H +#define _LINUX_VIRTIO_RING_H +/* An interface for efficient virtio implementation, currently for use by KVM + * and lguest, but hopefully others soon. Do NOT change this since it will + * break existing servers and clients. + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * Copyright Rusty Russell IBM Corporation 2007. */ +#include <linux/types.h> + +/* This marks a buffer as continuing via the next field. */ +#define VRING_DESC_F_NEXT 1 +/* This marks a buffer as write-only (otherwise read-only). */ +#define VRING_DESC_F_WRITE 2 + +/* This means don't notify other side when buffer added. */ +#define VRING_USED_F_NO_NOTIFY 1 +/* This means don't interrupt guest when buffer consumed. */ +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +/* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ +struct vring_desc +{ + /* Address (guest-physical). */ + __u64 addr; + /* Length. */ + __u32 len; + /* The flags as indicated above. */ + __u16 flags; + /* We chain unused descriptors via this, too */ + __u16 next; +}; + +struct vring_avail +{ + __u16 flags; + __u16 idx; + __u16 ring[]; +}; + +/* u32 is used here for ids for padding reasons. */ +struct vring_used_elem +{ + /* Index of start of used descriptor chain. */ + __u32 id; + /* Total length of the descriptor chain which was used (written to) */ + __u32 len; +}; + +struct vring_used +{ + __u16 flags; + __u16 idx; + struct vring_used_elem ring[]; +}; + +struct vring { + unsigned int num; + + struct vring_desc *desc; + + struct vring_avail *avail; + + struct vring_used *used; +}; + +/* The standard layout for the ring is a continuous chunk of memory which looks + * like this. The used fields will be aligned to a "num+1" boundary. + * + * struct vring + * { + * // The actual descriptors (16 bytes each) + * struct vring_desc desc[num]; + * + * // A ring of available descriptor heads with free-running index. + * __u16 avail_flags; + * __u16 avail_idx; + * __u16 available[num]; + * + * // Padding so a correctly-chosen num value will cache-align used_idx. + * char pad[sizeof(struct vring_desc) - sizeof(avail_flags)]; + * + * // A ring of used descriptor heads with free-running index. + * __u16 used_flags; + * __u16 used_idx; + * struct vring_used_elem used[num]; + * }; + */ +static inline void vring_init(struct vring *vr, unsigned int num, void *p) +{ + vr->num = num; + vr->desc = p; + vr->avail = p + num*sizeof(struct vring); + vr->used = p + (num+1)*(sizeof(struct vring) + sizeof(__u16)); +} + +static inline unsigned vring_size(unsigned int num) +{ + return (num + 1) * (sizeof(struct vring_desc) + sizeof(__u16)) + + sizeof(__u32) + num * sizeof(struct vring_used_elem); +} + +#ifdef __KERNEL__ +#include <linux/irqreturn.h> +struct virtio_device; +struct virtqueue; + +struct virtqueue *vring_new_virtqueue(unsigned int num, + struct virtio_device *vdev, + void *pages, + void (*notify)(struct virtqueue *vq), + bool (*callback)(struct virtqueue *vq)); +void vring_del_virtqueue(struct virtqueue *vq); + +irqreturn_t vring_interrupt(int irq, void *_vq); +#endif /* __KERNEL__ */ +#endif /* _LINUX_VIRTIO_RING_H */ diff --git a/include/media/saa7146.h b/include/media/saa7146.h index cd3ff2c..88b2b5a 100644 --- a/include/media/saa7146.h +++ b/include/media/saa7146.h @@ -12,6 +12,7 @@ #include <asm/io.h> /* for accessing devices */ #include <linux/stringify.h> #include <linux/mutex.h> +#include <linux/scatterlist.h> #include <linux/vmalloc.h> /* for vmalloc() */ #include <linux/mm.h> /* for vmalloc_to_page() */ diff --git a/include/sound/version.h b/include/sound/version.h index 8d4a8dd..a2be8ad 100644 --- a/include/sound/version.h +++ b/include/sound/version.h @@ -1,3 +1,3 @@ /* include/version.h. Generated by alsa/ksync script. */ #define CONFIG_SND_VERSION "1.0.15" -#define CONFIG_SND_DATE " (Tue Oct 16 14:57:44 2007 UTC)" +#define CONFIG_SND_DATE " (Tue Oct 23 06:09:18 2007 UTC)" diff --git a/include/video/Kbuild b/include/video/Kbuild index 53a6c73..0e406f7 100644 --- a/include/video/Kbuild +++ b/include/video/Kbuild @@ -1 +1,2 @@ unifdef-y += sisfb.h uvesafb.h +unifdef-y += edid.h diff --git a/include/video/edid.h b/include/video/edid.h index f6a42d6..928c342 100644 --- a/include/video/edid.h +++ b/include/video/edid.h @@ -1,17 +1,16 @@ #ifndef __linux_video_edid_h__ #define __linux_video_edid_h__ -#ifdef __KERNEL__ - +#if !defined(__KERNEL__) || defined(CONFIG_X86) -#ifdef CONFIG_X86 struct edid_info { unsigned char dummy[128]; }; +#ifdef __KERNEL__ extern struct edid_info edid_info; -#endif /* CONFIG_X86 */ - #endif /* __KERNEL__ */ +#endif + #endif /* __linux_video_edid_h__ */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 80eab7a..1f31422 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -29,12 +29,28 @@ void synchronize_irq(unsigned int irq) { struct irq_desc *desc = irq_desc + irq; + unsigned int status; if (irq >= NR_IRQS) return; - while (desc->status & IRQ_INPROGRESS) - cpu_relax(); + do { + unsigned long flags; + + /* + * Wait until we're out of the critical section. This might + * give the wrong answer due to the lack of memory barriers. + */ + while (desc->status & IRQ_INPROGRESS) + cpu_relax(); + + /* Ok, that indicated we're done: double-check carefully. */ + spin_lock_irqsave(&desc->lock, flags); + status = desc->status; + spin_unlock_irqrestore(&desc->lock, flags); + + /* Oops, that failed? */ + } while (status & IRQ_INPROGRESS); } EXPORT_SYMBOL(synchronize_irq); diff --git a/lib/reed_solomon/decode_rs.c b/lib/reed_solomon/decode_rs.c index a58df56..0ec3f25 100644 --- a/lib/reed_solomon/decode_rs.c +++ b/lib/reed_solomon/decode_rs.c @@ -39,8 +39,7 @@ /* Check length parameter for validity */ pad = nn - nroots - len; - if (pad < 0 || pad >= nn) - return -ERANGE; + BUG_ON(pad < 0 || pad >= nn); /* Does the caller provide the syndrome ? */ if (s != NULL) @@ -203,7 +202,7 @@ * deg(lambda) unequal to number of roots => uncorrectable * error detected */ - count = -1; + count = -EBADMSG; goto finish; } /* diff --git a/lib/reed_solomon/reed_solomon.c b/lib/reed_solomon/reed_solomon.c index 5b0d852..3ea2db9 100644 --- a/lib/reed_solomon/reed_solomon.c +++ b/lib/reed_solomon/reed_solomon.c @@ -320,6 +320,7 @@ EXPORT_SYMBOL_GPL(encode_rs8); * The syndrome and parity uses a uint16_t data type to enable * symbol size > 8. The calling code must take care of decoding of the * syndrome result and the received parity before calling this code. + * Returns the number of corrected bits or -EBADMSG for uncorrectable errors. */ int decode_rs8(struct rs_control *rs, uint8_t *data, uint16_t *par, int len, uint16_t *s, int no_eras, int *eras_pos, uint16_t invmsk, @@ -363,6 +364,7 @@ EXPORT_SYMBOL_GPL(encode_rs16); * @corr: buffer to store correction bitmask on eras_pos * * Each field in the data array contains up to symbol size bits of valid data. + * Returns the number of corrected bits or -EBADMSG for uncorrectable errors. */ int decode_rs16(struct rs_control *rs, uint16_t *data, uint16_t *par, int len, uint16_t *s, int no_eras, int *eras_pos, uint16_t invmsk, @@ -1171,8 +1171,7 @@ munmap_back: vm_flags = vma->vm_flags; if (vma_wants_writenotify(vma)) - vma->vm_page_prot = - protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; + vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); if (!file || !vma_merge(mm, prev, addr, vma->vm_end, vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { diff --git a/mm/mprotect.c b/mm/mprotect.c index 5522784..4de5468 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -194,7 +194,7 @@ success: vma->vm_flags = newflags; vma->vm_page_prot = vm_get_page_prot(newflags); if (vma_wants_writenotify(vma)) { - vma->vm_page_prot = vm_get_page_prot(newflags); + vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); dirty_accountable = 1; } diff --git a/net/9p/Kconfig b/net/9p/Kconfig index 71bc110..bafc50c 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -23,6 +23,13 @@ config NET_9P_FD file descriptors. TCP/IP is the default transport for 9p, so if you are going to use 9p, you'll likely want this. +config NET_9P_VIRTIO + depends on NET_9P && EXPERIMENTAL && VIRTIO + tristate "9P Virtio Transport (Experimental)" + help + This builds support for a transports between + guest partitions and a host partition. + config NET_9P_DEBUG bool "Debug information" depends on NET_9P diff --git a/net/9p/Makefile b/net/9p/Makefile index 5059bc0..d3abb24 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_NET_9P) := 9pnet.o obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o +obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o 9pnet-objs := \ mod.o \ @@ -12,3 +13,6 @@ obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o 9pnet_fd-objs := \ trans_fd.o \ + +9pnet_virtio-objs := \ + trans_virtio.o \ diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c new file mode 100644 index 0000000..40b71a2 --- /dev/null +++ b/net/9p/trans_virtio.c @@ -0,0 +1,353 @@ +/* + * The Guest 9p transport driver + * + * This is a trivial pipe-based transport driver based on the lguest console + * code: we use lguest's DMA mechanism to send bytes out, and register a + * DMA buffer to receive bytes in. It is assumed to be present and available + * from the very beginning of boot. + * + * This may be have been done by just instaniating another HVC console, + * but HVC's blocksize of 16 bytes is annoying and painful to performance. + * + * A more efficient transport could be built based on the virtio block driver + * but it requires some changes in the 9p transport model (which are in + * progress) + * + */ +/* + * Copyright (C) 2007 Eric Van Hensbergen, IBM Corporation + * + * Based on virtio console driver + * Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include <linux/in.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/un.h> +#include <linux/uaccess.h> +#include <linux/inet.h> +#include <linux/idr.h> +#include <linux/file.h> +#include <net/9p/9p.h> +#include <linux/parser.h> +#include <net/9p/transport.h> +#include <linux/scatterlist.h> +#include <linux/virtio.h> +#include <linux/virtio_9p.h> + +/* a single mutex to manage channel initialization and attachment */ +static DECLARE_MUTEX(virtio_9p_lock); +/* global which tracks highest initialized channel */ +static int chan_index; + +/* We keep all per-channel information in a structure. + * This structure is allocated within the devices dev->mem space. + * A pointer to the structure will get put in the transport private. + */ +static struct virtio_chan { + bool initialized; /* channel is initialized */ + bool inuse; /* channel is in use */ + + struct virtqueue *in_vq, *out_vq; + struct virtio_device *vdev; + + /* This is our input buffer, and how much data is left in it. */ + unsigned int in_len; + char *in, *inbuf; + + wait_queue_head_t wq; /* waitq for buffer */ +} channels[MAX_9P_CHAN]; + +/* How many bytes left in this page. */ +static unsigned int rest_of_page(void *data) +{ + return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE); +} + +static int p9_virtio_write(struct p9_trans *trans, void *buf, int count) +{ + struct virtio_chan *chan = (struct virtio_chan *) trans->priv; + struct virtqueue *out_vq = chan->out_vq; + struct scatterlist sg[1]; + unsigned int len; + + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio write (%d)\n", count); + + /* keep it simple - make sure we don't overflow a page */ + if (rest_of_page(buf) < count) + count = rest_of_page(buf); + + sg_init_one(sg, buf, count); + + /* add_buf wants a token to identify this buffer: we hand it any + * non-NULL pointer, since there's only ever one buffer. */ + if (out_vq->vq_ops->add_buf(out_vq, sg, 1, 0, (void *)1) == 0) { + /* Tell Host to go! */ + out_vq->vq_ops->kick(out_vq); + /* Chill out until it's done with the buffer. */ + while (!out_vq->vq_ops->get_buf(out_vq, &len)) + cpu_relax(); + } + + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio wrote (%d)\n", count); + + /* We're expected to return the amount of data we wrote: all of it. */ + return count; +} + +/* Create a scatter-gather list representing our input buffer and put it in the + * queue. */ +static void add_inbuf(struct virtio_chan *chan) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, chan->inbuf, PAGE_SIZE); + + /* We should always be able to add one buffer to an empty queue. */ + if (chan->in_vq->vq_ops->add_buf(chan->in_vq, sg, 0, 1, chan->inbuf)) + BUG(); + chan->in_vq->vq_ops->kick(chan->in_vq); +} + +static int p9_virtio_read(struct p9_trans *trans, void *buf, int count) +{ + struct virtio_chan *chan = (struct virtio_chan *) trans->priv; + struct virtqueue *in_vq = chan->in_vq; + + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio read (%d)\n", count); + + /* If we don't have an input queue yet, we can't get input. */ + BUG_ON(!in_vq); + + /* No buffer? Try to get one. */ + if (!chan->in_len) { + chan->in = in_vq->vq_ops->get_buf(in_vq, &chan->in_len); + if (!chan->in) + return 0; + } + + /* You want more than we have to give? Well, try wanting less! */ + if (chan->in_len < count) + count = chan->in_len; + + /* Copy across to their buffer and increment offset. */ + memcpy(buf, chan->in, count); + chan->in += count; + chan->in_len -= count; + + /* Finished? Re-register buffer so Host will use it again. */ + if (chan->in_len == 0) + add_inbuf(chan); + + P9_DPRINTK(P9_DEBUG_TRANS, "9p debug: virtio finished read (%d)\n", + count); + + return count; +} + +/* The poll function is used by 9p transports to determine if there + * is there is activity available on a particular channel. In our case + * we use it to wait for a callback from the input routines. + */ +static unsigned int +p9_virtio_poll(struct p9_trans *trans, struct poll_table_struct *pt) +{ + struct virtio_chan *chan = (struct virtio_chan *)trans->priv; + struct virtqueue *in_vq = chan->in_vq; + int ret = POLLOUT; /* we can always handle more output */ + + poll_wait(NULL, &chan->wq, pt); + + /* No buffer? Try to get one. */ + if (!chan->in_len) + chan->in = in_vq->vq_ops->get_buf(in_vq, &chan->in_len); + + if (chan->in_len) + ret |= POLLIN; + + return ret; +} + +static void p9_virtio_close(struct p9_trans *trans) +{ + struct virtio_chan *chan = trans->priv; + + down(&virtio_9p_lock); + chan->inuse = false; + up(&virtio_9p_lock); + + kfree(trans); +} + +static bool p9_virtio_intr(struct virtqueue *q) +{ + struct virtio_chan *chan = q->vdev->priv; + + P9_DPRINTK(P9_DEBUG_TRANS, "9p poll_wakeup: %p\n", &chan->wq); + wake_up_interruptible(&chan->wq); + + return true; +} + +static int p9_virtio_probe(struct virtio_device *dev) +{ + int err; + struct virtio_chan *chan; + int index; + + down(&virtio_9p_lock); + index = chan_index++; + chan = &channels[index]; + up(&virtio_9p_lock); + + if (chan_index > MAX_9P_CHAN) { + printk(KERN_ERR "9p: virtio: Maximum channels exceeded\n"); + BUG(); + } + + chan->vdev = dev; + + /* This is the scratch page we use to receive console input */ + chan->inbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!chan->inbuf) { + err = -ENOMEM; + goto fail; + } + + /* Find the input queue. */ + dev->priv = chan; + chan->in_vq = dev->config->find_vq(dev, p9_virtio_intr); + if (IS_ERR(chan->in_vq)) { + err = PTR_ERR(chan->in_vq); + goto free; + } + + chan->out_vq = dev->config->find_vq(dev, NULL); + if (IS_ERR(chan->out_vq)) { + err = PTR_ERR(chan->out_vq); + goto free_in_vq; + } + + init_waitqueue_head(&chan->wq); + + /* Register the input buffer the first time. */ + add_inbuf(chan); + chan->inuse = false; + chan->initialized = true; + + return 0; + +free_in_vq: + dev->config->del_vq(chan->in_vq); +free: + kfree(chan->inbuf); +fail: + down(&virtio_9p_lock); + chan_index--; + up(&virtio_9p_lock); + return err; +} + +/* This sets up a transport channel for 9p communication. Right now + * we only match the first available channel, but eventually we couldlook up + * alternate channels by matching devname versus a virtio_config entry. + * We use a simple reference count mechanism to ensure that only a single + * mount has a channel open at a time. */ +static struct p9_trans *p9_virtio_create(const char *devname, char *args) +{ + struct p9_trans *trans; + int index = 0; + struct virtio_chan *chan = channels; + + down(&virtio_9p_lock); + while (index < MAX_9P_CHAN) { + if (chan->initialized && !chan->inuse) { + chan->inuse = true; + break; + } else { + index++; + chan = &channels[index]; + } + } + up(&virtio_9p_lock); + + if (index >= MAX_9P_CHAN) { + printk(KERN_ERR "9p: virtio: couldn't find a free channel\n"); + return NULL; + } + + trans = kmalloc(sizeof(struct p9_trans), GFP_KERNEL); + if (!trans) { + printk(KERN_ERR "9p: couldn't allocate transport\n"); + return ERR_PTR(-ENOMEM); + } + + trans->write = p9_virtio_write; + trans->read = p9_virtio_read; + trans->close = p9_virtio_close; + trans->poll = p9_virtio_poll; + trans->priv = chan; + + return trans; +} + +#define VIRTIO_ID_9P 9 + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_9P, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +/* The standard "struct lguest_driver": */ +static struct virtio_driver p9_virtio_drv = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = p9_virtio_probe, +}; + +static struct p9_trans_module p9_virtio_trans = { + .name = "virtio", + .create = p9_virtio_create, + .maxsize = PAGE_SIZE, + .def = 0, +}; + +/* The standard init function */ +static int __init p9_virtio_init(void) +{ + int count; + + for (count = 0; count < MAX_9P_CHAN; count++) + channels[count].initialized = false; + + v9fs_register_trans(&p9_virtio_trans); + return register_virtio_driver(&p9_virtio_drv); +} + +module_init(p9_virtio_init); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>"); +MODULE_DESCRIPTION("Virtio 9p Transport"); +MODULE_LICENSE("GPL"); diff --git a/net/ieee80211/ieee80211_crypt_tkip.c b/net/ieee80211/ieee80211_crypt_tkip.c index c796661..8117776 100644 --- a/net/ieee80211/ieee80211_crypt_tkip.c +++ b/net/ieee80211/ieee80211_crypt_tkip.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/scatterlist.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/mm.h> diff --git a/net/ieee80211/ieee80211_crypt_wep.c b/net/ieee80211/ieee80211_crypt_wep.c index 0af6103..9693429 100644 --- a/net/ieee80211/ieee80211_crypt_wep.c +++ b/net/ieee80211/ieee80211_crypt_wep.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/random.h> +#include <linux/scatterlist.h> #include <linux/skbuff.h> #include <linux/mm.h> #include <asm/string.h> diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index cc806d6..a84a233 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -16,7 +16,7 @@ #include <linux/crypto.h> #include <linux/err.h> #include <linux/mm.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include <net/mac80211.h> #include "ieee80211_i.h" diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index d5a9785..658476c 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -56,7 +56,7 @@ #include <linux/ipv6.h> #include <linux/net.h> #include <linux/inet.h> -#include <asm/scatterlist.h> +#include <linux/scatterlist.h> #include <linux/crypto.h> #include <net/sock.h> diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index fb2220a..313d4be 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/pfkeyv2.h> #include <linux/crypto.h> +#include <linux/scatterlist.h> #include <net/xfrm.h> #if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE) #include <net/ah.h> diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 91c15da..d802b5a 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -525,6 +525,20 @@ static int do_ssb_entry(const char *filename, return 1; } +/* Looks like: virtio:dNvN */ +static int do_virtio_entry(const char *filename, struct virtio_device_id *id, + char *alias) +{ + id->device = TO_NATIVE(id->device); + id->vendor = TO_NATIVE(id->vendor); + + strcpy(alias, "virtio:"); + ADD(alias, "d", 1, id->device); + ADD(alias, "v", id->vendor != VIRTIO_DEV_ANY_ID, id->vendor); + + return 1; +} + /* Ignore any prefix, eg. v850 prepends _ */ static inline int sym_is(const char *symbol, const char *name) { @@ -651,6 +665,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, do_table(symval, sym->st_size, sizeof(struct ssb_device_id), "ssb", do_ssb_entry, mod); + else if (sym_is(symname, "__mod_virtio_device_table")) + do_table(symval, sym->st_size, + sizeof(struct virtio_device_id), "virtio", + do_virtio_entry, mod); free(zeros); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 24e1b18..9f3124b 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2977,11 +2977,7 @@ static int selinux_task_prctl(int option, static int selinux_task_wait(struct task_struct *p) { - u32 perm; - - perm = signal_to_av(p->exit_signal); - - return task_has_perm(p, current, perm); + return task_has_perm(p, current, PROCESS__SIGCHLD); } static void selinux_task_reparent_to_init(struct task_struct *p) diff --git a/sound/core/control.c b/sound/core/control.c index 4c3aa8e..df0774c 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -93,15 +93,16 @@ static int snd_ctl_open(struct inode *inode, struct file *file) static void snd_ctl_empty_read_queue(struct snd_ctl_file * ctl) { + unsigned long flags; struct snd_kctl_event *cread; - spin_lock(&ctl->read_lock); + spin_lock_irqsave(&ctl->read_lock, flags); while (!list_empty(&ctl->events)) { cread = snd_kctl_event(ctl->events.next); list_del(&cread->list); kfree(cread); } - spin_unlock(&ctl->read_lock); + spin_unlock_irqrestore(&ctl->read_lock, flags); } static int snd_ctl_release(struct inode *inode, struct file *file) diff --git a/sound/pci/bt87x.c b/sound/pci/bt87x.c index 91f9e6a..2dba752 100644 --- a/sound/pci/bt87x.c +++ b/sound/pci/bt87x.c @@ -165,7 +165,7 @@ struct snd_bt87x_board { unsigned no_digital:1; /* No digital input */ }; -static const __devinitdata struct snd_bt87x_board snd_bt87x_boards[] = { +static __devinitdata struct snd_bt87x_board snd_bt87x_boards[] = { [SND_BT87X_BOARD_UNKNOWN] = { .dig_rate = 32000, /* just a guess */ }, @@ -848,7 +848,7 @@ static int __devinit snd_bt87x_detect_card(struct pci_dev *pci) int i; const struct pci_device_id *supported; - supported = pci_match_device(&driver, pci); + supported = pci_match_id(snd_bt87x_ids, pci); if (supported && supported->driver_data > 0) return supported->driver_data; diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c index 187533e..ad4cb38 100644 --- a/sound/pci/hda/hda_codec.c +++ b/sound/pci/hda/hda_codec.c @@ -626,24 +626,19 @@ int __devinit snd_hda_codec_new(struct hda_bus *bus, unsigned int codec_addr, snd_hda_get_codec_name(codec, bus->card->mixername, sizeof(bus->card->mixername)); -#ifdef CONFIG_SND_HDA_GENERIC if (is_generic_config(codec)) { err = snd_hda_parse_generic_codec(codec); goto patched; } -#endif if (codec->preset && codec->preset->patch) { err = codec->preset->patch(codec); goto patched; } /* call the default parser */ -#ifdef CONFIG_SND_HDA_GENERIC err = snd_hda_parse_generic_codec(codec); -#else - printk(KERN_ERR "hda-codec: No codec parser is available\n"); - err = -ENODEV; -#endif + if (err < 0) + printk(KERN_ERR "hda-codec: No codec parser is available\n"); patched: if (err < 0) { diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h index a79d0ed..20c5e62 100644 --- a/sound/pci/hda/hda_local.h +++ b/sound/pci/hda/hda_local.h @@ -245,7 +245,14 @@ int snd_hda_multi_out_analog_cleanup(struct hda_codec *codec, /* * generic codec parser */ +#ifdef CONFIG_SND_HDA_GENERIC int snd_hda_parse_generic_codec(struct hda_codec *codec); +#else +static inline int snd_hda_parse_generic_codec(struct hda_codec *codec) +{ + return -ENODEV; +} +#endif /* * generic proc interface @@ -303,16 +310,17 @@ enum { extern const char *auto_pin_cfg_labels[AUTO_PIN_LAST]; +#define AUTO_CFG_MAX_OUTS 5 + struct auto_pin_cfg { int line_outs; - hda_nid_t line_out_pins[5]; /* sorted in the order of - * Front/Surr/CLFE/Side - */ + /* sorted in the order of Front/Surr/CLFE/Side */ + hda_nid_t line_out_pins[AUTO_CFG_MAX_OUTS]; int speaker_outs; - hda_nid_t speaker_pins[5]; + hda_nid_t speaker_pins[AUTO_CFG_MAX_OUTS]; int hp_outs; int line_out_type; /* AUTO_PIN_XXX_OUT */ - hda_nid_t hp_pins[5]; + hda_nid_t hp_pins[AUTO_CFG_MAX_OUTS]; hda_nid_t input_pins[AUTO_PIN_LAST]; hda_nid_t dig_out_pin; hda_nid_t dig_in_pin; diff --git a/sound/pci/hda/patch_analog.c b/sound/pci/hda/patch_analog.c index 54cfd45..0ee8ae4 100644 --- a/sound/pci/hda/patch_analog.c +++ b/sound/pci/hda/patch_analog.c @@ -72,7 +72,7 @@ struct ad198x_spec { unsigned int num_kctl_alloc, num_kctl_used; struct snd_kcontrol_new *kctl_alloc; struct hda_input_mux private_imux; - hda_nid_t private_dac_nids[4]; + hda_nid_t private_dac_nids[AUTO_CFG_MAX_OUTS]; unsigned int jack_present :1; @@ -612,7 +612,8 @@ static void ad1986a_hp_automute(struct hda_codec *codec) unsigned int present; present = snd_hda_codec_read(codec, 0x1a, 0, AC_VERB_GET_PIN_SENSE, 0); - spec->jack_present = (present & 0x80000000) != 0; + /* Lenovo N100 seems to report the reversed bit for HP jack-sensing */ + spec->jack_present = !(present & 0x80000000); ad1986a_update_hp(codec); } diff --git a/sound/pci/hda/patch_cmedia.c b/sound/pci/hda/patch_cmedia.c index 2468f31..6c54793 100644 --- a/sound/pci/hda/patch_cmedia.c +++ b/sound/pci/hda/patch_cmedia.c @@ -50,7 +50,7 @@ struct cmi_spec { /* playback */ struct hda_multi_out multiout; - hda_nid_t dac_nids[4]; /* NID for each DAC */ + hda_nid_t dac_nids[AUTO_CFG_MAX_OUTS]; /* NID for each DAC */ int num_dacs; /* capture */ @@ -73,7 +73,6 @@ struct cmi_spec { unsigned int pin_def_confs; /* multichannel pins */ - hda_nid_t multich_pin[4]; /* max 8-channel */ struct hda_verb multi_init[9]; /* 2 verbs for each pin + terminator */ }; diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 080e300..6aa0739 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -85,7 +85,7 @@ struct conexant_spec { unsigned int num_kctl_alloc, num_kctl_used; struct snd_kcontrol_new *kctl_alloc; struct hda_input_mux private_imux; - hda_nid_t private_dac_nids[4]; + hda_nid_t private_dac_nids[AUTO_CFG_MAX_OUTS]; }; @@ -554,10 +554,16 @@ static struct snd_kcontrol_new cxt5045_mixers[] = { .get = conexant_mux_enum_get, .put = conexant_mux_enum_put }, - HDA_CODEC_VOLUME("Int Mic Volume", 0x1a, 0x01, HDA_INPUT), - HDA_CODEC_MUTE("Int Mic Switch", 0x1a, 0x01, HDA_INPUT), - HDA_CODEC_VOLUME("Ext Mic Volume", 0x1a, 0x02, HDA_INPUT), - HDA_CODEC_MUTE("Ext Mic Switch", 0x1a, 0x02, HDA_INPUT), + HDA_CODEC_VOLUME("Int Mic Capture Volume", 0x1a, 0x01, HDA_INPUT), + HDA_CODEC_MUTE("Int Mic Capture Switch", 0x1a, 0x01, HDA_INPUT), + HDA_CODEC_VOLUME("Ext Mic Capture Volume", 0x1a, 0x02, HDA_INPUT), + HDA_CODEC_MUTE("Ext Mic Capture Switch", 0x1a, 0x02, HDA_INPUT), + HDA_CODEC_VOLUME("PCM Playback Volume", 0x17, 0x0, HDA_INPUT), + HDA_CODEC_MUTE("PCM Playback Switch", 0x17, 0x0, HDA_INPUT), + HDA_CODEC_VOLUME("Int Mic Playback Volume", 0x17, 0x1, HDA_INPUT), + HDA_CODEC_MUTE("Int Mic Playback Switch", 0x17, 0x1, HDA_INPUT), + HDA_CODEC_VOLUME("Ext Mic Playback Volume", 0x17, 0x2, HDA_INPUT), + HDA_CODEC_MUTE("Ext Mic Playback Switch", 0x17, 0x2, HDA_INPUT), HDA_BIND_VOL("Master Playback Volume", &cxt5045_hp_bind_master_vol), { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, @@ -576,16 +582,15 @@ static struct hda_verb cxt5045_init_verbs[] = { {0x12, AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_IN }, {0x14, AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_IN|AC_PINCTL_VREF_80 }, /* HP, Amp */ - {0x11, AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_HP }, - {0x17, AC_VERB_SET_CONNECT_SEL,0x01}, - {0x17, AC_VERB_SET_AMP_GAIN_MUTE, - AC_AMP_SET_OUTPUT|AC_AMP_SET_RIGHT|AC_AMP_SET_LEFT|0x01}, - {0x17, AC_VERB_SET_AMP_GAIN_MUTE, - AC_AMP_SET_OUTPUT|AC_AMP_SET_RIGHT|AC_AMP_SET_LEFT|0x02}, - {0x17, AC_VERB_SET_AMP_GAIN_MUTE, - AC_AMP_SET_OUTPUT|AC_AMP_SET_RIGHT|AC_AMP_SET_LEFT|0x03}, - {0x17, AC_VERB_SET_AMP_GAIN_MUTE, - AC_AMP_SET_OUTPUT|AC_AMP_SET_RIGHT|AC_AMP_SET_LEFT|0x04}, + {0x10, AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_OUT}, + {0x10, AC_VERB_SET_CONNECT_SEL, 0x1}, + {0x11, AC_VERB_SET_PIN_WIDGET_CONTROL, PIN_HP}, + {0x11, AC_VERB_SET_CONNECT_SEL, 0x1}, + {0x17, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(0)}, + {0x17, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(1)}, + {0x17, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(2)}, + {0x17, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(3)}, + {0x17, AC_VERB_SET_AMP_GAIN_MUTE, AMP_IN_MUTE(4)}, /* Record selector: Int mic */ {0x1a, AC_VERB_SET_CONNECT_SEL,0x1}, {0x1a, AC_VERB_SET_AMP_GAIN_MUTE, diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 53b0428..d9f78c8 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -238,7 +238,7 @@ struct alc_spec { unsigned int num_kctl_alloc, num_kctl_used; struct snd_kcontrol_new *kctl_alloc; struct hda_input_mux private_imux; - hda_nid_t private_dac_nids[5]; + hda_nid_t private_dac_nids[AUTO_CFG_MAX_OUTS]; /* hooks */ void (*init_hook)(struct hda_codec *codec); diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c index bf95019..f9b2c43 100644 --- a/sound/pci/hda/patch_sigmatel.c +++ b/sound/pci/hda/patch_sigmatel.c @@ -111,6 +111,7 @@ struct sigmatel_spec { unsigned int alt_switch: 1; unsigned int hp_detect: 1; unsigned int gpio_mute: 1; + unsigned int no_vol_knob :1; unsigned int gpio_mask, gpio_data; @@ -1930,7 +1931,8 @@ static int stac92xx_auto_create_hp_ctls(struct hda_codec *codec, } if (spec->multiout.hp_nid) { const char *pfx; - if (old_num_dacs == spec->multiout.num_dacs) + if (old_num_dacs == spec->multiout.num_dacs && + spec->no_vol_knob) pfx = "Master"; else pfx = "Headphone"; @@ -2487,6 +2489,7 @@ static int patch_stac9200(struct hda_codec *codec) codec->spec = spec; spec->num_pins = ARRAY_SIZE(stac9200_pin_nids); spec->pin_nids = stac9200_pin_nids; + spec->no_vol_knob = 1; spec->board_config = snd_hda_check_board_config(codec, STAC_9200_MODELS, stac9200_models, stac9200_cfg_tbl); @@ -2541,6 +2544,7 @@ static int patch_stac925x(struct hda_codec *codec) codec->spec = spec; spec->num_pins = ARRAY_SIZE(stac925x_pin_nids); spec->pin_nids = stac925x_pin_nids; + spec->no_vol_knob = 1; spec->board_config = snd_hda_check_board_config(codec, STAC_925x_MODELS, stac925x_models, stac925x_cfg_tbl); diff --git a/sound/pci/hda/patch_via.c b/sound/pci/hda/patch_via.c index 33b5e1f..4cdf3e6 100644 --- a/sound/pci/hda/patch_via.c +++ b/sound/pci/hda/patch_via.c @@ -114,7 +114,7 @@ struct via_spec { unsigned int num_kctl_alloc, num_kctl_used; struct snd_kcontrol_new *kctl_alloc; struct hda_input_mux private_imux; - hda_nid_t private_dac_nids[4]; + hda_nid_t private_dac_nids[AUTO_CFG_MAX_OUTS]; #ifdef CONFIG_SND_HDA_POWER_SAVE struct hda_loopback_check loopback; diff --git a/sound/sh/aica.c b/sound/sh/aica.c index 131ec48..88dc840 100644 --- a/sound/sh/aica.c +++ b/sound/sh/aica.c @@ -106,11 +106,14 @@ static void spu_write_wait(void) static void spu_memset(u32 toi, u32 what, int length) { int i; + unsigned long flags; snd_assert(length % 4 == 0, return); for (i = 0; i < length; i++) { if (!(i % 8)) spu_write_wait(); + local_irq_save(flags); writel(what, toi + SPU_MEMORY_BASE); + local_irq_restore(flags); toi++; } } @@ -118,6 +121,7 @@ static void spu_memset(u32 toi, u32 what, int length) /* spu_memload - write to SPU address space */ static void spu_memload(u32 toi, void *from, int length) { + unsigned long flags; u32 *froml = from; u32 __iomem *to = (u32 __iomem *) (SPU_MEMORY_BASE + toi); int i; @@ -128,7 +132,9 @@ static void spu_memload(u32 toi, void *from, int length) if (!(i % 8)) spu_write_wait(); val = *froml; + local_irq_save(flags); writel(val, to); + local_irq_restore(flags); froml++; to++; } @@ -138,28 +144,36 @@ static void spu_memload(u32 toi, void *from, int length) static void spu_disable(void) { int i; + unsigned long flags; u32 regval; spu_write_wait(); regval = readl(ARM_RESET_REGISTER); regval |= 1; spu_write_wait(); + local_irq_save(flags); writel(regval, ARM_RESET_REGISTER); + local_irq_restore(flags); for (i = 0; i < 64; i++) { spu_write_wait(); regval = readl(SPU_REGISTER_BASE + (i * 0x80)); regval = (regval & ~0x4000) | 0x8000; spu_write_wait(); + local_irq_save(flags); writel(regval, SPU_REGISTER_BASE + (i * 0x80)); + local_irq_restore(flags); } } /* spu_enable - set spu registers to enable sound output */ static void spu_enable(void) { + unsigned long flags; u32 regval = readl(ARM_RESET_REGISTER); regval &= ~1; spu_write_wait(); + local_irq_save(flags); writel(regval, ARM_RESET_REGISTER); + local_irq_restore(flags); } /* @@ -168,25 +182,34 @@ static void spu_enable(void) */ static void spu_reset(void) { + unsigned long flags; spu_disable(); spu_memset(0, 0, 0x200000 / 4); /* Put ARM7 in endless loop */ + local_irq_save(flags); ctrl_outl(0xea000002, SPU_MEMORY_BASE); + local_irq_restore(flags); spu_enable(); } /* aica_chn_start - write to spu to start playback */ static void aica_chn_start(void) { + unsigned long flags; spu_write_wait(); + local_irq_save(flags); writel(AICA_CMD_KICK | AICA_CMD_START, (u32 *) AICA_CONTROL_POINT); + local_irq_restore(flags); } /* aica_chn_halt - write to spu to halt playback */ static void aica_chn_halt(void) { + unsigned long flags; spu_write_wait(); + local_irq_save(flags); writel(AICA_CMD_KICK | AICA_CMD_STOP, (u32 *) AICA_CONTROL_POINT); + local_irq_restore(flags); } /* ALSA code below */ @@ -213,12 +236,13 @@ static int aica_dma_transfer(int channels, int buffer_size, int q, err, period_offset; struct snd_card_aica *dreamcastcard; struct snd_pcm_runtime *runtime; - err = 0; + unsigned long flags; dreamcastcard = substream->pcm->private_data; period_offset = dreamcastcard->clicks; period_offset %= (AICA_PERIOD_NUMBER / channels); runtime = substream->runtime; for (q = 0; q < channels; q++) { + local_irq_save(flags); err = dma_xfer(AICA_DMA_CHANNEL, (unsigned long) (runtime->dma_area + (AICA_BUFFER_SIZE * q) / @@ -228,9 +252,12 @@ static int aica_dma_transfer(int channels, int buffer_size, AICA_CHANNEL0_OFFSET + q * CHANNEL_OFFSET + AICA_PERIOD_SIZE * period_offset, buffer_size / channels, AICA_DMA_MODE); - if (unlikely(err < 0)) + if (unlikely(err < 0)) { + local_irq_restore(flags); break; + } dma_wait_for_completion(AICA_DMA_CHANNEL); + local_irq_restore(flags); } return err; } diff --git a/sound/sparc/cs4231.c b/sound/sparc/cs4231.c index 9785382..f8c7a12 100644 --- a/sound/sparc/cs4231.c +++ b/sound/sparc/cs4231.c @@ -400,65 +400,44 @@ static void snd_cs4231_mce_up(struct snd_cs4231 *chip) static void snd_cs4231_mce_down(struct snd_cs4231 *chip) { - unsigned long flags; - unsigned long end_time; - int timeout; + unsigned long flags, timeout; + int reg; - spin_lock_irqsave(&chip->lock, flags); snd_cs4231_busy_wait(chip); + spin_lock_irqsave(&chip->lock, flags); #ifdef CONFIG_SND_DEBUG if (__cs4231_readb(chip, CS4231U(chip, REGSEL)) & CS4231_INIT) snd_printdd("mce_down [%p] - auto calibration time out (0)\n", CS4231U(chip, REGSEL)); #endif chip->mce_bit &= ~CS4231_MCE; - timeout = __cs4231_readb(chip, CS4231U(chip, REGSEL)); - __cs4231_writeb(chip, chip->mce_bit | (timeout & 0x1f), + reg = __cs4231_readb(chip, CS4231U(chip, REGSEL)); + __cs4231_writeb(chip, chip->mce_bit | (reg & 0x1f), CS4231U(chip, REGSEL)); - if (timeout == 0x80) - snd_printdd("mce_down [%p]: serious init problem - " - "codec still busy\n", - chip->port); - if ((timeout & CS4231_MCE) == 0) { + if (reg == 0x80) + snd_printdd("mce_down [%p]: serious init problem " + "- codec still busy\n", chip->port); + if ((reg & CS4231_MCE) == 0) { spin_unlock_irqrestore(&chip->lock, flags); return; } /* - * Wait for (possible -- during init auto-calibration may not be set) - * calibration process to start. Needs upto 5 sample periods on AD1848 - * which at the slowest possible rate of 5.5125 kHz means 907 us. + * Wait for auto-calibration (AC) process to finish, i.e. ACI to go low. */ - msleep(1); - - /* check condition up to 250ms */ - end_time = jiffies + msecs_to_jiffies(250); - while (snd_cs4231_in(chip, CS4231_TEST_INIT) & - CS4231_CALIB_IN_PROGRESS) { - + timeout = jiffies + msecs_to_jiffies(250); + do { spin_unlock_irqrestore(&chip->lock, flags); - if (time_after(jiffies, end_time)) { - snd_printk("mce_down - " - "auto calibration time out (2)\n"); - return; - } - msleep(1); - spin_lock_irqsave(&chip->lock, flags); - } - - /* check condition up to 100ms */ - end_time = jiffies + msecs_to_jiffies(100); - while (__cs4231_readb(chip, CS4231U(chip, REGSEL)) & CS4231_INIT) { - spin_unlock_irqrestore(&chip->lock, flags); - if (time_after(jiffies, end_time)) { - snd_printk("mce_down - " - "auto calibration time out (3)\n"); - return; - } msleep(1); spin_lock_irqsave(&chip->lock, flags); - } + reg = snd_cs4231_in(chip, CS4231_TEST_INIT); + reg &= CS4231_CALIB_IN_PROGRESS; + } while (reg && time_before(jiffies, timeout)); spin_unlock_irqrestore(&chip->lock, flags); + + if (reg) + snd_printk(KERN_ERR + "mce_down - auto calibration time out (2)\n"); } static void snd_cs4231_advance_dma(struct cs4231_dma_control *dma_cont, diff --git a/sound/usb/usbquirks.h b/sound/usb/usbquirks.h index 743568f..59410f4 100644 --- a/sound/usb/usbquirks.h +++ b/sound/usb/usbquirks.h @@ -84,6 +84,15 @@ USB_DEVICE_ID_MATCH_INT_CLASS | USB_DEVICE_ID_MATCH_INT_SUBCLASS, .idVendor = 0x046d, + .idProduct = 0x08f5, + .bInterfaceClass = USB_CLASS_AUDIO, + .bInterfaceSubClass = USB_SUBCLASS_AUDIO_CONTROL +}, +{ + .match_flags = USB_DEVICE_ID_MATCH_DEVICE | + USB_DEVICE_ID_MATCH_INT_CLASS | + USB_DEVICE_ID_MATCH_INT_SUBCLASS, + .idVendor = 0x046d, .idProduct = 0x08f6, .bInterfaceClass = USB_CLASS_AUDIO, .bInterfaceSubClass = USB_SUBCLASS_AUDIO_CONTROL |