Merge upstream QEMU 10.0.50 into the Android source tree.

This change integrates many changes from the upstream QEMU sources. Its main purpose is to enable correct ARMv6 and ARMv7 support in the Android emulator. Due to the nature of the upstream code base, this unfortunately also required changes to many other parts of the source. Note that to ensure easier integrations in the future, some source files and directories that have heavy Android-specific customization have been renamed with an -android suffix. The original files are still there for easier integration tracking, but are *never* compiled. For example: net.c -> net-android.c, qemu-char.c -> qemu-char-android.c, slirp/ -> slirp-android/, etc. Tested on linux-x86, darwin-x86 and windows host machines.
author: David 'Digit' Turner <digit@google.com> 2009-09-14 14:32:27 -0700
committer: David 'Digit' Turner <digit@google.com> 2009-09-14 14:32:27 -0700
commit: 5d8f37ad78fc66901af50c762029a501561f3b23 (patch)
tree: 206790f8f21000850a98c4f9590a79e779106278 /block
parent: cd059b15f2c7df69f4a087bd66900eb172e41d1c (diff)
download: external_qemu-5d8f37ad78fc66901af50c762029a501561f3b23.zip
external_qemu-5d8f37ad78fc66901af50c762029a501561f3b23.tar.gz
external_qemu-5d8f37ad78fc66901af50c762029a501561f3b23.tar.bz2
17 files changed, 11923 insertions, 0 deletions
diff --git a/block/bochs.c b/block/bochs.c
new file mode 100644
index 0000000..bac81c4
--- /dev/null
+++ b/block/bochs.c
@@ -0,0 +1,259 @@
+/*
+ * Block driver for the various disk image formats used by Bochs
+ * Currently only for "growing" type in read-only mode
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "Bochs Virtual HD Image"
+#define HEADER_VERSION 0x00020000
+#define HEADER_V1 0x00010000
+#define HEADER_SIZE 512
+
+#define REDOLOG_TYPE "Redolog"
+#define GROWING_TYPE "Growing"
+
+// not allocated: 0xffffffff
+
+// always little-endian
+struct bochs_header_v1 {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    uint32_t version;
+    uint32_t header; // size of header
+
+    union {
+	struct {
+	    uint32_t catalog; // num of entries
+	    uint32_t bitmap; // bitmap size
+	    uint32_t extent; // extent size
+	    uint64_t disk; // disk size
+	    char padding[HEADER_SIZE - 64 - 8 - 20];
+	} redolog;
+	char padding[HEADER_SIZE - 64 - 8];
+    } extra;
+};
+
+// always little-endian
+struct bochs_header {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    uint32_t version;
+    uint32_t header; // size of header
+
+    union {
+	struct {
+	    uint32_t catalog; // num of entries
+	    uint32_t bitmap; // bitmap size
+	    uint32_t extent; // extent size
+	    uint32_t reserved; // for ???
+	    uint64_t disk; // disk size
+	    char padding[HEADER_SIZE - 64 - 8 - 24];
+	} redolog;
+	char padding[HEADER_SIZE - 64 - 8];
+    } extra;
+};
+
+typedef struct BDRVBochsState {
+    int fd;
+
+    uint32_t *catalog_bitmap;
+    int catalog_size;
+
+    int data_offset;
+
+    int bitmap_blocks;
+    int extent_blocks;
+    int extent_size;
+} BDRVBochsState;
+
+static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct bochs_header *bochs = (const void *)buf;
+
+    if (buf_size < HEADER_SIZE)
+	return 0;
+
+    if (!strcmp(bochs->magic, HEADER_MAGIC) &&
+	!strcmp(bochs->type, REDOLOG_TYPE) &&
+	!strcmp(bochs->subtype, GROWING_TYPE) &&
+	((le32_to_cpu(bochs->version) == HEADER_VERSION) ||
+	(le32_to_cpu(bochs->version) == HEADER_V1)))
+	return 100;
+
+    return 0;
+}
+
+/* Open a Bochs "growing" redolog image and load its extent catalog.
+ * Returns 0 on success, -1 on failure (the descriptor is closed again). */
+static int bochs_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVBochsState *s = bs->opaque;
+    int fd, i;
+    struct bochs_header bochs;
+    struct bochs_header_v1 header_v1;
+
+    fd = open(filename, O_RDWR | O_BINARY);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY);
+        if (fd < 0)
+            return -1;
+    }
+
+    bs->read_only = 1; // no write support yet
+    s->fd = fd;
+
+    if (read(fd, &bochs, sizeof(bochs)) != sizeof(bochs))
+        goto fail;
+
+    if (strcmp(bochs.magic, HEADER_MAGIC) ||
+        strcmp(bochs.type, REDOLOG_TYPE) ||
+        strcmp(bochs.subtype, GROWING_TYPE) ||
+	((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
+	(le32_to_cpu(bochs.version) != HEADER_V1))) {
+        goto fail;
+    }
+
+    if (le32_to_cpu(bochs.version) == HEADER_V1) {
+      memcpy(&header_v1, &bochs, sizeof(bochs));
+      bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512;
+    } else {
+      bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
+    }
+
+    if (lseek(s->fd, le32_to_cpu(bochs.header), SEEK_SET) < 0)
+        goto fail;
+
+    s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
+    s->catalog_bitmap = qemu_malloc(s->catalog_size * 4);
+    if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) !=
+	s->catalog_size * 4) {
+	qemu_free(s->catalog_bitmap); // don't leak the catalog on a short read
+	goto fail;
+    }
+    for (i = 0; i < s->catalog_size; i++)
+	le32_to_cpus(&s->catalog_bitmap[i]);
+
+    s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4);
+    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512;
+    s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512;
+    s->extent_size = le32_to_cpu(bochs.extra.redolog.extent);
+    return 0;
+ fail:
+    close(fd);
+    return -1;
+}
+
+/* Position s->fd on guest sector 'sector_num'.  Returns 0 with the file
+ * offset at the sector's 512 data bytes, or -1 when the sector is not
+ * allocated or cannot be located (bochs_read() then supplies zeroes). */
+static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVBochsState *s = bs->opaque;
+    int64_t offset = sector_num * 512;
+    int64_t extent_index, extent_offset, bitmap_offset, block_offset;
+    char bitmap_entry;
+
+    // locate the extent and the sector's index inside it
+    extent_index = offset / s->extent_size;
+    extent_offset = (offset % s->extent_size) / 512;
+
+    // catalog_size comes from the image header: never index past the catalog
+    if (extent_index < 0 || extent_index >= s->catalog_size)
+	return -1;
+
+    if (s->catalog_bitmap[extent_index] == 0xffffffff)
+	return -1; // extent not allocated
+
+    // widen to 64 bits first: the 32-bit product wraps for offsets >= 4 GiB
+    bitmap_offset = s->data_offset +
+	(512 * (int64_t)s->catalog_bitmap[extent_index] *
+	(s->extent_blocks + s->bitmap_blocks));
+    block_offset = bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
+
+    // read in bitmap for current extent
+    if (lseek(s->fd, bitmap_offset + (extent_offset / 8), SEEK_SET) < 0)
+	return -1;
+
+    // a failed or short read would leave bitmap_entry indeterminate
+    if (read(s->fd, &bitmap_entry, 1) != 1)
+	return -1;
+
+    if (!((bitmap_entry >> (extent_offset % 8)) & 1))
+	return -1; // sector not allocated inside the extent
+
+    if (lseek(s->fd, block_offset, SEEK_SET) < 0)
+	return -1;
+
+    return 0;
+}
+
+static int bochs_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVBochsState *s = bs->opaque;
+    int ret;
+
+    while (nb_sectors > 0) {
+	if (!seek_to_sector(bs, sector_num))
+	{
+	    ret = read(s->fd, buf, 512);
+	    if (ret != 512)
+		return -1;
+	}
+	else
+            memset(buf, 0, 512);
+        nb_sectors--;
+        sector_num++;
+        buf += 512;
+    }
+    return 0;
+}
+
+static void bochs_close(BlockDriverState *bs)
+{
+    BDRVBochsState *s = bs->opaque;
+    qemu_free(s->catalog_bitmap);
+    close(s->fd);
+}
+
+static BlockDriver bdrv_bochs = {
+    .format_name	= "bochs",
+    .instance_size	= sizeof(BDRVBochsState),
+    .bdrv_probe		= bochs_probe,
+    .bdrv_open		= bochs_open,
+    .bdrv_read		= bochs_read,
+    .bdrv_close		= bochs_close,
+};
+
+static void bdrv_bochs_init(void)
+{
+    bdrv_register(&bdrv_bochs);
+}
+
+block_init(bdrv_bochs_init);
diff --git a/block/cloop.c b/block/cloop.c
new file mode 100644
index 0000000..06c687e
--- /dev/null
+++ b/block/cloop.c
@@ -0,0 +1,171 @@
+/*
+ * QEMU Block driver for CLOOP images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+
+typedef struct BDRVCloopState {
+    int fd;
+    uint32_t block_size;
+    uint32_t n_blocks;
+    uint64_t* offsets;
+    uint32_t sectors_per_block;
+    uint32_t current_block;
+    uint8_t *compressed_block;
+    uint8_t *uncompressed_block;
+    z_stream zstream;
+} BDRVCloopState;
+
+static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const char* magic_version_2_0="#!/bin/sh\n"
+	"#V2.0 Format\n"
+	"modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n";
+    int length=strlen(magic_version_2_0);
+    if(length>buf_size)
+	length=buf_size;
+    if(!memcmp(magic_version_2_0,buf,length))
+	return 2;
+    return 0;
+}
+
+/* Open a cloop image read-only and load its table of block offsets.
+ * Returns 0 on success, -errno if open() fails and -1 on any other error. */
+static int cloop_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVCloopState *s = bs->opaque;
+    uint32_t offsets_size,max_compressed_block_size=1,i;
+
+    s->fd = open(filename, O_RDONLY | O_BINARY);
+    if (s->fd < 0)
+        return -errno;
+    bs->read_only = 1;
+
+    /* read header */
+    if(lseek(s->fd,128,SEEK_SET)<0) {
+cloop_close:
+	close(s->fd);
+	return -1;
+    }
+    if(read(s->fd,&s->block_size,4)<4)
+	goto cloop_close;
+    s->block_size=be32_to_cpu(s->block_size);
+    if(read(s->fd,&s->n_blocks,4)<4)
+	goto cloop_close;
+    s->n_blocks=be32_to_cpu(s->n_blocks);
+
+    /* read offsets: n_blocks+1 entries, the last marks the end of the data */
+    offsets_size=(s->n_blocks+1)*sizeof(uint64_t);
+    s->offsets=(uint64_t*)qemu_malloc(offsets_size);
+    if(read(s->fd,s->offsets,offsets_size)<offsets_size)
+	goto cloop_close;
+    for(i=0;i<s->n_blocks+1;i++) {
+	s->offsets[i]=be64_to_cpu(s->offsets[i]);
+	if(i>0) {
+	    uint32_t size=s->offsets[i]-s->offsets[i-1];
+	    if(size>max_compressed_block_size)
+		max_compressed_block_size=size;
+	}
+    }
+    /* initialize zlib engine */
+    s->compressed_block = qemu_malloc(max_compressed_block_size+1);
+    s->uncompressed_block = qemu_malloc(s->block_size);
+    if(inflateInit(&s->zstream) != Z_OK)
+	goto cloop_close;
+    s->current_block=s->n_blocks;
+    s->sectors_per_block = s->block_size/512;
+    bs->total_sectors = s->n_blocks*s->sectors_per_block;
+    return 0;
+}
+
+static inline int cloop_read_block(BDRVCloopState *s,int block_num)
+{
+    if(s->current_block != block_num) {
+	int ret;
+        uint32_t bytes = s->offsets[block_num+1]-s->offsets[block_num];
+
+	lseek(s->fd, s->offsets[block_num], SEEK_SET);
+        ret = read(s->fd, s->compressed_block, bytes);
+        if (ret != bytes)
+            return -1;
+
+	s->zstream.next_in = s->compressed_block;
+	s->zstream.avail_in = bytes;
+	s->zstream.next_out = s->uncompressed_block;
+	s->zstream.avail_out = s->block_size;
+	ret = inflateReset(&s->zstream);
+	if(ret != Z_OK)
+	    return -1;
+	ret = inflate(&s->zstream, Z_FINISH);
+	if(ret != Z_STREAM_END || s->zstream.total_out != s->block_size)
+	    return -1;
+
+	s->current_block = block_num;
+    }
+    return 0;
+}
+
+static int cloop_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVCloopState *s = bs->opaque;
+    int i;
+
+    for(i=0;i<nb_sectors;i++) {
+	uint32_t sector_offset_in_block=((sector_num+i)%s->sectors_per_block),
+	    block_num=(sector_num+i)/s->sectors_per_block;
+	if(cloop_read_block(s, block_num) != 0)
+	    return -1;
+	memcpy(buf+i*512,s->uncompressed_block+sector_offset_in_block*512,512);
+    }
+    return 0;
+}
+
+static void cloop_close(BlockDriverState *bs)
+{
+    BDRVCloopState *s = bs->opaque;
+    close(s->fd);
+    if(s->n_blocks>0)
+	qemu_free(s->offsets); /* from qemu_malloc(): release with qemu_free() */
+    qemu_free(s->compressed_block);
+    qemu_free(s->uncompressed_block);
+    inflateEnd(&s->zstream); /* tear down the zlib inflate state */
+}
+
+static BlockDriver bdrv_cloop = {
+    .format_name	= "cloop",
+    .instance_size	= sizeof(BDRVCloopState),
+    .bdrv_probe		= cloop_probe,
+    .bdrv_open		= cloop_open,
+    .bdrv_read		= cloop_read,
+    .bdrv_close		= cloop_close,
+};
+
+static void bdrv_cloop_init(void)
+{
+    bdrv_register(&bdrv_cloop);
+}
+
+block_init(bdrv_cloop_init);
diff --git a/block/cow.c b/block/cow.c
new file mode 100644
index 0000000..84818f1
--- /dev/null
+++ b/block/cow.c
@@ -0,0 +1,299 @@
+/*
+ * Block driver for the COW format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef _WIN32
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <sys/mman.h>
+
+/**************************************************************/
+/* COW block driver using file system holes */
+
+/* user mode linux compatible COW file */
+#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
+#define COW_VERSION 2
+
+struct cow_header_v2 {
+    uint32_t magic;
+    uint32_t version;
+    char backing_file[1024];
+    int32_t mtime;
+    uint64_t size;
+    uint32_t sectorsize;
+};
+
+typedef struct BDRVCowState {
+    int fd;
+    uint8_t *cow_bitmap; /* if non NULL, COW mappings are used first */
+    uint8_t *cow_bitmap_addr; /* mmap address of cow_bitmap */
+    int cow_bitmap_size;
+    int64_t cow_sectors_offset;
+} BDRVCowState;
+
+static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct cow_header_v2 *cow_header = (const void *)buf;
+
+    if (buf_size >= sizeof(struct cow_header_v2) &&
+        be32_to_cpu(cow_header->magic) == COW_MAGIC &&
+        be32_to_cpu(cow_header->version) == COW_VERSION)
+        return 100;
+    else
+        return 0;
+}
+
+static int cow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVCowState *s = bs->opaque;
+    int fd;
+    struct cow_header_v2 cow_header;
+    int64_t size;
+
+    fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE);
+        if (fd < 0)
+            return -1;
+    }
+    s->fd = fd;
+    /* see if it is a cow image */
+    if (read(fd, &cow_header, sizeof(cow_header)) != sizeof(cow_header)) {
+        goto fail;
+    }
+
+    if (be32_to_cpu(cow_header.magic) != COW_MAGIC ||
+        be32_to_cpu(cow_header.version) != COW_VERSION) {
+        goto fail;
+    }
+
+    /* cow image found */
+    size = be64_to_cpu(cow_header.size);
+    bs->total_sectors = size / 512;
+
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file),
+            cow_header.backing_file);
+
+    /* mmap the bitmap */
+    s->cow_bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header);
+    s->cow_bitmap_addr = (void *)mmap(get_mmap_addr(s->cow_bitmap_size),
+                                      s->cow_bitmap_size,
+                                      PROT_READ | PROT_WRITE,
+                                      MAP_SHARED, s->fd, 0);
+    if (s->cow_bitmap_addr == MAP_FAILED)
+        goto fail;
+    s->cow_bitmap = s->cow_bitmap_addr + sizeof(cow_header);
+    s->cow_sectors_offset = (s->cow_bitmap_size + 511) & ~511;
+    return 0;
+ fail:
+    close(fd);
+    return -1;
+}
+
+static inline void cow_set_bit(uint8_t *bitmap, int64_t bitnum)
+{
+    bitmap[bitnum / 8] |= (1 << (bitnum%8));
+}
+
+static inline int is_bit_set(const uint8_t *bitmap, int64_t bitnum)
+{
+    return !!(bitmap[bitnum / 8] & (1 << (bitnum%8)));
+}
+
+
+/* Return true if first block has been changed (ie. current version is
+ * in COW file).  Set the number of continuous blocks for which that
+ * is true. */
+static inline int is_changed(uint8_t *bitmap,
+                             int64_t sector_num, int nb_sectors,
+                             int *num_same)
+{
+    int changed;
+
+    if (!bitmap || nb_sectors == 0) {
+	*num_same = nb_sectors;
+	return 0;
+    }
+
+    changed = is_bit_set(bitmap, sector_num);
+    for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) {
+	if (is_bit_set(bitmap, sector_num + *num_same) != changed)
+	    break;
+    }
+
+    return changed;
+}
+
+static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                            int nb_sectors, int *pnum)
+{
+    BDRVCowState *s = bs->opaque;
+    return is_changed(s->cow_bitmap, sector_num, nb_sectors, pnum);
+}
+
+static int cow_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVCowState *s = bs->opaque;
+    int ret, n;
+
+    while (nb_sectors > 0) {
+        if (is_changed(s->cow_bitmap, sector_num, nb_sectors, &n)) {
+            lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET);
+            ret = read(s->fd, buf, n * 512);
+            if (ret != n * 512)
+                return -1;
+        } else {
+            if (bs->backing_hd) {
+                /* read from the base image */
+                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+                if (ret < 0)
+                    return -1;
+            } else {
+            memset(buf, 0, n * 512);
+        }
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    return 0;
+}
+
+static int cow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVCowState *s = bs->opaque;
+    int ret, i;
+
+    lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET);
+    ret = write(s->fd, buf, nb_sectors * 512);
+    if (ret != nb_sectors * 512)
+        return -1;
+    for (i = 0; i < nb_sectors; i++)
+        cow_set_bit(s->cow_bitmap, sector_num + i);
+    return 0;
+}
+
+static void cow_close(BlockDriverState *bs)
+{
+    BDRVCowState *s = bs->opaque;
+    munmap((void *)s->cow_bitmap_addr, s->cow_bitmap_size);
+    close(s->fd);
+}
+
+/* Create an empty COW image 'filename'.  BLOCK_OPT_SIZE gives the virtual
+ * size and BLOCK_OPT_BACKING_FILE an optional base image.
+ * Returns 0 on success, -1 if the image cannot be created or written. */
+static int cow_create(const char *filename, QEMUOptionParameter *options)
+{
+    int fd, cow_fd, ret = 0;
+    struct cow_header_v2 cow_header;
+    struct stat st;
+    int64_t image_sectors = 0;
+    const char *image_filename = NULL;
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            image_sectors = options->value.n / 512;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            image_filename = options->value.s;
+        }
+        options++;
+    }
+
+    cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (cow_fd < 0)
+        return -1;
+    memset(&cow_header, 0, sizeof(cow_header));
+    cow_header.magic = cpu_to_be32(COW_MAGIC);
+    cow_header.version = cpu_to_be32(COW_VERSION);
+    if (image_filename) {
+        /* Note: if no file, we put a dummy mtime */
+        cow_header.mtime = cpu_to_be32(0);
+        fd = open(image_filename, O_RDONLY | O_BINARY);
+        if (fd < 0)
+            goto mtime_fail; /* cow_fd stays open: the header is still written */
+        if (fstat(fd, &st) != 0) {
+            close(fd);
+            goto mtime_fail;
+        }
+        close(fd);
+        cow_header.mtime = cpu_to_be32(st.st_mtime);
+    mtime_fail:
+        pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
+                image_filename);
+    }
+    cow_header.sectorsize = cpu_to_be32(512);
+    cow_header.size = cpu_to_be64(image_sectors * 512);
+    /* write the header, then resize to include at least all the bitmap */
+    if (write(cow_fd, &cow_header, sizeof(cow_header)) != sizeof(cow_header) ||
+        ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)) != 0)
+        ret = -1;
+    close(cow_fd);
+    return ret;
+}
+
+static void cow_flush(BlockDriverState *bs)
+{
+    BDRVCowState *s = bs->opaque;
+    fsync(s->fd);
+}
+
+static QEMUOptionParameter cow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_cow = {
+    .format_name	= "cow",
+    .instance_size	= sizeof(BDRVCowState),
+    .bdrv_probe		= cow_probe,
+    .bdrv_open		= cow_open,
+    .bdrv_read		= cow_read,
+    .bdrv_write		= cow_write,
+    .bdrv_close		= cow_close,
+    .bdrv_create	= cow_create,
+    .bdrv_flush		= cow_flush,
+    .bdrv_is_allocated	= cow_is_allocated,
+
+    .create_options = cow_create_options,
+};
+
+static void bdrv_cow_init(void)
+{
+    bdrv_register(&bdrv_cow);
+}
+
+block_init(bdrv_cow_init);
+#endif
diff --git a/block/dmg.c b/block/dmg.c
new file mode 100644
index 0000000..262560f
--- /dev/null
+++ b/block/dmg.c
@@ -0,0 +1,301 @@
+/*
+ * QEMU Block driver for DMG images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "bswap.h"
+#include "module.h"
+#include <zlib.h>
+
+typedef struct BDRVDMGState {
+    int fd;
+
+    /* each chunk contains a certain number of sectors,
+     * offsets[i] is the offset in the .dmg file,
+     * lengths[i] is the length of the compressed chunk,
+     * sectors[i] is the sector beginning at offsets[i],
+     * sectorcounts[i] is the number of sectors in that chunk,
+     * the sectors array is ordered
+     * 0<=i<n_chunks */
+
+    uint32_t n_chunks;
+    uint32_t* types;
+    uint64_t* offsets;
+    uint64_t* lengths;
+    uint64_t* sectors;
+    uint64_t* sectorcounts;
+    uint32_t current_chunk;
+    uint8_t *compressed_chunk;
+    uint8_t *uncompressed_chunk;
+    z_stream zstream;
+} BDRVDMGState;
+
+static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    int len=strlen(filename);
+    if(len>4 && !strcmp(filename+len-4,".dmg"))
+	return 2;
+    return 0;
+}
+
+static off_t read_off(int fd)
+{
+	uint64_t buffer;
+	if(read(fd,&buffer,8)<8)
+		return 0;
+	return be64_to_cpu(buffer);
+}
+
+static off_t read_uint32(int fd)
+{
+	uint32_t buffer;
+	if(read(fd,&buffer,4)<4)
+		return 0;
+	return be32_to_cpu(buffer);
+}
+
+static int dmg_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVDMGState *s = bs->opaque;
+    off_t info_begin,info_end,last_in_offset,last_out_offset;
+    uint32_t count;
+    uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
+
+    s->fd = open(filename, O_RDONLY | O_BINARY);
+    if (s->fd < 0)
+        return -errno;
+    bs->read_only = 1;
+    s->n_chunks = 0;
+    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
+
+    /* read offset of info blocks */
+    if(lseek(s->fd,-0x1d8,SEEK_END)<0) {
+dmg_close:
+	close(s->fd);
+	/* open raw instead */
+	bs->drv=bdrv_find_format("raw");
+	return bs->drv->bdrv_open(bs, filename, flags);
+    }
+    info_begin=read_off(s->fd);
+    if(info_begin==0)
+	goto dmg_close;
+    if(lseek(s->fd,info_begin,SEEK_SET)<0)
+	goto dmg_close;
+    if(read_uint32(s->fd)!=0x100)
+	goto dmg_close;
+    if((count = read_uint32(s->fd))==0)
+	goto dmg_close;
+    info_end = info_begin+count;
+    if(lseek(s->fd,0xf8,SEEK_CUR)<0)
+	goto dmg_close;
+
+    /* read offsets */
+    last_in_offset = last_out_offset = 0;
+    while(lseek(s->fd,0,SEEK_CUR)<info_end) {
+        uint32_t type;
+
+	count = read_uint32(s->fd);
+	if(count==0)
+	    goto dmg_close;
+	type = read_uint32(s->fd);
+	if(type!=0x6d697368 || count<244)
+	    lseek(s->fd,count-4,SEEK_CUR);
+	else {
+	    int new_size, chunk_count;
+	    if(lseek(s->fd,200,SEEK_CUR)<0)
+	        goto dmg_close;
+	    chunk_count = (count-204)/40;
+	    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+	    s->types = qemu_realloc(s->types, new_size/2);
+	    s->offsets = qemu_realloc(s->offsets, new_size);
+	    s->lengths = qemu_realloc(s->lengths, new_size);
+	    s->sectors = qemu_realloc(s->sectors, new_size);
+	    s->sectorcounts = qemu_realloc(s->sectorcounts, new_size);
+
+	    for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) {
+		s->types[i] = read_uint32(s->fd);
+		if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
+		    if(s->types[i]==0xffffffff) {
+			last_in_offset = s->offsets[i-1]+s->lengths[i-1];
+			last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1];
+		    }
+		    chunk_count--;
+		    i--;
+		    if(lseek(s->fd,36,SEEK_CUR)<0)
+			goto dmg_close;
+		    continue;
+		}
+		read_uint32(s->fd);
+		s->sectors[i] = last_out_offset+read_off(s->fd);
+		s->sectorcounts[i] = read_off(s->fd);
+		s->offsets[i] = last_in_offset+read_off(s->fd);
+		s->lengths[i] = read_off(s->fd);
+		if(s->lengths[i]>max_compressed_size)
+		    max_compressed_size = s->lengths[i];
+		if(s->sectorcounts[i]>max_sectors_per_chunk)
+		    max_sectors_per_chunk = s->sectorcounts[i];
+	    }
+	    s->n_chunks+=chunk_count;
+	}
+    }
+
+    /* initialize zlib engine */
+    s->compressed_chunk = qemu_malloc(max_compressed_size+1);
+    s->uncompressed_chunk = qemu_malloc(512*max_sectors_per_chunk);
+    if(inflateInit(&s->zstream) != Z_OK)
+	goto dmg_close;
+
+    s->current_chunk = s->n_chunks;
+
+    return 0;
+}
+
+static inline int is_sector_in_chunk(BDRVDMGState* s,
+		uint32_t chunk_num,int sector_num)
+{
+    if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num ||
+	    s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num)
+	return 0; /* chunk_num out of range, or sector_num outside that chunk */
+    else
+	return -1; /* non-zero, i.e. true: sector_num lies inside the chunk */
+}
+
+static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num)
+{
+    /* binary search over the half-open chunk range [chunk1, chunk2) */
+    uint32_t chunk1=0,chunk2=s->n_chunks,chunk3;
+    while(chunk1!=chunk2) {
+	chunk3 = (chunk1+chunk2)/2;
+	if(s->sectors[chunk3]>sector_num)
+	    chunk2 = chunk3;
+	else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num)
+	    return chunk3;
+	else
+	    chunk1 = chunk3+1; /* exclude chunk3, or the loop never ends once chunk2==chunk1+1 */
+    }
+    return s->n_chunks; /* error: no chunk contains sector_num */
+}
+
+/* Make the chunk holding sector_num current, reading and decoding it into
+ * s->uncompressed_chunk when needed.  Returns 0 on success, -1 on error. */
+static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num)
+{
+    if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) {
+	int ret;
+	uint32_t chunk = search_chunk(s,sector_num);
+
+	if(chunk>=s->n_chunks)
+	    return -1;
+	s->current_chunk = s->n_chunks;
+	switch(s->types[chunk]) {
+	case 0x80000005: { /* zlib compressed */
+	    int i;
+
+	    if(lseek(s->fd, s->offsets[chunk], SEEK_SET)<0) /* test the off_t itself, not an int copy */
+		return -1;
+
+	    /* we need to buffer, because only the chunk as whole can be
+	     * inflated. */
+	    i=0;
+	    do {
+		ret = read(s->fd, s->compressed_chunk+i, s->lengths[chunk]-i);
+		if(ret>0)
+		    i+=ret;
+		/* retry when interrupted, stop on EOF or a real error */
+	    } while((ret>0 || (ret<0 && errno==EINTR)) && i<s->lengths[chunk]);
+
+	    if (i != s->lengths[chunk]) /* i is the running total, ret only the last read */
+		return -1;
+
+	    s->zstream.next_in = s->compressed_chunk;
+	    s->zstream.avail_in = s->lengths[chunk];
+	    s->zstream.next_out = s->uncompressed_chunk;
+	    s->zstream.avail_out = 512*s->sectorcounts[chunk];
+	    ret = inflateReset(&s->zstream);
+	    if(ret != Z_OK)
+		return -1;
+	    ret = inflate(&s->zstream, Z_FINISH);
+	    if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk])
+		return -1;
+	    break; }
+	case 1: /* copy: raw data, stored at offsets[chunk] like the other types */
+	    if(lseek(s->fd, s->offsets[chunk], SEEK_SET)<0 ||
+		    read(s->fd, s->uncompressed_chunk, s->lengths[chunk]) != s->lengths[chunk])
+		return -1;
+	    break;
+	case 2: /* zero */
+	    memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]);
+	    break;
+	}
+	s->current_chunk = chunk;
+    }
+    return 0;
+}
+
+static int dmg_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVDMGState *s = bs->opaque;
+    int i;
+
+    for(i=0;i<nb_sectors;i++) {
+	uint32_t sector_offset_in_chunk;
+	if(dmg_read_chunk(s, sector_num+i) != 0)
+	    return -1;
+	sector_offset_in_chunk = sector_num+i-s->sectors[s->current_chunk];
+	memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512);
+    }
+    return 0;
+}
+
+static void dmg_close(BlockDriverState *bs)
+{
+    BDRVDMGState *s = bs->opaque;
+    close(s->fd);
+    if(s->n_chunks>0) {
+	/* the chunk tables were grown with qemu_realloc() in dmg_open() */
+	qemu_free(s->types);
+	qemu_free(s->offsets);
+	qemu_free(s->lengths);
+	qemu_free(s->sectors);
+	qemu_free(s->sectorcounts);
+    }
+    qemu_free(s->compressed_chunk); qemu_free(s->uncompressed_chunk);
+    inflateEnd(&s->zstream); /* tear down the zlib inflate state */
+}
+
+static BlockDriver bdrv_dmg = {
+    .format_name	= "dmg",
+    .instance_size	= sizeof(BDRVDMGState),
+    .bdrv_probe		= dmg_probe,
+    .bdrv_open		= dmg_open,
+    .bdrv_read		= dmg_read,
+    .bdrv_close		= dmg_close,
+};
+
+static void bdrv_dmg_init(void)
+{
+    bdrv_register(&bdrv_dmg);
+}
+
+block_init(bdrv_dmg_init);
diff --git a/block/nbd.c b/block/nbd.c
new file mode 100644
index 0000000..47d4778
--- /dev/null
+++ b/block/nbd.c
@@ -0,0 +1,196 @@
+/*
+ * QEMU Block driver for NBD
+ *
+ * Copyright (C) 2008 Bull S.A.S.
+ *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * Some parts:
+ *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "nbd.h"
+#include "module.h"
+
+#include <sys/types.h>
+#include <unistd.h>
+
+/* Per-image state of the NBD driver, filled in by nbd_open(). */
+typedef struct BDRVNBDState {
+    int sock;           /* connected socket used for every request */
+    off_t size;         /* size reported by nbd_receive_negotiate();
+                           returned as-is by nbd_getlength() */
+    size_t blocksize;   /* block size reported by nbd_receive_negotiate() */
+} BDRVNBDState;
+
+/*
+ * Open an NBD export described by filename.
+ *
+ * Accepted forms:
+ *   nbd:unix:/absolute/path   - connect to a unix domain socket
+ *   nbd:host:port             - connect over TCP
+ *
+ * Returns 0 on success. Returns -EINVAL for BDRV_O_CREAT or a malformed
+ * name (including a port outside 0..65535), and -errno if connecting or
+ * the NBD negotiation fails. The socket is closed again when negotiation
+ * fails, so no descriptor is leaked on the error path.
+ */
+static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
+{
+    BDRVNBDState *s = bs->opaque;
+    const char *host;
+    const char *unixpath;
+    int sock;
+    off_t size;
+    size_t blocksize;
+    int ret;
+
+    if ((flags & BDRV_O_CREAT))
+        return -EINVAL;
+
+    if (!strstart(filename, "nbd:", &host))
+        return -EINVAL;
+
+    if (strstart(host, "unix:", &unixpath)) {
+
+        /* only absolute socket paths are accepted */
+        if (unixpath[0] != '/')
+            return -EINVAL;
+
+        sock = unix_socket_outgoing(unixpath);
+
+    } else {
+        long port;
+        char *p, *r;
+        char hostname[128];
+
+        /* work on a private copy so the ':' separator can be cut out */
+        pstrcpy(hostname, 128, host);
+
+        p = strchr(hostname, ':');
+        if (p == NULL)
+            return -EINVAL;
+
+        *p = '\0';
+        p++;
+
+        port = strtol(p, &r, 0);
+        if (r == p)
+            return -EINVAL;
+        /* reject values that would silently wrap when narrowed to 16 bits */
+        if (port < 0 || port > 65535)
+            return -EINVAL;
+        sock = tcp_socket_outgoing(hostname, (uint16_t)port);
+    }
+
+    if (sock == -1)
+        return -errno;
+
+    ret = nbd_receive_negotiate(sock, &size, &blocksize);
+    if (ret == -1) {
+        /* keep the negotiation error: close() may overwrite errno */
+        int saved_errno = errno;
+
+        close(sock);
+        return -saved_errno;
+    }
+
+    s->sock = sock;
+    s->size = size;
+    s->blocksize = blocksize;
+
+    return 0;
+}
+
+/*
+ * Synchronously read nb_sectors 512-byte sectors starting at sector_num
+ * from the server into buf.
+ *
+ * The request handle is derived from the bs pointer and must be echoed
+ * back in the reply. Returns 0 on success, -errno if sending the request
+ * or receiving the reply fails, the negated server error if the reply
+ * carries one, and -EIO on a handle mismatch or a short payload.
+ */
+static int nbd_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVNBDState *state = bs->opaque;
+    struct nbd_request req;
+    struct nbd_reply rep;
+
+    req.type = NBD_CMD_READ;
+    req.handle = (uint64_t)(intptr_t)bs;
+    req.from = sector_num * 512;
+    req.len = nb_sectors * 512;
+
+    if (nbd_send_request(state->sock, &req) == -1) {
+        return -errno;
+    }
+
+    if (nbd_receive_reply(state->sock, &rep) == -1) {
+        return -errno;
+    }
+
+    if (rep.error != 0) {
+        return -rep.error;
+    }
+
+    if (rep.handle != req.handle) {
+        return -EIO;
+    }
+
+    /* the payload follows the reply header on the same socket */
+    if (nbd_wr_sync(state->sock, buf, req.len, 1) != req.len) {
+        return -EIO;
+    }
+
+    return 0;
+}
+
+/*
+ * Synchronously write nb_sectors 512-byte sectors from buf to the server,
+ * starting at sector_num.
+ *
+ * The request header is sent first, then the payload, and only then is
+ * the reply awaited. Returns 0 on success, -errno if sending the request
+ * or receiving the reply fails, -EIO on a short payload write or a handle
+ * mismatch, and the negated server error if the reply carries one.
+ */
+static int nbd_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVNBDState *state = bs->opaque;
+    struct nbd_request req;
+    struct nbd_reply rep;
+
+    req.type = NBD_CMD_WRITE;
+    req.handle = (uint64_t)(intptr_t)bs;
+    req.from = sector_num * 512;
+    req.len = nb_sectors * 512;
+
+    if (nbd_send_request(state->sock, &req) == -1) {
+        return -errno;
+    }
+
+    if (nbd_wr_sync(state->sock, (uint8_t*)buf, req.len, 0) != req.len) {
+        return -EIO;
+    }
+
+    if (nbd_receive_reply(state->sock, &rep) == -1) {
+        return -errno;
+    }
+
+    if (rep.error != 0) {
+        return -rep.error;
+    }
+
+    if (rep.handle != req.handle) {
+        return -EIO;
+    }
+
+    return 0;
+}
+
+/*
+ * Send a disconnect request to the server (its result is not checked)
+ * and close the socket.
+ */
+static void nbd_close(BlockDriverState *bs)
+{
+    BDRVNBDState *state = bs->opaque;
+    struct nbd_request bye;
+
+    bye.type = NBD_CMD_DISC;
+    bye.handle = (uint64_t)(intptr_t)bs;
+    bye.from = 0;
+    bye.len = 0;
+
+    nbd_send_request(state->sock, &bye);
+    close(state->sock);
+}
+
+/* Return the export size that was stored by nbd_open(). */
+static int64_t nbd_getlength(BlockDriverState *bs)
+{
+    const BDRVNBDState *state = bs->opaque;
+    return state->size;
+}
+
+/*
+ * Driver table for NBD. No probe callback is set; the table carries a
+ * protocol_name ("nbd") in addition to the format name.
+ */
+static BlockDriver bdrv_nbd = {
+    .format_name	= "nbd",
+    .instance_size	= sizeof(BDRVNBDState),
+    .bdrv_open		= nbd_open,
+    .bdrv_read		= nbd_read,
+    .bdrv_write		= nbd_write,
+    .bdrv_close		= nbd_close,
+    .bdrv_getlength	= nbd_getlength,
+    .protocol_name	= "nbd",
+};
+
+/* Register the NBD driver table with the block layer. */
+static void bdrv_nbd_init(void)
+{
+    bdrv_register(&bdrv_nbd);
+}
+
+/* NOTE(review): block_init comes from module.h (not shown); presumably it
+ * arranges for bdrv_nbd_init to run during block-layer start-up. */
+block_init(bdrv_nbd_init);
diff --git a/block/parallels.c b/block/parallels.c
new file mode 100644
index 0000000..0b64a5c
--- /dev/null
+++ b/block/parallels.c
@@ -0,0 +1,181 @@
+/*
+ * Block driver for Parallels disk image format
+ *
+ * Copyright (c) 2007 Alex Beregszaszi
+ *
+ * This code is based on comparing different disk images created by Parallels.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+/* Signature, version and size (in bytes) of the on-disk header that
+ * parallels_probe() and parallels_open() check for. */
+#define HEADER_MAGIC "WithoutFreeSpace"
+#define HEADER_VERSION 2
+#define HEADER_SIZE 64
+
+/* On-disk header layout. All integer fields are stored little-endian and
+ * are converted with le32_to_cpu() before use. The struct is packed so
+ * that it can be read directly from the file. */
+struct parallels_header {
+    char magic[16]; // "WithoutFreeSpace"
+    uint32_t version;           /* must equal HEADER_VERSION */
+    uint32_t heads;             /* not used by this driver */
+    uint32_t cylinders;         /* not used by this driver */
+    uint32_t tracks;            /* divisor used to map a sector to a catalog slot */
+    uint32_t catalog_entries;   /* number of 32-bit catalog entries after the header */
+    uint32_t nb_sectors;        /* becomes bs->total_sectors */
+    char padding[24];
+} __attribute__((packed));
+
+/* Per-image state of the Parallels driver, filled in by parallels_open(). */
+typedef struct BDRVParallelsState {
+    int fd;                     /* descriptor of the image file */
+
+    uint32_t *catalog_bitmap;   /* catalog entries in host byte order; an entry
+                                   of 0 is treated as "not allocated" */
+    int catalog_size;           /* number of entries in catalog_bitmap */
+
+    int tracks;                 /* header 'tracks' value; sectors per catalog entry
+                                   as used by seek_to_sector() */
+} BDRVParallelsState;
+
+/*
+ * Score how likely buf holds the start of a Parallels image.
+ *
+ * Returns 100 when at least HEADER_SIZE bytes are available and both the
+ * magic string and the version match, 0 otherwise. filename is unused.
+ */
+static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct parallels_header *hdr = (const void *)buf;
+
+    if (buf_size < HEADER_SIZE) {
+        return 0;
+    }
+    if (memcmp(hdr->magic, HEADER_MAGIC, 16) != 0) {
+        return 0;
+    }
+    if (le32_to_cpu(hdr->version) != HEADER_VERSION) {
+        return 0;
+    }
+    return 100;
+}
+
+/*
+ * Open a Parallels image and load its catalog into memory.
+ *
+ * The file is opened read-write if possible, read-only otherwise; the
+ * image is always marked read-only because writing is not implemented.
+ *
+ * Header fields come from an untrusted file, so they are validated before
+ * use: 'tracks' must be non-zero and fit in an int (it is later used as a
+ * divisor), and the catalog byte size must fit in an int (it is used as a
+ * read() length).
+ *
+ * Returns 0 on success, -1 on any failure (the descriptor is closed and
+ * the catalog released).
+ */
+static int parallels_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int fd, i;
+    struct parallels_header ph;
+    uint32_t tracks, entries;
+
+    fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE);
+        if (fd < 0)
+            return -1;
+    }
+
+    bs->read_only = 1; // no write support yet
+
+    s->fd = fd;
+
+    if (read(fd, &ph, sizeof(ph)) != sizeof(ph))
+        goto fail;
+
+    if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
+        (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+        goto fail;
+    }
+
+    bs->total_sectors = le32_to_cpu(ph.nb_sectors);
+
+    /* the catalog starts right after the fixed-size header */
+    if (lseek(s->fd, HEADER_SIZE, SEEK_SET) != HEADER_SIZE)
+        goto fail;
+
+    /* 0 would divide by zero in seek_to_sector(); values above 0x7fffffff
+       would turn negative once stored in the int field */
+    tracks = le32_to_cpu(ph.tracks);
+    if (tracks == 0 || tracks > 0x7fffffffU)
+        goto fail;
+    s->tracks = tracks;
+
+    /* catalog_size * 4 must not overflow the int arithmetic below */
+    entries = le32_to_cpu(ph.catalog_entries);
+    if (entries > 0x7fffffffU / 4)
+        goto fail;
+    s->catalog_size = entries;
+
+    s->catalog_bitmap = qemu_malloc(s->catalog_size * 4);
+    if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) !=
+        s->catalog_size * 4)
+        goto fail;
+    for (i = 0; i < s->catalog_size; i++)
+        le32_to_cpus(&s->catalog_bitmap[i]);
+
+    return 0;
+fail:
+    if (s->catalog_bitmap) {
+        qemu_free(s->catalog_bitmap);
+        s->catalog_bitmap = NULL;   /* do not leave a dangling pointer */
+    }
+    close(fd);
+    return -1;
+}
+
+/*
+ * Position the image file descriptor at the start of guest sector
+ * sector_num.
+ *
+ * The catalog slot is sector_num / tracks and the sector's position inside
+ * that slot is sector_num % tracks. A slot outside the catalog, or one
+ * whose entry is 0, means the sector is not allocated.
+ *
+ * Returns 0 when the descriptor has been positioned, -1 when the sector
+ * is unallocated, out of range, or the seek fails.
+ */
+static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int64_t index;
+    uint32_t offset;
+    uint64_t position;
+
+    /* guard the divisions below against a bogus header or request */
+    if (s->tracks <= 0 || sector_num < 0)
+        return -1;
+
+    index = sector_num / s->tracks;
+    offset = sector_num % s->tracks;
+
+    /* valid slots are 0 .. catalog_size - 1; an entry of 0 is unallocated */
+    if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
+        return -1;
+
+    /* compute the byte offset in 64 bits so images of 4 GiB and more work */
+    position = ((uint64_t)s->catalog_bitmap[index] + offset) * 512;
+
+    if (lseek(s->fd, (off_t)position, SEEK_SET) != (off_t)position)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Each sector is handled on its own: when seek_to_sector() succeeds the
+ * sector is read from the file, otherwise it is returned as zeros.
+ *
+ * Returns 0 on success, -1 if a read() does not deliver a full sector.
+ */
+static int parallels_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int remaining;
+
+    for (remaining = nb_sectors; remaining > 0;
+         remaining--, sector_num++, buf += 512) {
+        if (seek_to_sector(bs, sector_num) != 0) {
+            memset(buf, 0, 512);
+            continue;
+        }
+        if (read(s->fd, buf, 512) != 512) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+/* Release the in-memory catalog and close the image file descriptor. */
+static void parallels_close(BlockDriverState *bs)
+{
+    BDRVParallelsState *state = bs->opaque;
+
+    qemu_free(state->catalog_bitmap);
+    close(state->fd);
+}
+
+/*
+ * Driver table for the "parallels" format. Only probe, open, read and
+ * close callbacks are supplied; no write callback is set.
+ */
+static BlockDriver bdrv_parallels = {
+    .format_name	= "parallels",
+    .instance_size	= sizeof(BDRVParallelsState),
+    .bdrv_probe		= parallels_probe,
+    .bdrv_open		= parallels_open,
+    .bdrv_read		= parallels_read,
+    .bdrv_close		= parallels_close,
+};
+
+/* Register the Parallels driver table with the block layer. */
+static void bdrv_parallels_init(void)
+{
+    bdrv_register(&bdrv_parallels);
+}
+
+/* NOTE(review): block_init comes from module.h (not shown); presumably it
+ * arranges for bdrv_parallels_init to run during block-layer start-up. */
+block_init(bdrv_parallels_init);
diff --git a/block/qcow.c b/block/qcow.c
new file mode 100644
index 0000000..55a68a6
--- /dev/null
+++ b/block/qcow.c
@@ -0,0 +1,954 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+/* Header signature ("QFI\xfb") and the only format version handled here. */
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+/* Values of the header crypt_method field. */
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES  1
+
+/* Top bit of an L2 entry: the cluster is stored compressed. Built from an
+ * unsigned constant because shifting a signed 1 into bit 63 is undefined
+ * behaviour; every use combines it with a uint64_t value. */
+#define QCOW_OFLAG_COMPRESSED (1ULL << 63)
+
+/* On-disk image header. Multi-byte fields are stored big-endian and are
+ * byte-swapped in place by qcow_open(). */
+typedef struct QCowHeader {
+    uint32_t magic;                 /* must equal QCOW_MAGIC */
+    uint32_t version;               /* must equal QCOW_VERSION */
+    uint64_t backing_file_offset;   /* 0 means no backing file name is stored */
+    uint32_t backing_file_size;     /* length of the backing file name */
+    uint32_t mtime;
+    uint64_t size; /* in bytes */
+    uint8_t cluster_bits;           /* log2 of the cluster size in bytes */
+    uint8_t l2_bits;                /* log2 of the number of entries per L2 table */
+    uint32_t crypt_method;          /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
+    uint64_t l1_table_offset;       /* file offset of the L1 table */
+} QCowHeader;
+
+/* Number of L2 tables kept in the in-memory cache. */
+#define L2_CACHE_SIZE 16
+
+/* Per-image state of the qcow driver, filled in by qcow_open(). */
+typedef struct BDRVQcowState {
+    BlockDriverState *hd;           /* underlying file opened with bdrv_file_open() */
+    int cluster_bits;               /* copied from the header */
+    int cluster_size;               /* 1 << cluster_bits */
+    int cluster_sectors;            /* 1 << (cluster_bits - 9) */
+    int l2_bits;                    /* copied from the header */
+    int l2_size;                    /* 1 << l2_bits */
+    int l1_size;                    /* number of L1 entries */
+    uint64_t cluster_offset_mask;   /* (1 << (63 - cluster_bits)) - 1: file offset
+                                       part of a compressed L2 entry */
+    uint64_t l1_table_offset;       /* file offset of the L1 table */
+    uint64_t *l1_table;             /* L1 table in host byte order */
+    uint64_t *l2_cache;             /* L2_CACHE_SIZE tables of l2_size entries,
+                                       kept in on-disk (big-endian) byte order */
+    uint64_t l2_cache_offsets[L2_CACHE_SIZE];   /* file offset of each cached table */
+    uint32_t l2_cache_counts[L2_CACHE_SIZE];    /* hit counters used to pick a victim */
+    uint8_t *cluster_cache;         /* last decompressed cluster */
+    uint8_t *cluster_data;          /* scratch buffer of cluster_size bytes */
+    uint64_t cluster_cache_offset;  /* file offset cached in cluster_cache; set to
+                                       -1 by qcow_open() */
+    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+    uint32_t crypt_method_header;   /* crypt method stored in the header */
+    AES_KEY aes_encrypt_key;
+    AES_KEY aes_decrypt_key;
+} BDRVQcowState;
+
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
+
+/*
+ * Score how likely buf holds the start of a qcow (version 1) image.
+ *
+ * Returns 100 when the buffer is large enough for a header and both the
+ * magic and the version match, 0 otherwise. filename is unused.
+ */
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const QCowHeader *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(QCowHeader)) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->magic) != QCOW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->version) != QCOW_VERSION) {
+        return 0;
+    }
+    return 100;
+}
+
+/*
+ * Open a qcow image: read and validate the header, load the L1 table,
+ * allocate the L2 cache and cluster buffers, and read the backing file
+ * name if one is recorded.
+ *
+ * The header comes from an untrusted file, so the fields that drive
+ * shifts and allocation sizes are range-checked first:
+ *   - cluster_bits must be 9..16 (512 byte to 64 KiB clusters),
+ *   - l2_bits must be 6..13 (512 byte to 64 KiB L2 tables),
+ *   - the L1 table size must not overflow.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 for any other failure (all partial state is released).
+ */
+static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVQcowState *s = bs->opaque;
+    int len, i, shift, ret;
+    QCowHeader header;
+    uint64_t l1_entries;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
+        goto fail;
+    be32_to_cpus(&header.magic);
+    be32_to_cpus(&header.version);
+    be64_to_cpus(&header.backing_file_offset);
+    be32_to_cpus(&header.backing_file_size);
+    be32_to_cpus(&header.mtime);
+    be64_to_cpus(&header.size);
+    be32_to_cpus(&header.crypt_method);
+    be64_to_cpus(&header.l1_table_offset);
+
+    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
+        goto fail;
+    if (header.size <= 1 || header.cluster_bits < 9)
+        goto fail;
+    /* larger values would make the shifts below undefined or the buffers
+       absurdly large */
+    if (header.cluster_bits > 16)
+        goto fail;
+    if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3)
+        goto fail;
+    if (header.crypt_method > QCOW_CRYPT_AES)
+        goto fail;
+    s->crypt_method_header = header.crypt_method;
+    if (s->crypt_method_header)
+        bs->encrypted = 1;
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+    s->l2_bits = header.l2_bits;
+    s->l2_size = 1 << s->l2_bits;
+    bs->total_sectors = header.size / 512;
+    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+
+    /* read the level 1 table */
+    shift = s->cluster_bits + s->l2_bits;
+    /* the round-up addition must not wrap around */
+    if (header.size > ~(uint64_t)0 - ((uint64_t)1 << shift))
+        goto fail;
+    l1_entries = (header.size + ((uint64_t)1 << shift) - 1) >> shift;
+    /* the table's byte size must still fit in an int */
+    if (l1_entries > 0x7fffffff / sizeof(uint64_t))
+        goto fail;
+    s->l1_size = l1_entries;
+
+    s->l1_table_offset = header.l1_table_offset;
+    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    if (!s->l1_table)
+        goto fail;
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
+        s->l1_size * sizeof(uint64_t))
+        goto fail;
+    for(i = 0;i < s->l1_size; i++) {
+        be64_to_cpus(&s->l1_table[i]);
+    }
+    /* alloc L2 cache */
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    if (!s->l2_cache)
+        goto fail;
+    s->cluster_cache = qemu_malloc(s->cluster_size);
+    if (!s->cluster_cache)
+        goto fail;
+    s->cluster_data = qemu_malloc(s->cluster_size);
+    if (!s->cluster_data)
+        goto fail;
+    s->cluster_cache_offset = -1;
+
+    /* read the backing file name */
+    if (header.backing_file_offset != 0) {
+        /* clamp on the unsigned header value so a huge size cannot turn
+           into a negative length */
+        if (header.backing_file_size > 1023)
+            len = 1023;
+        else
+            len = header.backing_file_size;
+        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
+            goto fail;
+        bs->backing_file[len] = '\0';
+    }
+    return 0;
+
+ fail:
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    qemu_free(s->cluster_cache);
+    qemu_free(s->cluster_data);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Install the AES key for an encrypted image.
+ *
+ * At most the first 16 bytes of the key string are used; shorter keys are
+ * zero-padded to 128 bits. The current crypt method is switched to the one
+ * recorded in the header before the AES key schedules are built.
+ *
+ * Returns 0 on success, -1 if either key schedule cannot be set up.
+ */
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint8_t keybuf[16];
+    int len, i;
+
+    len = strlen(key);
+    if (len > 16)
+        len = 16;
+
+    /* XXX: we could compress the chars to 7 bits to increase
+       entropy */
+    memset(keybuf, 0, 16);
+    memcpy(keybuf, key, len);
+
+    s->crypt_method = s->crypt_method_header;
+
+    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) {
+        return -1;
+    }
+    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) {
+        return -1;
+    }
+#if 0
+    /* test */
+    {
+        uint8_t in[16];
+        uint8_t out[16];
+        uint8_t tmp[16];
+        for(i=0;i<16;i++)
+            in[i] = i;
+        AES_encrypt(in, tmp, &s->aes_encrypt_key);
+        AES_decrypt(tmp, out, &s->aes_decrypt_key);
+        for(i = 0; i < 16; i++)
+            printf(" %02x", tmp[i]);
+        printf("\n");
+        for(i = 0; i < 16; i++)
+            printf(" %02x", out[i]);
+        printf("\n");
+    }
+#endif
+    return 0;
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors 512-byte sectors
+ * with AES-CBC. Every sector is processed independently with an IV made
+ * of the little-endian sector number followed by eight zero bytes.
+ *
+ * The scheme is compatible with the linux cryptoloop algorithm for
+ * < 4 GB images. NOTE: out_buf == in_buf is supported.
+ */
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+    union {
+        uint64_t ll[2];
+        uint8_t b[16];
+    } iv;
+    int left = nb_sectors;
+
+    while (left-- > 0) {
+        /* fresh IV per sector: AES_cbc_encrypt() updates it in place */
+        iv.ll[0] = cpu_to_le64(sector_num);
+        iv.ll[1] = 0;
+
+        AES_cbc_encrypt(in_buf, out_buf, 512, key, iv.b, enc);
+
+        in_buf += 512;
+        out_buf += 512;
+        sector_num++;
+    }
+}
+
+/*
+ * Look up, and optionally allocate, the cluster that backs guest byte
+ * offset 'offset'.
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * Returns the L2 entry for the cluster (file offset, possibly carrying
+ * QCOW_OFLAG_COMPRESSED and the compressed size), or 0 if the cluster is
+ * not allocated or an error occurred. The return type is unsigned, so 0
+ * is the only failure value; every error path returns it.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+    BDRVQcowState *s = bs->opaque;
+    int min_index, i, j, l1_index, l2_index;
+    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+    uint32_t min_count;
+    int new_l2_table;
+
+    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    /* never index past the L1 table for an out-of-range offset */
+    if (l1_index < 0 || l1_index >= s->l1_size)
+        return 0;
+    l2_offset = s->l1_table[l1_index];
+    new_l2_table = 0;
+    if (!l2_offset) {
+        if (!allocate)
+            return 0;
+        /* allocate a new l2 entry */
+        l2_offset = bdrv_getlength(s->hd);
+        /* round to cluster size */
+        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
+        /* update the L1 entry */
+        s->l1_table[l1_index] = l2_offset;
+        tmp = cpu_to_be64(l2_offset);
+        if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
+                        &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+        new_l2_table = 1;
+    }
+    /* look for the L2 table in the cache */
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (l2_offset == s->l2_cache_offsets[i]) {
+            /* increment the hit count */
+            if (++s->l2_cache_counts[i] == 0xffffffff) {
+                /* halve every counter so they keep their relative order */
+                for(j = 0; j < L2_CACHE_SIZE; j++) {
+                    s->l2_cache_counts[j] >>= 1;
+                }
+            }
+            l2_table = s->l2_cache + (i << s->l2_bits);
+            goto found;
+        }
+    }
+    /* not found: load a new entry in the least used one */
+    min_index = 0;
+    min_count = 0xffffffff;
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (s->l2_cache_counts[i] < min_count) {
+            min_count = s->l2_cache_counts[i];
+            min_index = i;
+        }
+    }
+    l2_table = s->l2_cache + (min_index << s->l2_bits);
+    if (new_l2_table) {
+        /* a fresh table starts out empty and is written to the file */
+        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+        if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return 0;
+    } else {
+        if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return 0;
+    }
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+ found:
+    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    if (!cluster_offset ||
+        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
+        if (!allocate)
+            return 0;
+        /* allocate a new cluster */
+        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+            (n_end - n_start) < s->cluster_sectors) {
+            /* if the cluster is already compressed, we must
+               decompress it in the case it is not completely
+               overwritten */
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return 0;
+            cluster_offset = bdrv_getlength(s->hd);
+            cluster_offset = (cluster_offset + s->cluster_size - 1) &
+                ~(s->cluster_size - 1);
+            /* write the cluster content */
+            if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) !=
+                s->cluster_size)
+                return 0;
+        } else {
+            cluster_offset = bdrv_getlength(s->hd);
+            if (allocate == 1) {
+                /* round to cluster size */
+                cluster_offset = (cluster_offset + s->cluster_size - 1) &
+                    ~(s->cluster_size - 1);
+                bdrv_truncate(s->hd, cluster_offset + s->cluster_size);
+                /* if encrypted, we must initialize the cluster
+                   content which won't be written */
+                if (s->crypt_method &&
+                    (n_end - n_start) < s->cluster_sectors) {
+                    uint64_t start_sect;
+                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+                    /* second sector of the scratch buffer holds the zero
+                       plaintext, the first receives the ciphertext */
+                    memset(s->cluster_data + 512, 0x00, 512);
+                    for(i = 0; i < s->cluster_sectors; i++) {
+                        if (i < n_start || i >= n_end) {
+                            encrypt_sectors(s, start_sect + i,
+                                            s->cluster_data,
+                                            s->cluster_data + 512, 1, 1,
+                                            &s->aes_encrypt_key);
+                            if (bdrv_pwrite(s->hd, cluster_offset + i * 512,
+                                            s->cluster_data, 512) != 512)
+                                return 0;
+                        }
+                    }
+                }
+            } else if (allocate == 2) {
+                /* compressed entry: flag, size, and unaligned file offset */
+                cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                    (uint64_t)compressed_size << (63 - s->cluster_bits);
+            }
+        }
+        /* update L2 table */
+        tmp = cpu_to_be64(cluster_offset);
+        l2_table[l2_index] = tmp;
+        if (bdrv_pwrite(s->hd,
+                        l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+    }
+    return cluster_offset;
+}
+
+/*
+ * Report whether the cluster holding sector_num is allocated in this
+ * image.
+ *
+ * *pnum receives the number of sectors, starting at sector_num and capped
+ * at nb_sectors, that lie in the same cluster and therefore share the
+ * answer. Returns 1 if allocated, 0 otherwise.
+ */
+static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t entry = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+    int first = sector_num & (s->cluster_sectors - 1);
+    int span = s->cluster_sectors - first;
+
+    *pnum = (span > nb_sectors) ? nb_sectors : span;
+
+    return entry != 0;
+}
+
+/*
+ * Inflate a raw deflate stream (window bits -12) from buf into out_buf.
+ *
+ * Succeeds only when inflate() ends with Z_STREAM_END or Z_BUF_ERROR and
+ * exactly out_buf_size bytes were produced.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+    z_stream stream;
+    int status, produced;
+
+    memset(&stream, 0, sizeof(stream));
+    stream.next_in = (uint8_t *)buf;
+    stream.avail_in = buf_size;
+    stream.next_out = out_buf;
+    stream.avail_out = out_buf_size;
+
+    if (inflateInit2(&stream, -12) != Z_OK) {
+        return -1;
+    }
+
+    status = inflate(&stream, Z_FINISH);
+    produced = stream.next_out - out_buf;
+    inflateEnd(&stream);
+
+    if (status != Z_STREAM_END && status != Z_BUF_ERROR) {
+        return -1;
+    }
+    if (produced != out_buf_size) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Make sure s->cluster_cache holds the decompressed content of the
+ * compressed cluster described by L2 entry cluster_offset.
+ *
+ * The entry is split into a file offset (low bits, via
+ * cluster_offset_mask) and a compressed size (the bits above it). Nothing
+ * is done when the cache already holds that file offset.
+ *
+ * Returns 0 on success, -1 on a short read or a decompression failure.
+ */
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
+{
+    uint64_t file_off = cluster_offset & s->cluster_offset_mask;
+    int nbytes;
+
+    if (s->cluster_cache_offset == file_off) {
+        return 0;
+    }
+
+    nbytes = cluster_offset >> (63 - s->cluster_bits);
+    nbytes &= (s->cluster_size - 1);
+
+    if (bdrv_pread(s->hd, file_off, s->cluster_data, nbytes) != nbytes) {
+        return -1;
+    }
+    if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                          s->cluster_data, nbytes) < 0) {
+        return -1;
+    }
+
+    s->cluster_cache_offset = file_off;
+    return 0;
+}
+
+/*
+ * Synchronous read path, compiled out with #if 0. It walks the request
+ * cluster by cluster: unallocated clusters come from the backing image
+ * (or are zero-filled), compressed clusters are served from the
+ * decompression cache, and plain clusters are read from the file and
+ * decrypted when a crypt method is active.
+ */
+#if 0
+
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, index_in_cluster, n;
+    uint64_t cluster_offset;
+
+    while (nb_sectors > 0) {
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+        if (!cluster_offset) {
+            if (bs->backing_hd) {
+                /* read from the base image */
+                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+                if (ret < 0)
+                    return -1;
+            } else {
+                memset(buf, 0, 512 * n);
+            }
+        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return -1;
+            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
+        } else {
+            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
+            if (ret != n * 512)
+                return -1;
+            if (s->crypt_method) {
+                encrypt_sectors(s, sector_num, buf, buf, n, 0,
+                                &s->aes_decrypt_key);
+            }
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    return 0;
+}
+#endif
+
+/*
+ * State of one in-flight qcow AIO request. 'common' must stay the first
+ * member: qcow_aio_cancel() casts a BlockDriverAIOCB pointer back to a
+ * QCowAIOCB pointer.
+ */
+typedef struct QCowAIOCB {
+    BlockDriverAIOCB common;
+    int64_t sector_num;         /* next guest sector to process */
+    QEMUIOVector *qiov;         /* caller's I/O vector */
+    uint8_t *buf;               /* current position in the linear data buffer */
+    void *orig_buf;             /* bounce buffer; only set when qiov has more
+                                   than one element */
+    int nb_sectors;             /* sectors still to process */
+    int n;                      /* sectors covered by the current step */
+    uint64_t cluster_offset;    /* L2 entry of the cluster being read */
+    uint8_t *cluster_data;      /* staging buffer for encrypted writes */
+    struct iovec hd_iov;        /* single-element vector for the sub-request */
+    QEMUIOVector hd_qiov;       /* wraps hd_iov */
+    BlockDriverAIOCB *hd_aiocb; /* outstanding sub-request, or NULL */
+} QCowAIOCB;
+
+/*
+ * Cancel a qcow AIO request: cancel the outstanding sub-request, if any,
+ * then release the control block.
+ */
+static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QCowAIOCB *req = (QCowAIOCB *)blockacb;
+
+    if (req->hd_aiocb != NULL) {
+        bdrv_aio_cancel(req->hd_aiocb);
+    }
+    qemu_aio_release(req);
+}
+
+/* Pool descriptor handed to qemu_aio_get(): control blocks are
+ * QCowAIOCB-sized and are cancelled through qcow_aio_cancel(). */
+static AIOPool qcow_aio_pool = {
+    .aiocb_size         = sizeof(QCowAIOCB),
+    .cancel             = qcow_aio_cancel,
+};
+
+/*
+ * Obtain and initialise a control block for a new AIO request.
+ *
+ * A single-element qiov is used in place. A multi-element qiov gets an
+ * aligned bounce buffer of qiov->size bytes, which is pre-filled from the
+ * vector when is_write is set.
+ *
+ * Returns the control block, or NULL if none could be obtained.
+ */
+static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+{
+    QCowAIOCB *req = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque);
+
+    if (req == NULL) {
+        return NULL;
+    }
+
+    req->hd_aiocb = NULL;
+    req->sector_num = sector_num;
+    req->qiov = qiov;
+
+    if (qiov->niov <= 1) {
+        req->buf = (uint8_t *)qiov->iov->iov_base;
+    } else {
+        req->buf = req->orig_buf = qemu_blockalign(bs, qiov->size);
+        if (is_write) {
+            qemu_iovec_to_buffer(qiov, req->buf);
+        }
+    }
+
+    req->nb_sectors = nb_sectors;
+    req->n = 0;
+    req->cluster_offset = 0;
+
+    return req;
+}
+
+/*
+ * Completion callback and state machine for an AIO read.
+ *
+ * It is entered once by qcow_aio_readv() (with ret == 0 and acb->n == 0)
+ * and then again each time a sub-request completes. Every pass first
+ * finishes the step that just completed (decrypting in place when needed),
+ * advances the request, and then starts the next step:
+ *   - unallocated cluster: read from the backing image, or zero-fill,
+ *   - compressed cluster: copy from the decompression cache,
+ *   - plain cluster: submit a read on the underlying file.
+ * Steps that need no I/O loop straight back via 'redo'.
+ *
+ * On completion or failure the data is copied back to the caller's vector
+ * if a bounce buffer was used, the caller's callback is invoked with the
+ * result, and the control block is released. Every failure path sets a
+ * negative result so that errors are never reported as success.
+ */
+static void qcow_aio_read_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0)
+        goto done;
+
+ redo:
+    /* post process the read buffer */
+    if (!acb->cluster_offset) {
+        /* nothing to do */
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* nothing to do */
+    } else {
+        if (s->crypt_method) {
+            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
+                            acb->n, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* prepare next AIO request */
+    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
+                                             0, 0, 0, 0);
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+
+    if (!acb->cluster_offset) {
+        if (bs->backing_hd) {
+            /* read from the base image */
+            acb->hd_iov.iov_base = (void *)acb->buf;
+            acb->hd_iov.iov_len = acb->n * 512;
+            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+            acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+            if (acb->hd_aiocb == NULL) {
+                /* submission failed: report it instead of the stale ret */
+                ret = -EIO;
+                goto done;
+            }
+        } else {
+            /* Note: in this case, no need to wait */
+            memset(acb->buf, 0, 512 * acb->n);
+            goto redo;
+        }
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* add AIO support for compressed blocks ? */
+        if (decompress_cluster(s, acb->cluster_offset) < 0) {
+            ret = -EIO;
+            goto done;
+        }
+        memcpy(acb->buf,
+               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
+        goto redo;
+    } else {
+        /* plain clusters must start on a sector boundary */
+        if ((acb->cluster_offset & 511) != 0) {
+            ret = -EIO;
+            goto done;
+        }
+        acb->hd_iov.iov_base = (void *)acb->buf;
+        acb->hd_iov.iov_len = acb->n * 512;
+        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        acb->hd_aiocb = bdrv_aio_readv(s->hd,
+                            (acb->cluster_offset >> 9) + index_in_cluster,
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+        if (acb->hd_aiocb == NULL) {
+            /* submission failed: report it instead of the stale ret */
+            ret = -EIO;
+            goto done;
+        }
+    }
+
+    return;
+
+done:
+    if (acb->qiov->niov > 1) {
+        /* hand the bounce buffer content back to the caller's vector */
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous vectored read of nb_sectors sectors at
+ * sector_num. The request is set up and then driven by calling the read
+ * callback once with a zero result.
+ *
+ * Returns the request's control block, or NULL if it could not be set up.
+ */
+static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    QCowAIOCB *req = qcow_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                    cb, opaque, 0);
+
+    if (req == NULL) {
+        return NULL;
+    }
+
+    /* kick off the first step of the read state machine */
+    qcow_aio_read_cb(req, 0);
+    return &req->common;
+}
+
+/*
+ * Completion callback that drives an asynchronous write one piece at a time.
+ *
+ * qcow_aio_writev() calls it once with ret == 0 to start the request; it is
+ * then passed to bdrv_aio_writev() as the completion function of every
+ * piece submitted to s->hd.  Each call accounts for the piece that has just
+ * finished (acb->n sectors) and submits the next one, which never crosses a
+ * cluster boundary.  When no sectors are left, or on error, the caller's
+ * completion function is invoked with the final status and the request is
+ * released.
+ *
+ * opaque: the QCowAIOCB describing the request
+ * ret:    status of the piece that has just completed, negative on error
+ *
+ * NOTE(review): on the very first call acb->n is presumably 0 so that no
+ * data is skipped - it is initialised outside this function.
+ */
+static void qcow_aio_write_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+    uint64_t cluster_offset;
+    const uint8_t *src_buf;
+
+    acb->hd_aiocb = NULL;
+
+    if (ret < 0)
+        goto done;
+
+    /* account for the piece that has just been written */
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* the next piece ends at the end of the current cluster at the latest */
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
+                                        index_in_cluster,
+                                        index_in_cluster + acb->n);
+    if (!cluster_offset || (cluster_offset & 511) != 0) {
+        ret = -EIO;
+        goto done;
+    }
+    if (s->crypt_method) {
+        /* encrypted images are written from a per-request scratch buffer so
+           that the caller's data is left untouched */
+        if (!acb->cluster_data) {
+            acb->cluster_data = qemu_mallocz(s->cluster_size);
+            if (!acb->cluster_data) {
+                ret = -ENOMEM;
+                goto done;
+            }
+        }
+        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
+                        acb->n, 1, &s->aes_encrypt_key);
+        src_buf = acb->cluster_data;
+    } else {
+        src_buf = acb->buf;
+    }
+
+    acb->hd_iov.iov_base = (void *)src_buf;
+    acb->hd_iov.iov_len = acb->n * 512;
+    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+    acb->hd_aiocb = bdrv_aio_writev(s->hd,
+                                    (cluster_offset >> 9) + index_in_cluster,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_write_cb, acb);
+    if (acb->hd_aiocb == NULL) {
+        /* ret still holds the non-negative status of the previous piece:
+           without an explicit error the caller would be told that a write
+           which was never submitted has succeeded */
+        ret = -EIO;
+        goto done;
+    }
+    return;
+
+done:
+    /* multi-element vectors were written from a linear bounce buffer */
+    if (acb->qiov->niov > 1)
+        qemu_vfree(acb->orig_buf);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous vectored write of nb_sectors sectors at sector_num.
+ *
+ * Returns the AIO control block of the request, or NULL when
+ * qcow_aio_setup() could not provide one.
+ */
+static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *acb;
+
+    s->cluster_cache_offset = -1; /* disable compressed cache */
+
+    /* The last argument must be non-zero for a write.  For a vector with
+       more than one element the request works on a linear bounce buffer
+       (acb->orig_buf, released by qcow_aio_write_cb()), and nothing after
+       this point copies the caller's data into it.
+       NOTE(review): qcow_aio_setup() is assumed to take an "is_write" flag
+       as its final parameter and to perform that copy when it is set -
+       confirm against its definition earlier in this file. */
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+    if (!acb)
+        return NULL;
+
+    /* run the first step of the write through the completion callback */
+    qcow_aio_write_cb(acb, 0);
+    return &acb->common;
+}
+
+/* Release what the driver instance holds: the in-memory L1 table, the L2
+   cache, both cluster buffers and the underlying image file. */
+static void qcow_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    qemu_free(state->l1_table);
+    qemu_free(state->l2_cache);
+    qemu_free(state->cluster_cache);
+    qemu_free(state->cluster_data);
+
+    bdrv_delete(state->hd);
+}
+
+/*
+ * Create a new, empty qcow image file.
+ *
+ * filename: path of the image; an existing file is truncated
+ * options:  creation options, terminated by an entry whose name is NULL.
+ *           BLOCK_OPT_SIZE, BLOCK_OPT_BACKING_FILE and BLOCK_OPT_ENCRYPT
+ *           are honoured, any other option is ignored.
+ *
+ * The file consists of the header, the backing file name (if any) and a
+ * zero-filled L1 table that starts at the next 8-byte boundary.
+ *
+ * Returns 0 on success and -1 if the file cannot be opened or if any part
+ * of it cannot be written completely.
+ */
+static int qcow_create(const char *filename, QEMUOptionParameter *options)
+{
+    int fd, header_size, backing_filename_len, l1_size, i, shift;
+    QCowHeader header;
+    uint64_t tmp;
+    int64_t total_size = 0;    /* in 512-byte sectors */
+    const char *backing_file = NULL;
+    int flags = 0;
+    int ret = -1;
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / 512;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_file = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
+            flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
+        }
+        options++;
+    }
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        if (strcmp(backing_file, "fat:")) {
+            header.backing_file_offset = cpu_to_be64(header_size);
+            backing_filename_len = strlen(backing_file);
+            header.backing_file_size = cpu_to_be32(backing_filename_len);
+            header_size += backing_filename_len;
+        } else {
+            /* special backing file for vvfat */
+            backing_file = NULL;
+        }
+        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                    unmodified sectors */
+        header.l2_bits = 12; /* 32 KB L2 tables */
+    } else {
+        header.cluster_bits = 12; /* 4 KB clusters */
+        header.l2_bits = 9; /* 4 KB L2 tables */
+    }
+    header_size = (header_size + 7) & ~7;
+    shift = header.cluster_bits + header.l2_bits;
+    /* one L1 entry per (1 << shift) bytes of virtual disk, rounded up */
+    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+
+    header.l1_table_offset = cpu_to_be64(header_size);
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+
+    /* write all the data; a failed or short write (e.g. disk full) has to
+       be reported instead of silently leaving a truncated image behind */
+    if (write(fd, &header, sizeof(header)) != sizeof(header))
+        goto out;
+    if (backing_file &&
+        write(fd, backing_file, backing_filename_len) != backing_filename_len)
+        goto out;
+    if (lseek(fd, header_size, SEEK_SET) < 0)
+        goto out;
+    tmp = 0;
+    for(i = 0;i < l1_size; i++) {
+        if (write(fd, &tmp, sizeof(tmp)) != sizeof(tmp))
+            goto out;
+    }
+    ret = 0;
+
+out:
+    close(fd);
+    return ret;
+}
+
+/*
+ * Drop all data from the image: zero the L1 table in memory and on disk,
+ * cut the image file right behind the L1 table and forget every cached L2
+ * table.
+ *
+ * Returns 0 on success, -1 if the L1 table cannot be written, or the
+ * negative result of bdrv_truncate().
+ */
+static int qcow_make_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint32_t l1_bytes = s->l1_size * sizeof(uint64_t);
+    int rc;
+
+    memset(s->l1_table, 0, l1_bytes);
+    rc = bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_bytes);
+    if (rc < 0) {
+        return -1;
+    }
+
+    rc = bdrv_truncate(s->hd, s->l1_table_offset + l1_bytes);
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* reset the L2 cache: contents, offset tags and hit counters */
+    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+    return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Write exactly one cluster of data, compressed if that makes it smaller.
+ *
+ * sector_num: first sector of the cluster in the virtual disk
+ * buf:        the data, s->cluster_size bytes of which are read
+ * nb_sectors: must be equal to s->cluster_sectors
+ *
+ * If the data does not shrink, the cluster is written uncompressed through
+ * bdrv_write() instead.
+ *
+ * Returns 0 on success, -EINVAL if nb_sectors is not one cluster and -1 on
+ * any other failure.
+ */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    z_stream strm;
+    int ret, out_len;
+    int result = -1;
+    uint8_t *out_buf;
+    uint64_t cluster_offset;
+
+    if (nb_sectors != s->cluster_sectors)
+        return -EINVAL;
+
+    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+    if (!out_buf)
+        return -1;
+
+    /* default compression level, small window (2^12 bytes), raw deflate
+       stream without zlib header (negative windowBits) */
+    memset(&strm, 0, sizeof(strm));
+    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                       Z_DEFLATED, -12,
+                       9, Z_DEFAULT_STRATEGY);
+    if (ret != 0)
+        goto out;
+
+    strm.avail_in = s->cluster_size;
+    strm.next_in = (uint8_t *)buf;
+    /* only one cluster of output is offered: anything that does not fit
+       is not worth storing compressed */
+    strm.avail_out = s->cluster_size;
+    strm.next_out = out_buf;
+
+    ret = deflate(&strm, Z_FINISH);
+    if (ret != Z_STREAM_END && ret != Z_OK) {
+        deflateEnd(&strm);
+        goto out;
+    }
+    out_len = strm.next_out - out_buf;
+
+    deflateEnd(&strm);
+
+    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+        /* could not compress: write normal cluster */
+        if (bdrv_write(bs, sector_num, buf, s->cluster_sectors) < 0)
+            goto out;
+    } else {
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+                                            out_len, 0, 0);
+        /* 0 means no cluster could be provided; writing there would
+           overwrite the image header */
+        if (cluster_offset == 0)
+            goto out;
+        cluster_offset &= s->cluster_offset_mask;
+        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len)
+            goto out;
+    }
+
+    result = 0;
+
+out:
+    qemu_free(out_buf);
+    return result;
+}
+
+/* Flushing the qcow image means flushing the image file underneath it. */
+static void qcow_flush(BlockDriverState *bs)
+{
+    bdrv_flush(((BDRVQcowState *)bs->opaque)->hd);
+}
+
+/* Report driver-specific information: only the cluster size is filled in.
+   Always returns 0. */
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    bdi->cluster_size = state->cluster_size;
+
+    return 0;
+}
+
+
+/* Options understood by qcow_create(); the list ends with a NULL entry. */
+static QEMUOptionParameter qcow_create_options[] = {
+    { .name = BLOCK_OPT_SIZE, .type = OPT_SIZE,
+      .help = "Virtual disk size" },
+    { .name = BLOCK_OPT_BACKING_FILE, .type = OPT_STRING,
+      .help = "File name of a base image" },
+    { .name = BLOCK_OPT_ENCRYPT, .type = OPT_FLAG,
+      .help = "Encrypt the image" },
+    { NULL }
+};
+
+/* Description of the "qcow" format driver: per-image state size and the
+   entry points implemented in this file. */
+static BlockDriver bdrv_qcow = {
+    .format_name           = "qcow",
+    .instance_size         = sizeof(BDRVQcowState),
+
+    /* image life cycle */
+    .bdrv_probe            = qcow_probe,
+    .bdrv_open             = qcow_open,
+    .bdrv_close            = qcow_close,
+    .bdrv_create           = qcow_create,
+    .bdrv_flush            = qcow_flush,
+    .bdrv_is_allocated     = qcow_is_allocated,
+    .bdrv_set_key          = qcow_set_key,
+    .bdrv_make_empty       = qcow_make_empty,
+
+    /* data path */
+    .bdrv_aio_readv        = qcow_aio_readv,
+    .bdrv_aio_writev       = qcow_aio_writev,
+    .bdrv_write_compressed = qcow_write_compressed,
+    .bdrv_get_info         = qcow_get_info,
+
+    .create_options        = qcow_create_options,
+};
+
+static void bdrv_qcow_init(void)
+{
+    bdrv_register(&bdrv_qcow); /* hand the driver description above to the
+                                  block layer */
+}
+
+block_init(bdrv_qcow_init); /* NOTE(review): presumably arranges for
+                               bdrv_qcow_init() to run at start-up - confirm
+                               against the definition of block_init */
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
new file mode 100644
index 0000000..d349655
--- /dev/null
+++ b/block/qcow2-cluster.c
@@ -0,0 +1,800 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <zlib.h>
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/qcow2.h"
+
+/*
+ * Grow the L1 table so that it holds at least min_size entries.
+ *
+ * A larger copy of the table is written to newly allocated clusters, then
+ * the l1_size field of the image header and the table offset stored right
+ * after it are pointed at the copy, and finally the old table is released.
+ *
+ * Returns 0 on success (also when the table is already large enough) and
+ * -EIO if the new table or the header cannot be written; in that case the
+ * current table stays in use.
+ */
+int qcow2_grow_l1_table(BlockDriverState *bs, int min_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int new_l1_size, new_l1_size2, ret, i;
+    uint64_t *new_l1_table;
+    uint64_t new_l1_table_offset;
+    uint8_t data[12];
+
+    new_l1_size = s->l1_size;
+    if (min_size <= new_l1_size)
+        return 0;
+    /* (0 * 3 + 1) / 2 is 0 again: an empty table would never grow and the
+       loop below would not terminate */
+    if (new_l1_size == 0)
+        new_l1_size = 1;
+    while (min_size > new_l1_size) {
+        new_l1_size = (new_l1_size * 3 + 1) / 2;
+    }
+#ifdef DEBUG_ALLOC2
+    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+#endif
+
+    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+    /* write_l1_entry() reads the table in groups of 512 bytes, so the
+       buffer is padded with zeros up to the next multiple of 512 */
+    new_l1_table = qemu_mallocz((new_l1_size2 + 511) & ~511);
+    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
+
+    /* write new table (align to cluster) */
+    new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2);
+
+    /* the table is big endian on disk: swap for the write, then swap back
+       (the entries past the old size are all zero) */
+    for(i = 0; i < s->l1_size; i++)
+        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
+    if (ret != new_l1_size2)
+        goto fail;
+    for(i = 0; i < s->l1_size; i++)
+        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+
+    /* set new table */
+    cpu_to_be32w((uint32_t*)data, new_l1_size);
+    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
+                sizeof(data)) != sizeof(data))
+        goto fail;
+    qemu_free(s->l1_table);
+    qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+    s->l1_table_offset = new_l1_table_offset;
+    s->l1_table = new_l1_table;
+    s->l1_size = new_l1_size;
+    return 0;
+ fail:
+    /* s->l1_table is still the table in use and must survive a failed
+       attempt; only what this call allocated is given back */
+    qemu_free(new_l1_table);
+    qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2);
+    return -EIO;
+}
+
+/* Empty the L2 table cache: table contents, offset tags and hit counters
+   are all set to zero. */
+void qcow2_l2_cache_reset(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    memset(state->l2_cache, 0,
+           state->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(state->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(state->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+}
+
+/* Choose the L2 cache slot to reuse: the one with the smallest hit count,
+   the lowest index winning ties.  Returns the slot index. */
+static inline int l2_cache_new_entry(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint32_t lowest = 0xffffffff;
+    int victim = 0;
+    int slot;
+
+    for (slot = 0; slot < L2_CACHE_SIZE; slot++) {
+        uint32_t hits = s->l2_cache_counts[slot];
+
+        if (hits < lowest) {
+            lowest = hits;
+            victim = slot;
+        }
+    }
+
+    return victim;
+}
+
+/*
+ * seek_l2_table
+ *
+ * Look for the L2 table stored at l2_offset in the L2 cache.
+ *
+ * On a hit the hit counter of the slot is incremented; when it reaches
+ * 0xffffffff all counters are halved so that they keep their relative
+ * order.  Returns a pointer to the cached table, or NULL on a miss.
+ *
+ */
+
+static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
+{
+    int slot, k;
+
+    for (slot = 0; slot < L2_CACHE_SIZE; slot++) {
+        if (s->l2_cache_offsets[slot] != l2_offset) {
+            continue;
+        }
+
+        /* increment the hit count */
+        s->l2_cache_counts[slot]++;
+        if (s->l2_cache_counts[slot] == 0xffffffff) {
+            for (k = 0; k < L2_CACHE_SIZE; k++) {
+                s->l2_cache_counts[k] >>= 1;
+            }
+        }
+        return s->l2_cache + (slot << s->l2_bits);
+    }
+
+    return NULL;
+}
+
+/*
+ * l2_load
+ *
+ * Loads a L2 table into memory. If the table is in the cache, the cache
+ * is used; otherwise the L2 table is loaded from the image file into the
+ * least used cache slot.
+ *
+ * Returns a pointer to the L2 table on success, or NULL if the read from
+ * the image file failed. After a failed read the slot that was chosen is
+ * left empty.
+ */
+
+static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    int min_index;
+    uint64_t *l2_table;
+
+    /* seek if the table for the given offset is in the cache */
+
+    l2_table = seek_l2_table(s, l2_offset);
+    if (l2_table != NULL)
+        return l2_table;
+
+    /* not found: load a new entry in the least used one */
+
+    min_index = l2_cache_new_entry(bs);
+    l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+    /* The slot's buffer is about to be overwritten. Untag it first, so that
+       a failed (possibly partial) read cannot leave the offset of the
+       previous table pointing at clobbered data. */
+    s->l2_cache_offsets[min_index] = 0;
+    s->l2_cache_counts[min_index] = 0;
+
+    if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+        s->l2_size * sizeof(uint64_t))
+        return NULL;
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+
+    return l2_table;
+}
+
+/*
+ * Writes one sector of the L1 table to the disk (can't update single entries
+ * and we really don't want bdrv_pwrite to perform a read-modify-write).
+ *
+ * l1_index selects the entry that changed; the 64 entries of the sector
+ * that contains it are converted to big endian and written together.
+ * Positions past the end of the table are written as zero.
+ *
+ * Returns 0 on success and -1 if the write failed.
+ */
+#define L1_ENTRIES_PER_SECTOR (512 / 8)
+static int write_l1_entry(BDRVQcowState *s, int l1_index)
+{
+    uint64_t buf[L1_ENTRIES_PER_SECTOR];
+    int l1_start_index;
+    int i;
+
+    l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1);
+    for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) {
+        int entry = l1_start_index + i;
+
+        /* the table need not be a whole number of sectors long: never read
+           beyond its last entry */
+        if (entry < s->l1_size) {
+            buf[i] = cpu_to_be64(s->l1_table[entry]);
+        } else {
+            buf[i] = 0;
+        }
+    }
+
+    if (bdrv_pwrite(s->hd, s->l1_table_offset + 8 * l1_start_index,
+        buf, sizeof(buf)) != sizeof(buf))
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * l2_allocate
+ *
+ * Allocate a new l2 table in the file. If l1_index points to an already
+ * used entry in the L1 table (i.e. we are doing a copy on write for the L2
+ * table) copy the contents of the old L2 table into the newly allocated one.
+ * Otherwise the new table is initialized with zeros.
+ *
+ * The L1 entry is updated in memory and on disk before the new table is
+ * written. Returns a pointer to the new table inside the L2 cache, or NULL
+ * if any read or write failed; the cache slot that was chosen is left empty
+ * in that case.
+ *
+ */
+
+static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int min_index;
+    uint64_t old_l2_offset;
+    uint64_t *l2_table, l2_offset;
+
+    old_l2_offset = s->l1_table[l1_index];
+
+    /* allocate a new l2 entry */
+
+    l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+
+    /* update the L1 entry */
+
+    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+    if (write_l1_entry(s, l1_index) < 0) {
+        return NULL;
+    }
+
+    /* allocate a new entry in the l2 cache */
+
+    min_index = l2_cache_new_entry(bs);
+    l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+    /* The slot's buffer is about to be overwritten. Untag it first, so that
+       a failure below cannot leave the offset of the previous table
+       pointing at clobbered data. */
+    s->l2_cache_offsets[min_index] = 0;
+    s->l2_cache_counts[min_index] = 0;
+
+    if (old_l2_offset == 0) {
+        /* if there was no old l2 table, clear the new table */
+        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+    } else {
+        /* if there was an old l2 table, read it from the disk */
+        if (bdrv_pread(s->hd, old_l2_offset,
+                       l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return NULL;
+    }
+    /* write the l2 table to the file */
+    if (bdrv_pwrite(s->hd, l2_offset,
+                    l2_table, s->l2_size * sizeof(uint64_t)) !=
+        s->l2_size * sizeof(uint64_t))
+        return NULL;
+
+    /* update the l2 cache entry */
+
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+
+    return l2_table;
+}
+
+/*
+ * Count how many of the nb_clusters L2 entries starting at index start
+ * continue the physically contiguous run that begins at l2_table[0], i.e.
+ * how many entries i satisfy entry(i) == entry(0) + i * cluster_size.
+ * The bits set in mask are ignored in every entry.
+ *
+ * Returns that count, or 0 if entry 0 is zero once the mask is applied.
+ */
+static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+        uint64_t *l2_table, uint64_t start, uint64_t mask)
+{
+    int i;
+    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;
+
+    if (!offset)
+        return 0;
+
+    /* the product is formed in 64 bits: as plain int arithmetic it could
+       overflow before being added to the 64-bit offset */
+    for (i = start; i < start + nb_clusters; i++)
+        if (offset + (uint64_t)i * cluster_size !=
+            (be64_to_cpu(l2_table[i]) & ~mask))
+            break;
+
+    return (i - start);
+}
+
+/* Count the leading L2 entries that are zero (unallocated), looking at no
+   more than nb_clusters entries of l2_table. */
+static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+{
+    int free_run;
+
+    for (free_run = 0; nb_clusters != 0; nb_clusters--, free_run++) {
+        if (l2_table[free_run] != 0) {
+            break;
+        }
+    }
+
+    return free_run;
+}
+
+/* Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors sectors of 512 bytes
+   from in_buf to out_buf with AES in CBC mode. Each sector is processed on
+   its own; its IV is the sector number in little endian followed by
+   zeros. The crypt function is compatible with the linux cryptoloop
+   algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+   supported */
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                           uint8_t *out_buf, const uint8_t *in_buf,
+                           int nb_sectors, int enc,
+                           const AES_KEY *key)
+{
+    union {
+        uint64_t ll[2];
+        uint8_t b[16];
+    } iv;
+    int done;
+
+    for (done = 0; done < nb_sectors; done++) {
+        iv.ll[0] = cpu_to_le64(sector_num + done);
+        iv.ll[1] = 0;
+        AES_cbc_encrypt(in_buf + done * 512, out_buf + done * 512, 512, key,
+                        iv.b, enc);
+    }
+}
+
+
+/*
+ * Synchronously read nb_sectors sectors starting at sector_num into buf.
+ *
+ * The request is served in runs of sectors that share the same state:
+ * unallocated (taken from the backing image, or zero-filled when there is
+ * none), compressed (copied from the decompressed cluster cache) or plain
+ * (read from the image file and decrypted if the image is encrypted).
+ *
+ * Returns 0 on success and -1 on any read or decompression failure.
+ */
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t cluster_offset;
+    int run, in_cluster, from_backing;
+
+    for (; nb_sectors > 0;
+         nb_sectors -= run, sector_num += run, buf += run * 512) {
+        run = nb_sectors;
+        cluster_offset = qcow2_get_cluster_offset(bs, sector_num << 9, &run);
+        in_cluster = sector_num & (s->cluster_sectors - 1);
+
+        if (!cluster_offset) {
+            if (!bs->backing_hd) {
+                memset(buf, 0, 512 * run);
+            } else {
+                /* read from the base image */
+                from_backing = qcow2_backing_read1(bs->backing_hd,
+                                                   sector_num, buf, run);
+                if (from_backing > 0 &&
+                    bdrv_read(bs->backing_hd, sector_num, buf,
+                              from_backing) < 0) {
+                    return -1;
+                }
+            }
+        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            if (qcow2_decompress_cluster(s, cluster_offset) < 0) {
+                return -1;
+            }
+            memcpy(buf, s->cluster_cache + in_cluster * 512, 512 * run);
+        } else {
+            if (bdrv_pread(s->hd, cluster_offset + in_cluster * 512,
+                           buf, run * 512) != run * 512) {
+                return -1;
+            }
+            if (s->crypt_method) {
+                qcow2_encrypt_sectors(s, sector_num, buf, buf, run, 0,
+                                &s->aes_decrypt_key);
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Copy the sectors [n_start, n_end) of the cluster that starts at guest
+ * sector start_sect into the cluster located at cluster_offset in the image
+ * file. The data goes through s->cluster_data and is encrypted on the way
+ * if the image is encrypted.
+ *
+ * Returns 0 on success (also when the range is empty) or the negative
+ * result of the failing read or write.
+ */
+static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
+                        uint64_t cluster_offset, int n_start, int n_end)
+{
+    BDRVQcowState *s = bs->opaque;
+    const int count = n_end - n_start;
+    int rc;
+
+    if (count <= 0) {
+        return 0;
+    }
+
+    rc = qcow_read(bs, start_sect + n_start, s->cluster_data, count);
+    if (rc < 0) {
+        return rc;
+    }
+
+    if (s->crypt_method) {
+        qcow2_encrypt_sectors(s, start_sect + n_start,
+                        s->cluster_data,
+                        s->cluster_data, count, 1,
+                        &s->aes_encrypt_key);
+    }
+
+    rc = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
+                    s->cluster_data, count);
+    return rc < 0 ? rc : 0;
+}
+
+
+/*
+ * get_cluster_offset
+ *
+ * For a given byte offset of the disk image, look up the offset of the
+ * corresponding cluster in the qcow2 file.
+ *
+ * on entry, *num is the number of sectors we'd like to access starting at
+ * offset.
+ *
+ * on exit, *num is the number of those sectors that can be handled in one
+ * go: they lie in clusters that are contiguous in the file, or in
+ * consecutive unallocated clusters. *num is left untouched when the L2
+ * table cannot be loaded.
+ *
+ * Returns the L2 entry of the cluster with QCOW_OFLAG_COPIED cleared, or 0
+ * if the cluster is not allocated or the L2 table cannot be loaded.
+ *
+ */
+
+uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+    int *num)
+{
+    BDRVQcowState *s = bs->opaque;
+    const int l1_bits = s->l2_bits + s->cluster_bits;
+    const int first_sector = (offset >> 9) & (s->cluster_sectors - 1);
+    uint64_t found = 0;
+    uint64_t l2_offset, *l2_table;
+    int wanted, reachable, l1_index, l2_index, nb_clusters, run;
+
+    /* sector counts below are measured from the start of the cluster */
+    wanted = *num + first_sector;
+
+    /* bytes between the offset and the end of the area covered by the L1
+       entry, then the same expressed in sectors */
+    reachable = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
+    reachable = (reachable >> 9) + first_sector;
+
+    if (wanted > reachable) {
+        wanted = reachable;
+    }
+
+    /* seek the l2 offset in the l1 table */
+    l1_index = offset >> l1_bits;
+    if (l1_index >= s->l1_size) {
+        goto out;
+    }
+
+    l2_offset = s->l1_table[l1_index];
+    if (!l2_offset) {
+        goto out;
+    }
+
+    /* load the l2 table in memory */
+    l2_table = l2_load(bs, l2_offset & ~QCOW_OFLAG_COPIED);
+    if (l2_table == NULL) {
+        return 0;
+    }
+
+    /* find the cluster offset for the given disk offset */
+    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+    found = be64_to_cpu(l2_table[l2_index]);
+    nb_clusters = size_to_clusters(s, wanted << 9);
+
+    if (found) {
+        /* how many allocated clusters ? */
+        run = count_contiguous_clusters(nb_clusters, s->cluster_size,
+                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
+    } else {
+        /* how many empty clusters ? */
+        run = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+    }
+
+    reachable = (run * s->cluster_sectors);
+out:
+    if (reachable > wanted) {
+        reachable = wanted;
+    }
+
+    *num = reachable - first_sector;
+
+    return found & ~QCOW_OFLAG_COPIED;
+}
+
+/*
+ * get_cluster_table
+ *
+ * for a given disk offset, load (and allocate if needed)
+ * the l2 table. The L1 table is grown first when the offset lies beyond it.
+ *
+ * the l2 table, its offset in the qcow2 file and the cluster index
+ * in the l2 table are given to the caller through new_l2_table,
+ * new_l2_offset and new_l2_index.
+ *
+ * Returns 1 on success and 0 on failure (the outputs are not written then).
+ *
+ */
+
+static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
+                             uint64_t **new_l2_table,
+                             uint64_t *new_l2_offset,
+                             int *new_l2_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t l1_entry, table_offset, *table;
+    int l1_index;
+
+    /* seek the l2 offset in the l1 table */
+
+    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    if (l1_index >= s->l1_size &&
+        qcow2_grow_l1_table(bs, l1_index + 1) < 0) {
+        return 0;
+    }
+    l1_entry = s->l1_table[l1_index];
+
+    if (l1_entry & QCOW_OFLAG_COPIED) {
+        /* load the l2 table in memory */
+        table_offset = l1_entry & ~QCOW_OFLAG_COPIED;
+        table = l2_load(bs, table_offset);
+        if (table == NULL) {
+            return 0;
+        }
+    } else {
+        /* no table yet, or one without the COPIED flag: release the old
+           clusters (if any) and set up a table of our own */
+        if (l1_entry) {
+            qcow2_free_clusters(bs, l1_entry, s->l2_size * sizeof(uint64_t));
+        }
+        table = l2_allocate(bs, l1_index);
+        if (table == NULL) {
+            return 0;
+        }
+        table_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
+    }
+
+    /* find the cluster offset for the given disk offset */
+
+    *new_l2_table = table;
+    *new_l2_offset = table_offset;
+    *new_l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+
+    return 1;
+}
+
+/*
+ * alloc_compressed_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the cluster does not carry QCOW_OFLAG_COPIED, whatever it pointed to
+ * is released and compressed_size bytes are allocated for a new compressed
+ * cluster, whose L2 entry is written to the image file.
+ *
+ * Return the cluster offset if successful (for a newly allocated cluster
+ * this is the full L2 entry, including the compressed flag and the sector
+ * count),
+ * Return 0, otherwise.
+ *
+ */
+
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+                                               uint64_t offset,
+                                               int compressed_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t l2_offset, *l2_table, entry;
+    int l2_index, nb_csectors;
+
+    if (get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index) == 0) {
+        return 0;
+    }
+
+    entry = be64_to_cpu(l2_table[l2_index]);
+    if (entry & QCOW_OFLAG_COPIED) {
+        return entry & ~QCOW_OFLAG_COPIED;
+    }
+
+    if (entry) {
+        qcow2_free_any_clusters(bs, entry, 1);
+    }
+
+    entry = qcow2_alloc_bytes(bs, compressed_size);
+    /* number of sector boundaries between the first and the last byte of
+       the compressed data */
+    nb_csectors = ((entry + compressed_size - 1) >> 9) - (entry >> 9);
+
+    entry |= QCOW_OFLAG_COMPRESSED |
+             ((uint64_t)nb_csectors << s->csize_shift);
+
+    /* update L2 table */
+
+    /* compressed clusters never have the copied flag */
+
+    l2_table[l2_index] = cpu_to_be64(entry);
+    if (bdrv_pwrite(s->hd,
+                    l2_offset + l2_index * sizeof(uint64_t),
+                    &l2_table[l2_index],
+                    sizeof(uint64_t)) != sizeof(uint64_t)) {
+        return 0;
+    }
+
+    return entry;
+}
+
+/*
+ * Write L2 table updates to disk, writing whole sectors to avoid a
+ * read-modify-write in bdrv_pwrite
+ *
+ * l2_table is the in-memory table, l2_offset its offset in the image file;
+ * the num entries starting at l2_index have changed. Every 512-byte sector
+ * of the table that contains one of them is written.
+ *
+ * Returns 0 on success and -1 if the write failed.
+ */
+#define L2_ENTRIES_PER_SECTOR (512 / 8)
+static int write_l2_entries(BDRVQcowState *s, uint64_t *l2_table,
+    uint64_t l2_offset, int l2_index, int num)
+{
+    /* round down to the first entry of the sector, using the L2 constant
+       defined for this purpose */
+    int l2_start_index = l2_index & ~(L2_ENTRIES_PER_SECTOR - 1);
+    int start_offset = (8 * l2_index) & ~511;
+    int end_offset = (8 * (l2_index + num) + 511) & ~511;
+    size_t len = end_offset - start_offset;
+
+    if (bdrv_pwrite(s->hd, l2_offset + start_offset, &l2_table[l2_start_index],
+        len) != len)
+    {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Make the clusters allocated for a write visible in the L2 table.
+ *
+ * cluster_offset is the offset of the first new cluster in the image file;
+ * m describes the allocation (guest offset, number of clusters, and which
+ * sectors of the first and last cluster the write itself does not cover).
+ * The uncovered sectors are copied into the new clusters first, then the L2
+ * entries are pointed at the new clusters, and finally the clusters that
+ * the entries referenced before are released.
+ *
+ * Returns 0 on success (also when m->nb_clusters is 0) and a negative
+ * value on failure.
+ */
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
+    QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx, nb_replaced = 0, l2_index, ret;
+    uint64_t *replaced, first_sect, l2_offset, *l2_table;
+
+    if (m->nb_clusters == 0) {
+        return 0;
+    }
+
+    replaced = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
+
+    /* copy content of unmodified sectors */
+    first_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
+    if (m->n_start) {
+        ret = copy_sectors(bs, first_sect, cluster_offset, 0, m->n_start);
+        if (ret < 0) {
+            goto err;
+        }
+    }
+
+    if (m->nb_available & (s->cluster_sectors - 1)) {
+        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
+        ret = copy_sectors(bs, first_sect + end, cluster_offset + (end << 9),
+                m->nb_available - end, s->cluster_sectors);
+        if (ret < 0) {
+            goto err;
+        }
+    }
+
+    ret = -EIO;
+    /* update L2 table */
+    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index)) {
+        goto err;
+    }
+
+    for (idx = 0; idx < m->nb_clusters; idx++) {
+        uint64_t *entry = &l2_table[l2_index + idx];
+
+        /* if two concurrent writes happen to the same unallocated cluster
+         * each write allocates separate cluster and writes data concurrently.
+         * The first one to complete updates l2 table with pointer to its
+         * cluster the second one has to do RMW (which is done above by
+         * copy_sectors()), update l2 table with its cluster pointer and free
+         * old cluster. This is what this loop does */
+        if (*entry != 0) {
+            replaced[nb_replaced++] = *entry;
+        }
+
+        *entry = cpu_to_be64((cluster_offset +
+                    (idx << s->cluster_bits)) | QCOW_OFLAG_COPIED);
+    }
+
+    if (write_l2_entries(s, l2_table, l2_offset, l2_index, m->nb_clusters) < 0) {
+        ret = -1;
+        goto err;
+    }
+
+    for (idx = 0; idx < nb_replaced; idx++) {
+        qcow2_free_any_clusters(bs,
+            be64_to_cpu(replaced[idx]) & ~QCOW_OFLAG_COPIED, 1);
+    }
+
+    ret = 0;
+err:
+    qemu_free(replaced);
+    return ret;
+}
+
+/*
+ * alloc_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new cluster.
+ *
+ * n_start and n_end are sector counts (n_end << 9 is used as a size in
+ * bytes). On exit m->nb_available is the number of sectors that can be
+ * used, at most n_end, and *num is m->nb_available - n_start.
+ *
+ * When the first cluster already carries QCOW_OFLAG_COPIED it is returned
+ * as is and m->nb_clusters is set to 0; m->offset and m->n_start are not
+ * written on that path. Otherwise new clusters are allocated and m records
+ * offset, n_start and the number of clusters allocated.
+ *
+ * Return the cluster offset if successful,
+ * Return 0 if the L2 table cannot be obtained.
+ *
+ * NOTE(review): the result of qcow2_alloc_clusters() is returned without
+ * being checked here.
+ *
+ */
+
+uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs,
+                                    uint64_t offset,
+                                    int n_start, int n_end,
+                                    int *num, QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index, ret;
+    uint64_t l2_offset, *l2_table, cluster_offset;
+    int nb_clusters, i = 0;
+
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    if (ret == 0)
+        return 0;
+
+    nb_clusters = size_to_clusters(s, n_end << 9);
+
+    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+    /* We keep all QCOW_OFLAG_COPIED clusters: the request is limited to the
+       run of contiguous clusters starting here and nothing is allocated
+       (m->nb_clusters == 0 makes qcow2_alloc_cluster_link_l2() a no-op) */
+
+    if (cluster_offset & QCOW_OFLAG_COPIED) {
+        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
+                &l2_table[l2_index], 0, 0);
+
+        cluster_offset &= ~QCOW_OFLAG_COPIED;
+        m->nb_clusters = 0;
+
+        goto out;
+    }
+
+    /* for the moment, multiple compressed clusters are not managed */
+
+    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
+        nb_clusters = 1;
+
+    /* how many available clusters ? i advances over contiguous clusters
+       and over unallocated entries in turn, and the scan stops at the first
+       entry that is neither, or that has the COPIED or COMPRESSED flag */
+
+    while (i < nb_clusters) {
+        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
+                &l2_table[l2_index], i, 0);
+
+        if(be64_to_cpu(l2_table[l2_index + i]))
+            break;
+
+        i += count_contiguous_free_clusters(nb_clusters - i,
+                &l2_table[l2_index + i]);
+
+        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
+
+        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
+                (cluster_offset & QCOW_OFLAG_COMPRESSED))
+            break;
+    }
+    nb_clusters = i;
+
+    /* allocate a new cluster */
+
+    cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size);
+
+    /* save info needed for meta data update, which is done later by
+       qcow2_alloc_cluster_link_l2() */
+    m->offset = offset;
+    m->n_start = n_start;
+    m->nb_clusters = nb_clusters;
+
+out:
+    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
+
+    *num = m->nb_available - n_start;
+
+    return cluster_offset;
+}
+
+/*
+ * Inflate the raw deflate stream (no zlib header) of buf_size bytes at buf
+ * into out_buf.
+ *
+ * Returns 0 if exactly out_buf_size bytes were produced, -1 otherwise.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+    z_stream strm;
+    int status, produced;
+
+    memset(&strm, 0, sizeof(strm));
+
+    strm.next_in = (uint8_t *)buf;
+    strm.avail_in = buf_size;
+    strm.next_out = out_buf;
+    strm.avail_out = out_buf_size;
+
+    if (inflateInit2(&strm, -12) != Z_OK) {
+        return -1;
+    }
+
+    status = inflate(&strm, Z_FINISH);
+    produced = strm.next_out - out_buf;
+    inflateEnd(&strm);
+
+    /* Z_BUF_ERROR is accepted as well: what counts is that the output
+       buffer has been filled completely */
+    if (status != Z_STREAM_END && status != Z_BUF_ERROR) {
+        return -1;
+    }
+    if (produced != out_buf_size) {
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Make sure s->cluster_cache holds the decompressed contents of the
+ * compressed cluster described by the L2 entry cluster_offset.
+ *
+ * Nothing is read when the cache already holds that cluster. Returns 0 on
+ * success and -1 if reading or decompressing failed.
+ */
+int qcow2_decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
+{
+    uint64_t coffset = cluster_offset & s->cluster_offset_mask;
+    int nb_csectors, sector_offset, csize;
+
+    if (s->cluster_cache_offset == coffset) {
+        return 0;
+    }
+
+    /* the entry stores the number of sectors the data spans, minus one;
+       the data may start in the middle of the first sector */
+    nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+    sector_offset = coffset & 511;
+    csize = nb_csectors * 512 - sector_offset;
+
+    if (bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors) < 0) {
+        return -1;
+    }
+    if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                          s->cluster_data + sector_offset, csize) < 0) {
+        return -1;
+    }
+
+    s->cluster_cache_offset = coffset;
+    return 0;
+}
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
new file mode 100644
index 0000000..dd6e293
--- /dev/null
+++ b/block/qcow2-refcount.c
@@ -0,0 +1,854 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/qcow2.h"
+
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
+static int update_refcount(BlockDriverState *bs,
+                            int64_t offset, int64_t length,
+                            int addend);
+
+/*********************************************************/
+/* refcount handling */
+
+/*
+ * Allocate the refcount block cache and the in-memory refcount table, and
+ * load the table from the image (converting it to host byte order) when it
+ * is non-empty.
+ *
+ * Returns 0 on success and -ENOMEM if the table cannot be read completely.
+ * The buffers are not released here on failure.
+ */
+int qcow2_refcount_init(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int table_bytes, idx;
+
+    s->refcount_block_cache = qemu_malloc(s->cluster_size);
+    table_bytes = s->refcount_table_size * sizeof(uint64_t);
+    s->refcount_table = qemu_malloc(table_bytes);
+
+    if (!(s->refcount_table_size > 0)) {
+        return 0;
+    }
+
+    if (bdrv_pread(s->hd, s->refcount_table_offset,
+                   s->refcount_table, table_bytes) != table_bytes) {
+        return -ENOMEM;
+    }
+
+    for (idx = 0; idx < s->refcount_table_size; idx++) {
+        be64_to_cpus(&s->refcount_table[idx]);
+    }
+    return 0;
+}
+
+/* Release the refcount table and the refcount block cache buffers. */
+void qcow2_refcount_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    qemu_free(state->refcount_block_cache);
+    qemu_free(state->refcount_table);
+}
+
+
+/*
+ * Read the refcount block located at refcount_block_offset into the
+ * single-entry block cache and remember which block it now holds.
+ *
+ * Returns 0 on success, -EIO on a short or failed read (the recorded
+ * cache offset is left unchanged in that case).
+ */
+static int load_refcount_block(BlockDriverState *bs,
+                               int64_t refcount_block_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    if (bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache,
+                   s->cluster_size) != s->cluster_size) {
+        return -EIO;
+    }
+
+    s->refcount_block_cache_offset = refcount_block_offset;
+    return 0;
+}
+
+/*
+ * Look up the reference count of the cluster with index cluster_index.
+ *
+ * Clusters beyond the refcount table, or whose refcount block is not
+ * allocated, report 0.  If the refcount block cannot be loaded the
+ * cluster is reported as in use (1).
+ */
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int table_idx, block_index;
+    int64_t block_offset;
+
+    table_idx = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+    if (table_idx >= s->refcount_table_size) {
+        return 0;
+    }
+
+    block_offset = s->refcount_table[table_idx];
+    if (!block_offset) {
+        return 0;
+    }
+
+    /* better than nothing: return allocated if read error */
+    if (block_offset != s->refcount_block_cache_offset &&
+        load_refcount_block(bs, block_offset) < 0) {
+        return 1;
+    }
+
+    block_index = cluster_index &
+        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+    return be16_to_cpu(s->refcount_block_cache[block_index]);
+}
+
+/*
+ * Enlarge the refcount table so that it holds at least min_size entries.
+ *
+ * The table size in clusters is grown by roughly 1.5x until it is large
+ * enough.  The new table is written to freshly allocated clusters, the
+ * image header is updated to point at it, and only then is the in-memory
+ * state switched over and the old table's clusters released.
+ *
+ * Returns 0 on success (or if the table is already big enough) and -EIO
+ * if writing the new table or the header fails.
+ */
+static int grow_refcount_table(BlockDriverState *bs, int min_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
+    uint64_t *new_table;
+    int64_t table_offset;
+    uint8_t data[12];
+    int old_table_size;
+    int64_t old_table_offset;
+
+    if (min_size <= s->refcount_table_size)
+        return 0;
+    /* compute new table size */
+    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
+    for(;;) {
+        if (refcount_table_clusters == 0) {
+            refcount_table_clusters = 1;
+        } else {
+            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
+        }
+        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
+        if (min_size <= new_table_size)
+            break;
+    }
+#ifdef DEBUG_ALLOC2
+    printf("grow_refcount_table from %d to %d\n",
+           s->refcount_table_size,
+           new_table_size);
+#endif
+    new_table_size2 = new_table_size * sizeof(uint64_t);
+    new_table = qemu_mallocz(new_table_size2);
+    memcpy(new_table, s->refcount_table,
+           s->refcount_table_size * sizeof(uint64_t));
+    /* the on-disk table is big-endian: swap the copied entries for the
+     * write, then swap them back for in-memory use */
+    for(i = 0; i < s->refcount_table_size; i++)
+        cpu_to_be64s(&new_table[i]);
+    /* Note: we cannot update the refcount now to avoid recursion */
+    table_offset = alloc_clusters_noref(bs, new_table_size2);
+    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
+    if (ret != new_table_size2)
+        goto fail;
+    for(i = 0; i < s->refcount_table_size; i++)
+        be64_to_cpus(&new_table[i]);
+
+    /* header update: 8-byte table offset followed by 4-byte cluster count */
+    cpu_to_be64w((uint64_t*)data, table_offset);
+    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
+                    data, sizeof(data)) != sizeof(data))
+        goto fail;
+    qemu_free(s->refcount_table);
+    old_table_offset = s->refcount_table_offset;
+    old_table_size = s->refcount_table_size;
+    s->refcount_table = new_table;
+    s->refcount_table_size = new_table_size;
+    s->refcount_table_offset = table_offset;
+
+    /* the new table is live now, so its own clusters can be accounted for */
+    update_refcount(bs, table_offset, new_table_size2, 1);
+    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+    return 0;
+ fail:
+    /* The new clusters came from alloc_clusters_noref() and their refcount
+     * was never raised, so there is nothing to give back through
+     * qcow2_free_clusters(): a decrement could only underflow, and it
+     * would re-enter refcount allocation from this error path. */
+    qemu_free(new_table);
+    return -EIO;
+}
+
+
+/*
+ * Make the refcount block covering cluster_index the currently cached
+ * block, creating it (and growing the refcount table) if it does not
+ * exist yet.
+ *
+ * Returns the offset of the refcount block, or a negative errno value:
+ * the result of grow_refcount_table() on failure there, -EINVAL if
+ * writing the new block or its table entry fails, -EIO if an existing
+ * block cannot be loaded.
+ */
+static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t offset, refcount_block_offset;
+    int ret, refcount_table_index;
+    uint64_t data64;
+
+    /* Find L1 index and grow refcount table if needed */
+    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+    if (refcount_table_index >= s->refcount_table_size) {
+        ret = grow_refcount_table(bs, refcount_table_index + 1);
+        if (ret < 0)
+            return ret;
+    }
+
+    /* Load or allocate the refcount block */
+    refcount_block_offset = s->refcount_table[refcount_table_index];
+    if (!refcount_block_offset) {
+        /* create a new refcount block */
+        /* Note: we cannot update the refcount now to avoid recursion */
+        offset = alloc_clusters_noref(bs, s->cluster_size);
+        /* the cache buffer doubles as the zero-filled image of the new block */
+        memset(s->refcount_block_cache, 0, s->cluster_size);
+        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
+        if (ret != s->cluster_size)
+            return -EINVAL;
+        /* NOTE(review): the in-memory table entry is set before the on-disk
+         * entry is written; if that write fails the two disagree. */
+        s->refcount_table[refcount_table_index] = offset;
+        data64 = cpu_to_be64(offset);
+        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
+                          refcount_table_index * sizeof(uint64_t),
+                          &data64, sizeof(data64));
+        if (ret != sizeof(data64))
+            return -EINVAL;
+
+        refcount_block_offset = offset;
+        s->refcount_block_cache_offset = offset;
+        /* the block is reachable now, so account for its own cluster;
+         * the return value of update_refcount() is not checked */
+        update_refcount(bs, offset, s->cluster_size, 1);
+    } else {
+        if (refcount_block_offset != s->refcount_block_cache_offset) {
+            if (load_refcount_block(bs, refcount_block_offset) < 0)
+                return -EIO;
+        }
+    }
+
+    return refcount_block_offset;
+}
+
+/* number of refcount entries that fit into one 512-byte sector */
+#define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
+
+/*
+ * Write the cached refcount block entries first_index..last_index back to
+ * the block at refcount_block_offset.  The range is widened to whole
+ * sectors before writing.
+ *
+ * Returns 0 on success, -EIO if the write is short or fails.
+ */
+static int write_refcount_block_entries(BDRVQcowState *s,
+    int64_t refcount_block_offset, int first_index, int last_index)
+{
+    size_t nbytes;
+
+    /* round the start down and the end up to a sector boundary */
+    first_index = first_index & ~(REFCOUNTS_PER_SECTOR - 1);
+    last_index = (last_index + REFCOUNTS_PER_SECTOR)
+        & ~(REFCOUNTS_PER_SECTOR - 1);
+    nbytes = (last_index - first_index) << REFCOUNT_SHIFT;
+
+    if (bdrv_pwrite(s->hd,
+                    refcount_block_offset + (first_index << REFCOUNT_SHIFT),
+                    &s->refcount_block_cache[first_index],
+                    nbytes) != nbytes) {
+        return -EIO;
+    }
+    return 0;
+}
+
+/* XXX: cache several refcount block clusters ? */
+/*
+ * Add addend to the refcount of every cluster touched by the byte range
+ * [offset, offset + length).
+ *
+ * Modified entries are collected in the cached refcount block and written
+ * out once per block: when the loop moves on to a different block, and
+ * once more after the loop.
+ *
+ * Returns 0 on success, -EINVAL if length <= 0 or a refcount would leave
+ * the range 0..0xffff, -EIO if a write-back fails, or the negative value
+ * returned by alloc_refcount_block().
+ *
+ * NOTE(review): the early returns inside the loop (allocation failure,
+ * out-of-range refcount) leave entries that were already changed in the
+ * cached block without writing them back here.
+ */
+static int update_refcount(BlockDriverState *bs,
+                            int64_t offset, int64_t length,
+                            int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t start, last, cluster_offset;
+    int64_t refcount_block_offset = 0;
+    int64_t table_index = -1, old_table_index;
+    int first_index = -1, last_index = -1;
+
+#ifdef DEBUG_ALLOC2
+    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
+           offset, length, addend);
+#endif
+    if (length <= 0)
+        return -EINVAL;
+    /* cluster-aligned offsets of the first and last cluster in the range */
+    start = offset & ~(s->cluster_size - 1);
+    last = (offset + length - 1) & ~(s->cluster_size - 1);
+    for(cluster_offset = start; cluster_offset <= last;
+        cluster_offset += s->cluster_size)
+    {
+        int block_index, refcount;
+        int64_t cluster_index = cluster_offset >> s->cluster_bits;
+
+        /* Only write refcount block to disk when we are done with it */
+        old_table_index = table_index;
+        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+        if ((old_table_index >= 0) && (table_index != old_table_index)) {
+
+            if (write_refcount_block_entries(s, refcount_block_offset,
+                first_index, last_index) < 0)
+            {
+                return -EIO;
+            }
+
+            /* start tracking a fresh dirty range for the next block */
+            first_index = -1;
+            last_index = -1;
+        }
+
+        /* Load the refcount block and allocate it if needed */
+        refcount_block_offset = alloc_refcount_block(bs, cluster_index);
+        if (refcount_block_offset < 0) {
+            return refcount_block_offset;
+        }
+
+        /* we can update the count and save it */
+        block_index = cluster_index &
+            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+        /* widen the dirty range [first_index, last_index] to cover it */
+        if (first_index == -1 || block_index < first_index) {
+            first_index = block_index;
+        }
+        if (block_index > last_index) {
+            last_index = block_index;
+        }
+
+        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
+        refcount += addend;
+        if (refcount < 0 || refcount > 0xffff)
+            return -EINVAL;
+        /* a cluster that became free lowers the allocator's search start */
+        if (refcount == 0 && cluster_index < s->free_cluster_index) {
+            s->free_cluster_index = cluster_index;
+        }
+        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
+    }
+
+    /* Write last changed block to disk */
+    if (refcount_block_offset != 0) {
+        if (write_refcount_block_entries(s, refcount_block_offset,
+            first_index, last_index) < 0)
+        {
+            return -EIO;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Adjust the refcount of a single cluster and report its new value.
+ * addend must be 1 or -1.
+ *
+ * Returns the refcount read back after the update, or the negative error
+ * from update_refcount().
+ */
+static int update_cluster_refcount(BlockDriverState *bs,
+                                   int64_t cluster_index,
+                                   int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    int err;
+
+    /* a one-byte range starting at the cluster touches only that cluster */
+    err = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
+    return (err < 0) ? err : get_refcount(bs, cluster_index);
+}
+
+
+
+/*********************************************************/
+/* cluster allocation functions */
+
+
+
+/*
+ * Find a run of free (refcount 0) contiguous clusters large enough for
+ * size bytes, scanning upwards from s->free_cluster_index, and return the
+ * byte offset of the run.  No refcount is changed; the caller must raise
+ * the refcounts itself.
+ *
+ * This implementation has no explicit error path; callers that test for
+ * a negative return never see one produced here.
+ */
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i, nb_clusters;
+
+    nb_clusters = size_to_clusters(s, size);
+retry:
+    for(i = 0; i < nb_clusters; i++) {
+        /* distinct name: this used to shadow the loop counter */
+        int64_t next_cluster_index = s->free_cluster_index++;
+        /* a used cluster breaks the run: restart right after it */
+        if (get_refcount(bs, next_cluster_index) != 0)
+            goto retry;
+    }
+#ifdef DEBUG_ALLOC2
+    printf("alloc_clusters: size=%lld -> %lld\n",
+            size,
+            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+#endif
+    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+}
+
+/*
+ * Allocate contiguous clusters for size bytes and raise the refcount of
+ * each of them by one.  Returns the byte offset of the allocation; the
+ * result of the refcount update is not checked.
+ */
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
+{
+    const int64_t cluster_start = alloc_clusters_noref(bs, size);
+
+    update_refcount(bs, cluster_start, size, 1);
+    return cluster_start;
+}
+
+/* only used to allocate compressed sectors. We try to allocate
+   contiguous sectors. size must be <= cluster_size */
+/*
+ * Hands out size bytes starting at s->free_byte_offset, the running
+ * position inside a partially used cluster (0 means "no current
+ * cluster").  Returns the byte offset of the allocated range.
+ *
+ * The return values of update_cluster_refcount() are not checked.
+ */
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t offset, cluster_offset;
+    int free_in_cluster;
+
+    assert(size > 0 && size <= s->cluster_size);
+    if (s->free_byte_offset == 0) {
+        s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
+    }
+ redo:
+    free_in_cluster = s->cluster_size -
+        (s->free_byte_offset & (s->cluster_size - 1));
+    if (size <= free_in_cluster) {
+        /* enough space in current cluster */
+        offset = s->free_byte_offset;
+        s->free_byte_offset += size;
+        free_in_cluster -= size;
+        /* cluster exactly used up: the next call starts a new one */
+        if (free_in_cluster == 0)
+            s->free_byte_offset = 0;
+        /* a range starting mid-cluster adds one more reference to that
+         * cluster; a range starting at the cluster boundary does not */
+        if ((offset & (s->cluster_size - 1)) != 0)
+            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+    } else {
+        /* the request does not fit: get another cluster and see whether
+         * it directly follows the current one */
+        offset = qcow2_alloc_clusters(bs, s->cluster_size);
+        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+        if ((cluster_offset + s->cluster_size) == offset) {
+            /* we are lucky: contiguous data */
+            offset = s->free_byte_offset;
+            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+            s->free_byte_offset += size;
+        } else {
+            /* not adjacent: restart the allocation inside the new cluster */
+            s->free_byte_offset = offset;
+            goto redo;
+        }
+    }
+    return offset;
+}
+
+/*
+ * Drop one reference from every cluster touched by the byte range
+ * [offset, offset + size).  The result of update_refcount() is discarded.
+ */
+void qcow2_free_clusters(BlockDriverState *bs,
+                          int64_t offset, int64_t size)
+{
+    const int addend = -1;
+
+    update_refcount(bs, offset, size, addend);
+}
+
+/*
+ * free_any_clusters
+ *
+ * Release the storage behind an L2 entry, whichever kind it is.  For a
+ * compressed entry the sector-aligned byte range of the compressed data
+ * is freed; otherwise nb_clusters whole clusters starting at
+ * cluster_offset are freed.
+ *
+ */
+
+void qcow2_free_any_clusters(BlockDriverState *bs,
+    uint64_t cluster_offset, int nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    if (!(cluster_offset & QCOW_OFLAG_COMPRESSED)) {
+        qcow2_free_clusters(bs, cluster_offset,
+                            nb_clusters << s->cluster_bits);
+    } else {
+        /* the sector count is stored in the entry, biased by one */
+        int nb_csectors = ((cluster_offset >> s->csize_shift) &
+                           s->csize_mask) + 1;
+
+        qcow2_free_clusters(bs,
+            (cluster_offset & s->cluster_offset_mask) & ~511,
+            nb_csectors * 512);
+    }
+}
+
+
+
+/*********************************************************/
+/* snapshots and image creation */
+
+
+
+/*
+ * Image-creation helper: increment, in the big-endian refcount block held
+ * in the creation state, the entry of every cluster touched by the byte
+ * range [offset, offset + size).
+ */
+void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
+    int64_t size)
+{
+    const int64_t first = offset & ~(s->cluster_size - 1);
+    const int64_t final = (offset + size - 1)  & ~(s->cluster_size - 1);
+    int64_t pos;
+
+    for (pos = first; pos <= final; pos += s->cluster_size) {
+        uint16_t *entry = &s->refcount_block[pos >> s->cluster_bits];
+        int count = be16_to_cpu(*entry);
+
+        count += 1;
+        *entry = cpu_to_be16(count);
+    }
+}
+
+/*
+ * update the refcounts of snapshots and the copied flag
+ *
+ * Walks every L2 table reachable from the L1 table at l1_table_offset
+ * (l1_size entries).  addend is applied to the refcount of each referenced
+ * data cluster and of each L2 table (addend == 0 only reads refcounts).
+ * QCOW_OFLAG_COPIED is then recomputed: it is set on an entry when the
+ * resulting refcount is exactly 1, and never on compressed clusters.
+ * Modified L2 tables and a modified L1 table are written back.
+ *
+ * When l1_table_offset is the active L1 table, s->l1_table is updated in
+ * place; otherwise a temporary copy is read from the image.
+ *
+ * Returns 0 on success and -EIO if a table cannot be read or written.
+ */
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+    int64_t l1_table_offset, int l1_size, int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+    int64_t old_offset, old_l2_offset;
+    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
+    int ret;
+
+    qcow2_l2_cache_reset(bs);
+
+    l2_table = NULL;
+    l1_table = NULL;
+    l1_size2 = l1_size * sizeof(uint64_t);
+    l1_allocated = 0;
+    if (l1_table_offset != s->l1_table_offset) {
+        l1_table = qemu_malloc(l1_size2);
+        l1_allocated = 1;
+        if (bdrv_pread(s->hd, l1_table_offset,
+                       l1_table, l1_size2) != l1_size2)
+            goto fail;
+        for(i = 0;i < l1_size; i++)
+            be64_to_cpus(&l1_table[i]);
+    } else {
+        assert(l1_size == s->l1_size);
+        l1_table = s->l1_table;
+        l1_allocated = 0;
+    }
+
+    l2_size = s->l2_size * sizeof(uint64_t);
+    l2_table = qemu_malloc(l2_size);
+    l1_modified = 0;
+    for(i = 0; i < l1_size; i++) {
+        l2_offset = l1_table[i];
+        if (l2_offset) {
+            old_l2_offset = l2_offset;
+            l2_offset &= ~QCOW_OFLAG_COPIED;
+            l2_modified = 0;
+            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+                goto fail;
+            for(j = 0; j < s->l2_size; j++) {
+                offset = be64_to_cpu(l2_table[j]);
+                if (offset != 0) {
+                    old_offset = offset;
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    if (offset & QCOW_OFLAG_COMPRESSED) {
+                        nb_csectors = ((offset >> s->csize_shift) &
+                                       s->csize_mask) + 1;
+                        if (addend != 0)
+                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
+                                            nb_csectors * 512, addend);
+                        /* compressed clusters are never modified */
+                        refcount = 2;
+                    } else {
+                        if (addend != 0) {
+                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
+                        } else {
+                            refcount = get_refcount(bs, offset >> s->cluster_bits);
+                        }
+                    }
+
+                    if (refcount == 1) {
+                        offset |= QCOW_OFLAG_COPIED;
+                    }
+                    if (offset != old_offset) {
+                        l2_table[j] = cpu_to_be64(offset);
+                        l2_modified = 1;
+                    }
+                }
+            }
+            if (l2_modified) {
+                if (bdrv_pwrite(s->hd,
+                                l2_offset, l2_table, l2_size) != l2_size)
+                    goto fail;
+            }
+
+            if (addend != 0) {
+                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+            } else {
+                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+            }
+            if (refcount == 1) {
+                l2_offset |= QCOW_OFLAG_COPIED;
+            }
+            if (l2_offset != old_l2_offset) {
+                l1_table[i] = l2_offset;
+                l1_modified = 1;
+            }
+        }
+    }
+    if (l1_modified) {
+        for(i = 0; i < l1_size; i++)
+            cpu_to_be64s(&l1_table[i]);
+        ret = bdrv_pwrite(s->hd, l1_table_offset, l1_table, l1_size2);
+        /* Restore host byte order before acting on the write result:
+         * l1_table may be the live s->l1_table, which must not be left
+         * byte-swapped when the write fails. */
+        for(i = 0; i < l1_size; i++)
+            be64_to_cpus(&l1_table[i]);
+        if (ret != l1_size2)
+            goto fail;
+    }
+    if (l1_allocated)
+        qemu_free(l1_table);
+    qemu_free(l2_table);
+    return 0;
+ fail:
+    if (l1_allocated)
+        qemu_free(l1_table);
+    qemu_free(l2_table);
+    return -EIO;
+}
+
+
+
+
+/*********************************************************/
+/* refcount checking functions */
+
+
+
+/*
+ * Increases the refcount for a range of clusters in a given refcount table.
+ * This is used to construct a temporary refcount table out of L1 and L2 tables
+ * which can be compared to the refcount table saved in the image.
+ *
+ * A range with size <= 0 is ignored.  Clusters outside the table and
+ * counters that wrap around to 0 are reported on stderr.
+ *
+ * Returns the number of errors in the image that were found
+ */
+static int inc_refcounts(BlockDriverState *bs,
+                          uint16_t *refcount_table,
+                          int refcount_table_size,
+                          int64_t offset, int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t start, last, cluster_offset;
+    /* 64-bit index: an int would truncate large cluster indexes from a
+     * corrupt image, possibly onto a valid slot of the table */
+    int64_t k;
+    int errors = 0;
+
+    if (size <= 0)
+        return 0;
+
+    start = offset & ~(s->cluster_size - 1);
+    last = (offset + size - 1) & ~(s->cluster_size - 1);
+    for(cluster_offset = start; cluster_offset <= last;
+        cluster_offset += s->cluster_size) {
+        k = cluster_offset >> s->cluster_bits;
+        if (k < 0 || k >= refcount_table_size) {
+            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+                cluster_offset);
+            errors++;
+        } else {
+            if (++refcount_table[k] == 0) {
+                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+                    "\n", cluster_offset);
+                errors++;
+            }
+        }
+    }
+
+    return errors;
+}
+
+/*
+ * Increases the refcount in the given refcount table for all the clusters
+ * referenced in the L2 table at l2_offset. While doing so, performs some
+ * checks on L2 entries: compressed entries must not carry
+ * QCOW_OFLAG_COPIED, uncompressed entries must be cluster aligned and,
+ * when check_copied is set, must carry QCOW_OFLAG_COPIED iff their stored
+ * refcount is 1.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l2(BlockDriverState *bs,
+    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
+    int check_copied)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l2_table, offset;
+    int i, l2_size, nb_csectors, refcount;
+    int errors = 0;
+
+    /* Read L2 table from disk */
+    l2_size = s->l2_size * sizeof(uint64_t);
+    l2_table = qemu_malloc(l2_size);
+
+    if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+        goto fail;
+
+    /* Do the actual checks */
+    for(i = 0; i < s->l2_size; i++) {
+        offset = be64_to_cpu(l2_table[i]);
+        if (offset != 0) {
+            if (offset & QCOW_OFLAG_COMPRESSED) {
+                /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+                if (offset & QCOW_OFLAG_COPIED) {
+                    fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+                        "copied flag must never be set for compressed "
+                        "clusters\n", offset >> s->cluster_bits);
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    errors++;
+                }
+
+                /* Mark cluster as used */
+                nb_csectors = ((offset >> s->csize_shift) &
+                               s->csize_mask) + 1;
+                offset &= s->cluster_offset_mask;
+                errors += inc_refcounts(bs, refcount_table,
+                              refcount_table_size,
+                              offset & ~511, nb_csectors * 512);
+            } else {
+                /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+                if (check_copied) {
+                    uint64_t entry = offset;
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    refcount = get_refcount(bs, offset >> s->cluster_bits);
+                    if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
+                        fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
+                            PRIx64 " refcount=%d\n", entry, refcount);
+                        errors++;
+                    }
+                }
+
+                /* Mark cluster as used */
+                offset &= ~QCOW_OFLAG_COPIED;
+                errors += inc_refcounts(bs, refcount_table,
+                              refcount_table_size,
+                              offset, s->cluster_size);
+
+                /* Correct offsets are cluster aligned */
+                if (offset & (s->cluster_size - 1)) {
+                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+                        "properly aligned; L2 entry corrupted.\n", offset);
+                    errors++;
+                }
+            }
+        }
+    }
+
+    qemu_free(l2_table);
+    return errors;
+
+fail:
+    /* name this function, not check_refcounts_l1, so the log is accurate */
+    fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
+    qemu_free(l2_table);
+    return -EIO;
+}
+
+/*
+ * Accounts for an L1 table in the given refcount table: the table itself,
+ * every L2 table it points to, and (through check_refcounts_l2) every
+ * cluster those reference.  Along the way each L1 entry is checked for
+ * cluster alignment and, when check_copied is set, for a QCOW_OFLAG_COPIED
+ * flag that matches "stored refcount == 1".
+ *
+ * Returns the number of errors found by the checks, or -EIO when the L1
+ * table cannot be read or an L2 check reports an internal error.
+ */
+static int check_refcounts_l1(BlockDriverState *bs,
+                              uint16_t *refcount_table,
+                              int refcount_table_size,
+                              int64_t l1_table_offset, int l1_size,
+                              int check_copied)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *table, entry, table_bytes;
+    int idx, refs, rc;
+    int errors = 0;
+
+    table_bytes = l1_size * sizeof(uint64_t);
+
+    /* the clusters holding the L1 table itself are in use */
+    errors += inc_refcounts(bs, refcount_table, refcount_table_size,
+                  l1_table_offset, table_bytes);
+
+    /* fetch the table and bring it into host byte order */
+    table = qemu_malloc(table_bytes);
+    if (bdrv_pread(s->hd, l1_table_offset,
+                   table, table_bytes) != table_bytes) {
+        goto fail;
+    }
+    for (idx = 0; idx < l1_size; idx++) {
+        be64_to_cpus(&table[idx]);
+    }
+
+    for (idx = 0; idx < l1_size; idx++) {
+        entry = table[idx];
+        if (!entry) {
+            continue;
+        }
+
+        /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+        if (check_copied) {
+            refs = get_refcount(bs, (entry & ~QCOW_OFLAG_COPIED)
+                >> s->cluster_bits);
+            if ((refs == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
+                fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
+                    " refcount=%d\n", entry, refs);
+                errors++;
+            }
+        }
+
+        /* the cluster holding the L2 table is in use */
+        entry &= ~QCOW_OFLAG_COPIED;
+        errors += inc_refcounts(bs, refcount_table,
+                      refcount_table_size,
+                      entry,
+                      s->cluster_size);
+
+        /* L2 tables are cluster aligned */
+        if (entry & (s->cluster_size - 1)) {
+            fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+                "cluster aligned; L1 entry corrupted\n", entry);
+            errors++;
+        }
+
+        /* descend into the L2 table */
+        rc = check_refcounts_l2(bs, refcount_table, refcount_table_size,
+            entry, check_copied);
+        if (rc < 0) {
+            goto fail;
+        }
+        errors += rc;
+    }
+
+    qemu_free(table);
+    return errors;
+
+fail:
+    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+    qemu_free(table);
+    return -EIO;
+}
+
+/*
+ * Checks an image for refcount consistency.
+ *
+ * A reference refcount table is rebuilt from the header, the active L1
+ * table, every snapshot's L1 table, the snapshot table and the refcount
+ * metadata itself, and then compared cluster by cluster with the
+ * refcounts stored in the image.
+ *
+ * Returns 0 if no errors are found, the number of errors in case the image is
+ * detected as corrupted, and -errno when an internal error occurred.
+ */
+int qcow2_check_refcounts(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t size;
+    int nb_clusters, refcount1, refcount2, i;
+    QCowSnapshot *sn;
+    uint16_t *refcount_table;
+    int ret, errors = 0;
+
+    size = bdrv_getlength(s->hd);
+    nb_clusters = size_to_clusters(s, size);
+    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
+
+    /* header */
+    errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                  0, s->cluster_size);
+
+    /* current L1 table */
+    ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
+                       s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0) {
+        /* leave through the common exit so refcount_table is released */
+        goto out;
+    }
+    errors += ret;
+
+    /* snapshots */
+    for(i = 0; i < s->nb_snapshots; i++) {
+        sn = s->snapshots + i;
+        /* count what the snapshot check finds instead of discarding it */
+        ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
+                                 sn->l1_table_offset, sn->l1_size, 0);
+        if (ret < 0) {
+            goto out;
+        }
+        errors += ret;
+    }
+    errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                  s->snapshots_offset, s->snapshots_size);
+
+    /* refcount data */
+    errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                  s->refcount_table_offset,
+                  s->refcount_table_size * sizeof(uint64_t));
+    for(i = 0; i < s->refcount_table_size; i++) {
+        int64_t offset;
+        offset = s->refcount_table[i];
+        if (offset != 0) {
+            errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                          offset, s->cluster_size);
+        }
+    }
+
+    /* compare ref counts */
+    for(i = 0; i < nb_clusters; i++) {
+        refcount1 = get_refcount(bs, i);
+        refcount2 = refcount_table[i];
+        if (refcount1 != refcount2) {
+            fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
+                   i, refcount1, refcount2);
+            errors++;
+        }
+    }
+
+    ret = errors;
+
+out:
+    qemu_free(refcount_table);
+
+    return ret;
+}
+
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
new file mode 100644
index 0000000..e1e4d89
--- /dev/null
+++ b/block/qcow2-snapshot.c
@@ -0,0 +1,405 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/qcow2.h"
+
+/*
+ * On-disk header of one entry of the snapshot table.  The multi-byte
+ * fields are stored big-endian: the reader in this file converts them with
+ * be*_to_cpu() and the writer fills them with cpu_to_be*().
+ */
+typedef struct __attribute__((packed)) QCowSnapshotHeader {
+    /* header is 8 byte aligned */
+    uint64_t l1_table_offset; /* file offset of the snapshot's L1 table */
+
+    uint32_t l1_size;         /* number of 64-bit entries in that L1 table */
+    uint16_t id_str_size;     /* length of the id string, no NUL stored */
+    uint16_t name_size;       /* length of the name, no NUL stored */
+
+    uint32_t date_sec;
+    uint32_t date_nsec;
+
+    uint64_t vm_clock_nsec;
+
+    uint32_t vm_state_size;
+    uint32_t extra_data_size; /* for extension */
+    /* extra data follows */
+    /* id_str follows */
+    /* name follows  */
+} QCowSnapshotHeader;
+
+/*
+ * Release the in-memory snapshot table of 'bs': the name and id string of
+ * every entry, then the array itself.  On return the state holds no
+ * snapshots (s->snapshots == NULL, s->nb_snapshots == 0).
+ *
+ * Safe to call when the table was never loaded: qcow_open() stores the
+ * snapshot count from the image header before the table is read, so its
+ * error path can reach this function with a non-zero count and no array.
+ */
+void qcow2_free_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i;
+
+    if (s->snapshots) {
+        for(i = 0; i < s->nb_snapshots; i++) {
+            qemu_free(s->snapshots[i].name);
+            qemu_free(s->snapshots[i].id_str);
+        }
+        qemu_free(s->snapshots);
+    }
+    s->snapshots = NULL;
+    s->nb_snapshots = 0;
+}
+
+/*
+ * Load the snapshot table from the image file into s->snapshots and record
+ * its on-disk size in s->snapshots_size.
+ *
+ * Returns 0 on success (including the "no snapshots" case) and -1 when a
+ * read comes back short; in that case the partially built table is freed.
+ */
+int qcow2_read_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshotHeader hdr;
+    QCowSnapshot *snap;
+    int idx, id_len, name_len;
+    int64_t pos;
+    uint32_t extra_len;
+
+    if (s->nb_snapshots == 0) {
+        s->snapshots = NULL;
+        s->snapshots_size = 0;
+        return 0;
+    }
+
+    /* zero-filled so that the fail path can free unread entries safely */
+    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
+    pos = s->snapshots_offset;
+
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        snap = &s->snapshots[idx];
+
+        /* fixed-size header, always starting on an 8 byte boundary */
+        pos = align_offset(pos, 8);
+        if (bdrv_pread(s->hd, pos, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+            goto fail;
+        }
+        pos += sizeof(hdr);
+
+        snap->l1_table_offset = be64_to_cpu(hdr.l1_table_offset);
+        snap->l1_size = be32_to_cpu(hdr.l1_size);
+        snap->vm_state_size = be32_to_cpu(hdr.vm_state_size);
+        snap->date_sec = be32_to_cpu(hdr.date_sec);
+        snap->date_nsec = be32_to_cpu(hdr.date_nsec);
+        snap->vm_clock_nsec = be64_to_cpu(hdr.vm_clock_nsec);
+        extra_len = be32_to_cpu(hdr.extra_data_size);
+        id_len = be16_to_cpu(hdr.id_str_size);
+        name_len = be16_to_cpu(hdr.name_size);
+
+        /* the extension data is not interpreted, only stepped over */
+        pos += extra_len;
+
+        /* id string: stored without terminator, NUL added here */
+        snap->id_str = qemu_malloc(id_len + 1);
+        if (bdrv_pread(s->hd, pos, snap->id_str, id_len) != id_len) {
+            goto fail;
+        }
+        snap->id_str[id_len] = '\0';
+        pos += id_len;
+
+        /* name: same encoding as the id string */
+        snap->name = qemu_malloc(name_len + 1);
+        if (bdrv_pread(s->hd, pos, snap->name, name_len) != name_len) {
+            goto fail;
+        }
+        snap->name[name_len] = '\0';
+        pos += name_len;
+    }
+
+    s->snapshots_size = pos - s->snapshots_offset;
+    return 0;
+
+ fail:
+    qcow2_free_snapshots(bs);
+    return -1;
+}
+
+/*
+ * Serialize the in-memory snapshot table into newly allocated clusters,
+ * update the snapshot offset and count in the image header, then free the
+ * clusters of the previous table.
+ *
+ * Returns 0 on success and -1 as soon as one write comes back short.
+ */
+static int qcow_write_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshotHeader hdr;
+    QCowSnapshot *snap;
+    int idx, id_len, name_len, table_size;
+    uint64_t be_offset;
+    uint32_t be_count;
+    int64_t pos, table_offset;
+
+    /* first pass: number of bytes needed by the whole table */
+    pos = 0;
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        snap = &s->snapshots[idx];
+        pos = align_offset(pos, 8);
+        pos += sizeof(hdr);
+        pos += strlen(snap->id_str);
+        pos += strlen(snap->name);
+    }
+    table_size = pos;
+
+    table_offset = qcow2_alloc_clusters(bs, table_size);
+    pos = table_offset;
+
+    /* second pass: header, id string and name of every entry */
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        snap = &s->snapshots[idx];
+        id_len = strlen(snap->id_str);
+        name_len = strlen(snap->name);
+
+        /* extra_data_size stays 0: no extension data is written */
+        memset(&hdr, 0, sizeof(hdr));
+        hdr.l1_table_offset = cpu_to_be64(snap->l1_table_offset);
+        hdr.l1_size = cpu_to_be32(snap->l1_size);
+        hdr.vm_state_size = cpu_to_be32(snap->vm_state_size);
+        hdr.date_sec = cpu_to_be32(snap->date_sec);
+        hdr.date_nsec = cpu_to_be32(snap->date_nsec);
+        hdr.vm_clock_nsec = cpu_to_be64(snap->vm_clock_nsec);
+        hdr.id_str_size = cpu_to_be16(id_len);
+        hdr.name_size = cpu_to_be16(name_len);
+
+        pos = align_offset(pos, 8);
+        if (bdrv_pwrite(s->hd, pos, &hdr, sizeof(hdr)) != sizeof(hdr)) {
+            return -1;
+        }
+        pos += sizeof(hdr);
+
+        if (bdrv_pwrite(s->hd, pos, snap->id_str, id_len) != id_len) {
+            return -1;
+        }
+        pos += id_len;
+
+        if (bdrv_pwrite(s->hd, pos, snap->name, name_len) != name_len) {
+            return -1;
+        }
+        pos += name_len;
+    }
+
+    /* make the image header point at the new table */
+    be_offset = cpu_to_be64(table_offset);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
+                    &be_offset, sizeof(be_offset)) != sizeof(be_offset)) {
+        return -1;
+    }
+    be_count = cpu_to_be32(s->nb_snapshots);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
+                    &be_count, sizeof(be_count)) != sizeof(be_count)) {
+        return -1;
+    }
+
+    /* the previous table is no longer referenced */
+    qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size);
+    s->snapshots_offset = table_offset;
+    s->snapshots_size = table_size;
+    return 0;
+}
+
+/*
+ * Write into 'id_str' (capacity 'id_str_size') the decimal value one above
+ * the largest id found among the existing snapshots, ids being parsed as
+ * base-10 numbers.  With no snapshots the result is "1".
+ */
+static void find_new_snapshot_id(BlockDriverState *bs,
+                                 char *id_str, int id_str_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx, value, highest;
+
+    highest = 0;
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        value = strtoul(s->snapshots[idx].id_str, NULL, 10);
+        if (highest < value) {
+            highest = value;
+        }
+    }
+    snprintf(id_str, id_str_size, "%d", highest + 1);
+}
+
+/*
+ * Return the index of the snapshot whose id string equals 'id_str',
+ * or -1 when there is none.
+ */
+static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx = 0;
+
+    while (idx < s->nb_snapshots) {
+        if (strcmp(s->snapshots[idx].id_str, id_str) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/*
+ * Look up a snapshot first by id, then by name.  Returns its index in the
+ * snapshot table, or -1 when neither lookup matches.
+ */
+static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx;
+
+    /* an id match takes precedence over a name match */
+    idx = find_snapshot_by_id(bs, name);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        if (strcmp(s->snapshots[idx].name, name) == 0) {
+            return idx;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Create a new internal snapshot described by 'sn_info'.
+ *
+ * If sn_info->id_str is empty, a new id is constructed and stored back
+ * into sn_info.  The current L1 table is copied into newly allocated
+ * clusters and the snapshot is appended to the snapshot table, which is
+ * then rewritten on disk.
+ *
+ * Returns 0 on success, -ENOENT when the id is already used by another
+ * snapshot (error code kept as is for existing callers), and -1 on any
+ * other failure.
+ */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
+    int i, ret;
+    uint64_t *l1_table = NULL;
+
+    /* zeroed so that the fail path can free the string pointers blindly */
+    memset(sn, 0, sizeof(*sn));
+
+    if (sn_info->id_str[0] == '\0') {
+        /* compute a new id */
+        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
+    }
+
+    /* check that the ID is unique */
+    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
+        return -ENOENT;
+
+    sn->id_str = qemu_strdup(sn_info->id_str);
+    if (!sn->id_str)
+        goto fail;
+    sn->name = qemu_strdup(sn_info->name);
+    if (!sn->name)
+        goto fail;
+    sn->vm_state_size = sn_info->vm_state_size;
+    sn->date_sec = sn_info->date_sec;
+    sn->date_nsec = sn_info->date_nsec;
+    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+
+    ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0)
+        goto fail;
+
+    /* create the L1 table of the snapshot */
+    sn->l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
+    sn->l1_size = s->l1_size;
+
+    /* the in-memory L1 table is in host order, the file copy is big-endian */
+    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    for(i = 0; i < s->l1_size; i++) {
+        l1_table[i] = cpu_to_be64(s->l1_table[i]);
+    }
+    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
+                    l1_table, s->l1_size * sizeof(uint64_t)) !=
+        (s->l1_size * sizeof(uint64_t)))
+        goto fail;
+    qemu_free(l1_table);
+    l1_table = NULL;
+
+    /* append the new entry to a one-larger copy of the snapshot table */
+    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    if (s->snapshots) {
+        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
+        qemu_free(s->snapshots);
+    }
+    s->snapshots = snapshots1;
+    s->snapshots[s->nb_snapshots++] = *sn;
+
+    if (qcow_write_snapshots(bs) < 0) {
+        /* The appended entry shares its strings with 'sn', which the fail
+           path frees: remove it again so that the table does not keep
+           dangling pointers. */
+        s->nb_snapshots--;
+        goto fail;
+    }
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+ fail:
+    qemu_free(sn->id_str);
+    qemu_free(sn->name);
+    qemu_free(l1_table);
+    return -1;
+}
+
+/*
+ * Make the snapshot identified by 'snapshot_id' (an id or a name) the
+ * current state of the image by copying its L1 table over the active one.
+ *
+ * Returns 0 on success, -ENOENT if no such snapshot exists and -EIO if any
+ * refcount update, table resize, read or write fails.
+ */
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *target;
+    int pos, entry, l1_bytes;
+
+    pos = find_snapshot_by_id_or_name(bs, snapshot_id);
+    if (pos < 0) {
+        return -ENOENT;
+    }
+    target = s->snapshots + pos;
+
+    /* refcount update with addend -1 for the table about to be replaced */
+    if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+                                       s->l1_size, -1) < 0) {
+        return -EIO;
+    }
+
+    if (qcow2_grow_l1_table(bs, target->l1_size) < 0) {
+        return -EIO;
+    }
+
+    s->l1_size = target->l1_size;
+    l1_bytes = s->l1_size * sizeof(uint64_t);
+
+    /* snapshot L1 table -> memory -> active L1 table on disk; the buffer
+       is still big-endian while it is written back */
+    if (bdrv_pread(s->hd, target->l1_table_offset,
+                   s->l1_table, l1_bytes) != l1_bytes) {
+        return -EIO;
+    }
+    if (bdrv_pwrite(s->hd, s->l1_table_offset,
+                    s->l1_table, l1_bytes) != l1_bytes) {
+        return -EIO;
+    }
+
+    /* only now switch the in-memory copy to host byte order */
+    for (entry = 0; entry < s->l1_size; entry++) {
+        be64_to_cpus(&s->l1_table[entry]);
+    }
+
+    /* refcount update with addend +1 for the table now in use */
+    if (qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+                                       s->l1_size, 1) < 0) {
+        return -EIO;
+    }
+
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+}
+
+/*
+ * Delete the snapshot identified by 'snapshot_id' (an id or a name):
+ * adjust the refcounts reachable from its L1 table, free that table,
+ * remove the entry from the in-memory list and rewrite the list on disk.
+ *
+ * Returns 0 on success, -ENOENT if the snapshot does not exist, or the
+ * negative value reported by the failing helper.
+ */
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *victim;
+    int pos, rc;
+
+    pos = find_snapshot_by_id_or_name(bs, snapshot_id);
+    if (pos < 0) {
+        return -ENOENT;
+    }
+    victim = s->snapshots + pos;
+
+    rc = qcow2_update_snapshot_refcount(bs, victim->l1_table_offset,
+                                        victim->l1_size, -1);
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* must update the copied flag on the current cluster offsets */
+    rc = qcow2_update_snapshot_refcount(bs, s->l1_table_offset,
+                                        s->l1_size, 0);
+    if (rc < 0) {
+        return rc;
+    }
+
+    qcow2_free_clusters(bs, victim->l1_table_offset,
+                        victim->l1_size * sizeof(uint64_t));
+
+    /* release the strings, then close the gap left in the array */
+    qemu_free(victim->id_str);
+    qemu_free(victim->name);
+    memmove(victim, victim + 1,
+            (s->nb_snapshots - pos - 1) * sizeof(*victim));
+    s->nb_snapshots--;
+
+    rc = qcow_write_snapshots(bs);
+    if (rc < 0) {
+        /* XXX: restore snapshot if error ? */
+        return rc;
+    }
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+}
+
+/*
+ * Build a newly allocated array describing every snapshot of the image
+ * and store it in *psn_tab (NULL when there are no snapshots).
+ *
+ * Returns the number of snapshots.
+ */
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
+{
+    BDRVQcowState *s = bs->opaque;
+    QEMUSnapshotInfo *table, *dst;
+    QCowSnapshot *src;
+    int idx;
+
+    if (s->nb_snapshots == 0) {
+        *psn_tab = NULL;
+        return s->nb_snapshots;
+    }
+
+    table = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        src = &s->snapshots[idx];
+        dst = &table[idx];
+
+        /* bounded copies into the fixed-size info fields */
+        pstrcpy(dst->id_str, sizeof(dst->id_str), src->id_str);
+        pstrcpy(dst->name, sizeof(dst->name), src->name);
+
+        dst->vm_state_size = src->vm_state_size;
+        dst->date_sec = src->date_sec;
+        dst->date_nsec = src->date_nsec;
+        dst->vm_clock_nsec = src->vm_clock_nsec;
+    }
+
+    *psn_tab = table;
+    return s->nb_snapshots;
+}
+
diff --git a/block/qcow2.c b/block/qcow2.c
new file mode 100644
index 0000000..9acbddf
--- /dev/null
+++ b/block/qcow2.c
@@ -0,0 +1,1027 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+#include "block/qcow2.h"
+
+/*
+  Differences with QCOW:
+
+  - Support for multiple incremental snapshots.
+  - Memory management by reference counts.
+  - Clusters which have a reference count of one have the bit
+    QCOW_OFLAG_COPIED to optimize write performance.
+  - Size of compressed clusters is stored in sectors to reduce bit usage
+    in the cluster offsets.
+  - Support for storing additional data (such as the VM state) in the
+    snapshots.
+  - If a backing store is used, the cluster size is not constrained
+    (could be backported to QCOW).
+  - L2 tables always have a size of one cluster.
+*/
+
+//#define DEBUG_ALLOC
+//#define DEBUG_ALLOC2
+//#define DEBUG_EXT
+
+
+/*
+ * Fixed-size prefix of a header extension as read by
+ * qcow_read_extensions(): both fields are big-endian on disk and are
+ * followed by 'len' bytes of payload, padded to a multiple of 8 bytes.
+ */
+typedef struct {
+    uint32_t magic; /* extension type, one of the QCOW_EXT_MAGIC_* values */
+    uint32_t len;   /* payload length in bytes, padding not included */
+} QCowExtension;
+/* marks the end of the extension area */
+#define  QCOW_EXT_MAGIC_END 0
+/* payload is the backing file format name (no NUL terminator stored) */
+#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+
+
+
+/*
+ * Format probe: returns 100 when 'buf' is large enough to hold a
+ * QCowHeader and carries the qcow2 magic and version, 0 otherwise.
+ * 'filename' is not used.
+ */
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const QCowHeader *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(QCowHeader)) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->magic) != QCOW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->version) != QCOW_VERSION) {
+        return 0;
+    }
+    return 100;
+}
+
+
+/*
+ * Read the qcow2 header extensions and fill 'bs' from them.
+ *
+ * Reading starts at start_offset and stops at an extension whose magic is
+ * QCOW_EXT_MAGIC_END (0) or when end_offset is reached.  Extensions with
+ * an unknown magic are skipped, since a newer version of the format may
+ * define extensions this code knows nothing about.  The only extension
+ * interpreted here is the backing file format name, which is copied into
+ * bs->backing_format and NUL-terminated.
+ *
+ * Returns 0 on success and a non-zero value otherwise:
+ *   1 - an extension header could not be read,
+ *   2 - the backing format name does not fit into bs->backing_format,
+ *   3 - the backing format name could not be read.
+ */
+static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
+                                uint64_t end_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowExtension ext;
+    uint64_t offset;
+
+#ifdef DEBUG_EXT
+    /* uint64_t is not 'long' on every host: print through unsigned long long */
+    printf("qcow_read_extensions: start=%llu end=%llu\n",
+           (unsigned long long)start_offset, (unsigned long long)end_offset);
+#endif
+    offset = start_offset;
+    while (offset < end_offset) {
+
+#ifdef DEBUG_EXT
+        /* Sanity check */
+        if (offset > s->cluster_size)
+            printf("qcow_handle_extension: suspicious offset %llu\n",
+                   (unsigned long long)offset);
+
+        printf("attempting to read extended header in offset %llu\n",
+               (unsigned long long)offset);
+#endif
+
+        if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) {
+            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
+                    (unsigned long long)offset);
+            return 1;
+        }
+        be32_to_cpus(&ext.magic);
+        be32_to_cpus(&ext.len);
+        offset += sizeof(ext);
+#ifdef DEBUG_EXT
+        printf("ext.magic = 0x%x\n", ext.magic);
+#endif
+        switch (ext.magic) {
+        case QCOW_EXT_MAGIC_END:
+            return 0;
+
+        case QCOW_EXT_MAGIC_BACKING_FORMAT:
+            /* '>=' keeps one byte free for the terminator added below */
+            if (ext.len >= sizeof(bs->backing_format)) {
+                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
+                        " (>=%zu)\n",
+                        ext.len, sizeof(bs->backing_format));
+                return 2;
+            }
+            if (bdrv_pread(s->hd, offset , bs->backing_format,
+                           ext.len) != ext.len)
+                return 3;
+            bs->backing_format[ext.len] = '\0';
+#ifdef DEBUG_EXT
+            printf("Qcow2: Got format extension %s\n", bs->backing_format);
+#endif
+            /* the payload is padded to a multiple of 8 bytes */
+            offset += ((ext.len + 7) & ~7);
+            break;
+
+        default:
+            /* unknown magic -- just skip it */
+            offset += ((ext.len + 7) & ~7);
+            break;
+        }
+    }
+
+    return 0;
+}
+
+
+/*
+ * Open a qcow2 image: open the underlying file, read and validate the
+ * header, derive the geometry fields from cluster_bits, load the L1 table,
+ * allocate the caches, initialise the refcount code, then read the header
+ * extensions, the backing file name and the snapshot table.
+ *
+ * Returns 0 on success, the negative value from bdrv_file_open() if the
+ * file cannot be opened, and -1 on any later failure (after releasing what
+ * was set up and deleting the underlying file state).
+ */
+static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVQcowState *s = bs->opaque;
+    int len, i, shift, ret;
+    QCowHeader header;
+    uint64_t ext_end;
+
+    /* Performance is terrible right now with cache=writethrough due mainly
+     * to reference count updates.  If the user does not explicitly specify
+     * a caching type, force to writeback caching.
+     */
+    if ((flags & BDRV_O_CACHE_DEF)) {
+        flags |= BDRV_O_CACHE_WB;
+        flags &= ~BDRV_O_CACHE_DEF;
+    }
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
+        goto fail;
+    /* the header is big-endian on disk: convert it in place */
+    be32_to_cpus(&header.magic);
+    be32_to_cpus(&header.version);
+    be64_to_cpus(&header.backing_file_offset);
+    be32_to_cpus(&header.backing_file_size);
+    be64_to_cpus(&header.size);
+    be32_to_cpus(&header.cluster_bits);
+    be32_to_cpus(&header.crypt_method);
+    be64_to_cpus(&header.l1_table_offset);
+    be32_to_cpus(&header.l1_size);
+    be64_to_cpus(&header.refcount_table_offset);
+    be32_to_cpus(&header.refcount_table_clusters);
+    be64_to_cpus(&header.snapshots_offset);
+    be32_to_cpus(&header.nb_snapshots);
+
+    /* reject anything that is not a supported, sane qcow2 header */
+    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
+        goto fail;
+    if (header.size <= 1 ||
+        header.cluster_bits < MIN_CLUSTER_BITS ||
+        header.cluster_bits > MAX_CLUSTER_BITS)
+        goto fail;
+    if (header.crypt_method > QCOW_CRYPT_AES)
+        goto fail;
+    s->crypt_method_header = header.crypt_method;
+    if (s->crypt_method_header)
+        bs->encrypted = 1;
+    /* everything below is derived from cluster_bits; sectors are 512 bytes */
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
+    s->l2_size = 1 << s->l2_bits;
+    bs->total_sectors = header.size / 512;
+    s->csize_shift = (62 - (s->cluster_bits - 8));
+    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
+    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+    s->refcount_table_offset = header.refcount_table_offset;
+    /* number of 8-byte entries held by the refcount table clusters */
+    s->refcount_table_size =
+        header.refcount_table_clusters << (s->cluster_bits - 3);
+
+    /* the snapshot table itself is only loaded by qcow2_read_snapshots()
+       further down */
+    s->snapshots_offset = header.snapshots_offset;
+    s->nb_snapshots = header.nb_snapshots;
+
+    /* read the level 1 table */
+    s->l1_size = header.l1_size;
+    shift = s->cluster_bits + s->l2_bits;
+    /* header.size rounded up to whole L1 entries */
+    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
+    /* the L1 table must contain at least enough entries to put
+       header.size bytes */
+    if (s->l1_size < s->l1_vm_state_index)
+        goto fail;
+    s->l1_table_offset = header.l1_table_offset;
+    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
+        s->l1_size * sizeof(uint64_t))
+        goto fail;
+    for(i = 0;i < s->l1_size; i++) {
+        be64_to_cpus(&s->l1_table[i]);
+    }
+    /* alloc L2 cache */
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    s->cluster_cache = qemu_malloc(s->cluster_size);
+    /* one more sector for decompressed data alignment */
+    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+                                  + 512);
+    /* -1: no cluster is cached yet */
+    s->cluster_cache_offset = -1;
+
+    if (qcow2_refcount_init(bs) < 0)
+        goto fail;
+
+    /* read qcow2 extensions */
+    /* they sit between the header and the backing file name, or end at
+       the first cluster boundary when there is no backing file */
+    if (header.backing_file_offset)
+        ext_end = header.backing_file_offset;
+    else
+        ext_end = s->cluster_size;
+    if (qcow_read_extensions(bs, sizeof(header), ext_end))
+        goto fail;
+
+    /* read the backing file name */
+    if (header.backing_file_offset != 0) {
+        len = header.backing_file_size;
+        /* NOTE(review): 1023 presumably is sizeof(bs->backing_file) - 1;
+           confirm against the BlockDriverState declaration */
+        if (len > 1023)
+            len = 1023;
+        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
+            goto fail;
+        bs->backing_file[len] = '\0';
+    }
+    if (qcow2_read_snapshots(bs) < 0)
+        goto fail;
+
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+
+ fail:
+    /* NOTE(review): this path frees fields that may not have been assigned
+       yet; presumably *s starts out zero-filled -- confirm with the code
+       that allocates bs->opaque */
+    qcow2_free_snapshots(bs);
+    qcow2_refcount_close(bs);
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    qemu_free(s->cluster_cache);
+    qemu_free(s->cluster_data);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Install the encryption key for the image.  At most the first 16 bytes
+ * of 'key' are used, zero-padded to 128 bits, and both the AES encryption
+ * and decryption schedules are derived from them.  The crypt method
+ * recorded in the image header becomes the active one.
+ *
+ * Returns 0 on success and -1 if either key schedule cannot be set up.
+ */
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint8_t keybuf[16];
+    int len, i;
+
+    len = strlen(key);
+    if (len > 16) {
+        len = 16;
+    }
+
+    /* XXX: we could compress the chars to 7 bits to increase
+       entropy */
+    memset(keybuf, 0, 16);
+    for (i = 0; i < len; i++) {
+        keybuf[i] = key[i];
+    }
+
+    s->crypt_method = s->crypt_method_header;
+
+    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) {
+        return -1;
+    }
+    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) {
+        return -1;
+    }
+#if 0
+    /* test */
+    {
+        uint8_t in[16];
+        uint8_t out[16];
+        uint8_t tmp[16];
+        for(i=0;i<16;i++)
+            in[i] = i;
+        AES_encrypt(in, tmp, &s->aes_encrypt_key);
+        AES_decrypt(tmp, out, &s->aes_decrypt_key);
+        for(i = 0; i < 16; i++)
+            printf(" %02x", tmp[i]);
+        printf("\n");
+        for(i = 0; i < 16; i++)
+            printf(" %02x", out[i]);
+        printf("\n");
+    }
+#endif
+    return 0;
+}
+
+/*
+ * Report whether the cluster lookup for 'sector_num' yields a non-zero
+ * offset.  *pnum is preset to nb_sectors and then handed to
+ * qcow2_get_cluster_offset(), which may update it.
+ *
+ * Returns 1 when the offset is non-zero, 0 otherwise.
+ */
+static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    uint64_t offset;
+
+    *pnum = nb_sectors;
+    /* sectors are 512 bytes: << 9 turns the sector number into a byte offset */
+    offset = qcow2_get_cluster_offset(bs, sector_num << 9, pnum);
+    if (offset == 0) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Handle a read that reaches past the end of the backing file 'bs'.
+ *
+ * When the whole request lies inside the file nothing is touched and
+ * nb_sectors is returned.  Otherwise the part of 'buf' that corresponds to
+ * sectors beyond the end is zero-filled and the number of leading sectors
+ * that still have to be read from the file (possibly 0) is returned.
+ */
+int qcow2_backing_read1(BlockDriverState *bs,
+                  int64_t sector_num, uint8_t *buf, int nb_sectors)
+{
+    int in_file;
+
+    if (sector_num + nb_sectors <= bs->total_sectors) {
+        return nb_sectors;
+    }
+
+    in_file = (sector_num < bs->total_sectors)
+                  ? bs->total_sectors - sector_num
+                  : 0;
+
+    /* zero everything after the last sector the file can provide */
+    memset(buf + in_file * 512, 0, 512 * (nb_sectors - in_file));
+    return in_file;
+}
+
+/*
+ * State of one asynchronous qcow2 read or write request.  A request is
+ * processed as a chain of sub-requests, each covering 'n' sectors; the
+ * completion callbacks advance sector_num/buf/nb_sectors after each one.
+ */
+typedef struct QCowAIOCB {
+    BlockDriverAIOCB common;
+    int64_t sector_num;       /* first sector of the current sub-request */
+    QEMUIOVector *qiov;       /* I/O vector supplied by the caller */
+    uint8_t *buf;             /* current position in the linear data buffer */
+    void *orig_buf;           /* bounce buffer, set up only when qiov has
+                                 more than one element */
+    int nb_sectors;           /* sectors still to be processed */
+    int n;                    /* sectors covered by the current sub-request */
+    uint64_t cluster_offset;  /* cluster offset of the current sub-request */
+    uint8_t *cluster_data;    /* buffer for encrypted data (write path) */
+    BlockDriverAIOCB *hd_aiocb; /* request in flight on the lower layer,
+                                   NULL when there is none */
+    struct iovec hd_iov;      /* single element backing hd_qiov */
+    QEMUIOVector hd_qiov;
+    QEMUBH *bh;               /* bottom half used to continue a read when
+                                 no lower-layer I/O was issued */
+    QCowL2Meta l2meta;
+} QCowAIOCB;
+
+/*
+ * Cancel callback of the qcow2 AIO pool: cancel the request currently in
+ * flight on the lower layer, if there is one, then release the control
+ * block.
+ */
+static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QCowAIOCB *req = (QCowAIOCB *)blockacb;
+
+    if (req->hd_aiocb != NULL) {
+        bdrv_aio_cancel(req->hd_aiocb);
+    }
+    qemu_aio_release(req);
+}
+
+/* Pool from which qcow_aio_setup() obtains its QCowAIOCB control blocks. */
+static AIOPool qcow_aio_pool = {
+    .aiocb_size         = sizeof(QCowAIOCB),
+    .cancel             = qcow_aio_cancel,
+};
+
+static void qcow_aio_read_cb(void *opaque, int ret);
+
+/*
+ * Bottom half scheduled by the read path when a sub-request was satisfied
+ * without lower-layer I/O: delete the bottom half, clear the reference to
+ * it and continue the request as if an I/O had completed successfully.
+ */
+static void qcow_aio_read_bh(void *opaque)
+{
+    QCowAIOCB *req = opaque;
+
+    qemu_bh_delete(req->bh);
+    req->bh = NULL;
+
+    qcow_aio_read_cb(opaque, 0);
+}
+
+/*
+ * Create a bottom half running 'cb' with 'acb' as argument and schedule
+ * it.  Returns 0 on success and -EIO if the request already has a bottom
+ * half attached or a new one cannot be created.
+ */
+static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
+{
+    if (acb->bh != NULL) {
+        /* one is already pending for this request */
+        return -EIO;
+    }
+
+    acb->bh = qemu_bh_new(cb, acb);
+    if (acb->bh == NULL) {
+        return -EIO;
+    }
+
+    qemu_bh_schedule(acb->bh);
+    return 0;
+}
+
+/*
+ * Completion callback driving an asynchronous read.
+ *
+ * Each invocation first finishes the sub-request that just completed
+ * (decrypting the data when needed and advancing the request by acb->n
+ * sectors), then either completes the whole request or starts the next
+ * sub-request: from the backing file, as zeroes, from a compressed
+ * cluster, or from the image file.  Sub-requests that need no lower-layer
+ * I/O continue through a bottom half.
+ *
+ * 'ret' is the status of the completed sub-request; a negative value
+ * aborts the request.  Every failure detected here is reported to the
+ * caller's callback as a negative value: paths that formerly jumped to
+ * 'done' with 'ret' still non-negative now set -EIO first.
+ */
+static void qcow_aio_read_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster, n1;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0)
+        goto done;
+
+    /* post process the read buffer */
+    if (!acb->cluster_offset) {
+        /* nothing to do */
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* nothing to do */
+    } else {
+        if (s->crypt_method) {
+            qcow2_encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
+                            acb->n, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+
+    /* account for the sub-request that has just been completed */
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* prepare next AIO request */
+    acb->n = acb->nb_sectors;
+    acb->cluster_offset =
+        qcow2_get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+
+    if (!acb->cluster_offset) {
+        if (bs->backing_hd) {
+            /* read from the base image */
+            n1 = qcow2_backing_read1(bs->backing_hd, acb->sector_num,
+                               acb->buf, acb->n);
+            if (n1 > 0) {
+                acb->hd_iov.iov_base = (void *)acb->buf;
+                acb->hd_iov.iov_len = acb->n * 512;
+                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_read_cb, acb);
+                if (acb->hd_aiocb == NULL) {
+                    /* the read could not be submitted */
+                    ret = -EIO;
+                    goto done;
+                }
+            } else {
+                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+                if (ret < 0)
+                    goto done;
+            }
+        } else {
+            /* Note: in this case, no need to wait */
+            memset(acb->buf, 0, 512 * acb->n);
+            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+            if (ret < 0)
+                goto done;
+        }
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* add AIO support for compressed blocks ? */
+        if (qcow2_decompress_cluster(s, acb->cluster_offset) < 0) {
+            ret = -EIO;
+            goto done;
+        }
+        memcpy(acb->buf,
+               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
+        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+        if (ret < 0)
+            goto done;
+    } else {
+        /* a plain cluster offset must be sector aligned */
+        if ((acb->cluster_offset & 511) != 0) {
+            ret = -EIO;
+            goto done;
+        }
+
+        acb->hd_iov.iov_base = (void *)acb->buf;
+        acb->hd_iov.iov_len = acb->n * 512;
+        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        acb->hd_aiocb = bdrv_aio_readv(s->hd,
+                            (acb->cluster_offset >> 9) + index_in_cluster,
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+        if (acb->hd_aiocb == NULL) {
+            /* the read could not be submitted */
+            ret = -EIO;
+            goto done;
+        }
+    }
+
+    return;
+done:
+    /* with a multi-element vector the data was read into a bounce buffer:
+       copy it back to the caller's vector and free the buffer */
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Obtain a control block from the qcow2 AIO pool and initialise it for a
+ * request of 'nb_sectors' sectors starting at 'sector_num'.
+ *
+ * A vector with more than one element is replaced by a single linear
+ * bounce buffer; for writes ('is_write' non-zero) the caller's data is
+ * copied into it right away.  A single-element vector is used in place.
+ *
+ * Returns the control block, or NULL if none could be obtained.
+ */
+static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+{
+    QCowAIOCB *req = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque);
+
+    if (req == NULL) {
+        return NULL;
+    }
+
+    req->hd_aiocb = NULL;
+    req->sector_num = sector_num;
+    req->qiov = qiov;
+
+    if (qiov->niov <= 1) {
+        /* the caller's only buffer is already linear */
+        req->buf = (uint8_t *)qiov->iov->iov_base;
+    } else {
+        req->buf = req->orig_buf = qemu_blockalign(bs, qiov->size);
+        if (is_write) {
+            qemu_iovec_to_buffer(qiov, req->buf);
+        }
+    }
+
+    req->nb_sectors = nb_sectors;
+    /* nothing has been transferred yet */
+    req->n = 0;
+    req->cluster_offset = 0;
+    req->l2meta.nb_clusters = 0;
+    return req;
+}
+
+/*
+ * Start an asynchronous vectored read.  The request is set up and then
+ * driven by calling the read completion callback once with status 0.
+ *
+ * Returns the control block of the request, or NULL if it could not be
+ * set up.
+ */
+static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    QCowAIOCB *req;
+
+    req = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+    if (req == NULL) {
+        return NULL;
+    }
+
+    /* kick off the first sub-request */
+    qcow_aio_read_cb(req, 0);
+    return &req->common;
+}
+
+/*
+ * Completion callback driving an asynchronous write.
+ *
+ * Each invocation first finishes the sub-request that just completed
+ * (linking the written clusters into the L2 table and advancing the
+ * request by acb->n sectors), then either completes the whole request or
+ * allocates clusters for the next sub-request, encrypts the data when an
+ * encryption method is active, and submits the write to the image file.
+ *
+ * 'ret' is the status of the completed sub-request; a negative value
+ * aborts the request.  Every failure detected here is reported to the
+ * caller's callback as a negative value: paths that formerly jumped to
+ * 'done' with 'ret' still non-negative now set -EIO first.
+ */
+static void qcow_aio_write_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+    const uint8_t *src_buf;
+    int n_end;
+
+    acb->hd_aiocb = NULL;
+
+    if (ret < 0)
+        goto done;
+
+    if (qcow2_alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
+        /* the clusters cannot be linked: give them back and fail */
+        qcow2_free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
+        ret = -EIO;
+        goto done;
+    }
+
+    /* account for the sub-request that has just been completed */
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    n_end = index_in_cluster + acb->nb_sectors;
+    /* encrypted data goes through a buffer of QCOW_MAX_CRYPT_CLUSTERS
+       clusters, which bounds the size of one sub-request */
+    if (s->crypt_method &&
+        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+
+    acb->cluster_offset = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9,
+                                          index_in_cluster,
+                                          n_end, &acb->n, &acb->l2meta);
+    /* the offset must be non-zero and sector aligned */
+    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
+        ret = -EIO;
+        goto done;
+    }
+    if (s->crypt_method) {
+        if (!acb->cluster_data) {
+            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
+                                             s->cluster_size);
+        }
+        qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
+                        acb->n, 1, &s->aes_encrypt_key);
+        src_buf = acb->cluster_data;
+    } else {
+        src_buf = acb->buf;
+    }
+    acb->hd_iov.iov_base = (void *)src_buf;
+    acb->hd_iov.iov_len = acb->n * 512;
+    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+    acb->hd_aiocb = bdrv_aio_writev(s->hd,
+                                    (acb->cluster_offset >> 9) + index_in_cluster,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_write_cb, acb);
+    if (acb->hd_aiocb == NULL) {
+        /* the write could not be submitted */
+        ret = -EIO;
+        goto done;
+    }
+
+    return;
+
+done:
+    /* free the bounce buffer used for multi-element vectors */
+    if (acb->qiov->niov > 1)
+        qemu_vfree(acb->orig_buf);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous vectored write of nb_sectors sectors at sector_num.
+ *
+ * Sets up a QCowAIOCB for the request and runs the first step of the write
+ * through qcow_aio_write_cb(). Returns the request handle, or NULL when the
+ * request could not be set up.
+ */
+static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *state = bs->opaque;
+    QCowAIOCB *request;
+
+    /* disable compressed cache */
+    state->cluster_cache_offset = -1;
+
+    request = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+    if (request == NULL) {
+        return NULL;
+    }
+
+    /* ret == 0: nothing has failed yet, just start processing */
+    qcow_aio_write_cb(request, 0);
+
+    return &request->common;
+}
+
+/*
+ * Close a qcow2 image: free the buffers referenced by the driver state,
+ * shut down the refcount code and delete the underlying block device.
+ */
+static void qcow_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    /* lookup tables */
+    qemu_free(state->l1_table);
+    qemu_free(state->l2_cache);
+
+    /* cluster buffers */
+    qemu_free(state->cluster_cache);
+    qemu_free(state->cluster_data);
+
+    qcow2_refcount_close(bs);
+
+    bdrv_delete(state->hd);
+}
+
+/*
+ * Return log2(size) when size is a power of two, and -1 otherwise
+ * (including for size == 0).
+ */
+static int get_bits_from_size(size_t size)
+{
+    int bits;
+
+    if (size == 0) {
+        return -1;
+    }
+
+    /* Drop trailing zero bits, counting how many were removed. */
+    for (bits = 0; (size & 1) == 0; bits++) {
+        size >>= 1;
+    }
+
+    /* A power of two has exactly one bit set, which is now bit 0. */
+    return (size == 1) ? bits : -1;
+}
+
+/*
+ * Write all of buf[0..len) to fd, retrying after short writes and EINTR.
+ * Returns 0 on success and -1 if the data could not be written completely.
+ */
+static int qcow_create_write_all(int fd, const void *buf, size_t len)
+{
+    const uint8_t *p = buf;
+
+    while (len > 0) {
+        ssize_t n = write(fd, p, len);
+
+        if (n < 0 && errno == EINTR)
+            continue;
+        if (n <= 0)
+            return -1;
+        p += n;
+        len -= n;
+    }
+    return 0;
+}
+
+/*
+ * Create a new, empty qcow2 image.
+ *
+ * filename:       path of the image file; created or truncated.
+ * total_size:     virtual disk size in 512-byte sectors.
+ * backing_file:   optional backing file name stored in the image (or NULL).
+ * backing_format: optional backing format name, stored as a header
+ *                 extension; only used when backing_file is given.
+ * flags:          BLOCK_FLAG_ENCRYPT selects AES encryption.
+ * cluster_size:   cluster size in bytes; must be a power of two between
+ *                 1 << MIN_CLUSTER_BITS and 1 << MAX_CLUSTER_BITS.
+ *
+ * The file receives the header (with optional extension and backing file
+ * name), a zeroed L1 table, a one-cluster refcount table and the refcount
+ * blocks covering those metadata clusters.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened, -EINVAL for an
+ * unsupported cluster size / image size combination and -EIO if writing
+ * the image fails.
+ */
+static int qcow_create2(const char *filename, int64_t total_size,
+                        const char *backing_file, const char *backing_format,
+                        int flags, size_t cluster_size)
+{
+
+    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
+    int ref_clusters, backing_format_len = 0;
+    int ret = -EIO;
+    QCowHeader header;
+    uint64_t tmp, offset;
+    QCowCreateState s1, *s = &s1;
+    QCowExtension ext_bf = {0, 0};
+
+
+    memset(s, 0, sizeof(*s));
+
+    /* Cluster size: validated first so that a bad value neither truncates
+     * an existing file nor leaks a file descriptor */
+    s->cluster_bits = get_bits_from_size(cluster_size);
+    if (s->cluster_bits < MIN_CLUSTER_BITS ||
+        s->cluster_bits > MAX_CLUSTER_BITS)
+    {
+        fprintf(stderr, "Cluster size must be a power of two between "
+            "%d and %dk\n",
+            1 << MIN_CLUSTER_BITS,
+            1 << (MAX_CLUSTER_BITS - 10));
+        return -EINVAL;
+    }
+    s->cluster_size = 1 << s->cluster_bits;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        if (backing_format) {
+            /* the format name is padded to a multiple of 8 bytes */
+            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
+            backing_format_len = strlen(backing_format);
+            ext_bf.len = (backing_format_len + 7) & ~7;
+            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
+        }
+        header.backing_file_offset = cpu_to_be64(header_size);
+        backing_filename_len = strlen(backing_file);
+        header.backing_file_size = cpu_to_be32(backing_filename_len);
+        header_size += backing_filename_len;
+    }
+
+    header.cluster_bits = cpu_to_be32(s->cluster_bits);
+    header_size = (header_size + 7) & ~7;
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+    /* an L2 table is one cluster of 8-byte entries */
+    l2_bits = s->cluster_bits - 3;
+    shift = s->cluster_bits + l2_bits;
+    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
+    offset = align_offset(header_size, s->cluster_size);
+    s->l1_table_offset = offset;
+    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
+    header.l1_size = cpu_to_be32(l1_size);
+    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
+
+    s->refcount_table_offset = offset;
+    header.refcount_table_offset = cpu_to_be64(offset);
+    header.refcount_table_clusters = cpu_to_be32(1);
+    offset += s->cluster_size;
+    s->refcount_block_offset = offset;
+
+    /* count how many refcount blocks needed */
+    tmp = offset >> s->cluster_bits;
+    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
+
+    /* the refcount table is a single cluster of 8-byte entries; refuse
+     * images whose metadata would overflow it */
+    if ((size_t)ref_clusters > s->cluster_size / sizeof(uint64_t)) {
+        fprintf(stderr, "Image is too large for a cluster size of %d bytes\n",
+                s->cluster_size);
+        close(fd);
+        return -EINVAL;
+    }
+
+    s->refcount_table = qemu_mallocz(s->cluster_size);
+    for (i=0; i < ref_clusters; i++) {
+        s->refcount_table[i] = cpu_to_be64(offset);
+        offset += s->cluster_size;
+    }
+
+    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);
+
+    /* update refcounts */
+    qcow2_create_refcount_update(s, 0, header_size);
+    qcow2_create_refcount_update(s, s->l1_table_offset,
+        l1_size * sizeof(uint64_t));
+    qcow2_create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
+    qcow2_create_refcount_update(s, s->refcount_block_offset,
+        ref_clusters * s->cluster_size);
+
+    /* write all the data */
+    if (qcow_create_write_all(fd, &header, sizeof(header)) < 0)
+        goto out;
+    if (backing_file) {
+        if (backing_format_len) {
+            char zero[16];
+            /* padding bytes; computed before ext_bf.len is byte-swapped */
+            int d = ext_bf.len - backing_format_len;
+
+            memset(zero, 0, sizeof(zero));
+            cpu_to_be32s(&ext_bf.magic);
+            cpu_to_be32s(&ext_bf.len);
+            if (qcow_create_write_all(fd, &ext_bf, sizeof(ext_bf)) < 0)
+                goto out;
+            if (qcow_create_write_all(fd, backing_format,
+                                      backing_format_len) < 0)
+                goto out;
+            if (d>0 && qcow_create_write_all(fd, zero, d) < 0)
+                goto out;
+        }
+        if (qcow_create_write_all(fd, backing_file, backing_filename_len) < 0)
+            goto out;
+    }
+    if (lseek(fd, s->l1_table_offset, SEEK_SET) < 0)
+        goto out;
+    tmp = 0;
+    for(i = 0;i < l1_size; i++) {
+        if (qcow_create_write_all(fd, &tmp, sizeof(tmp)) < 0)
+            goto out;
+    }
+    if (lseek(fd, s->refcount_table_offset, SEEK_SET) < 0)
+        goto out;
+    if (qcow_create_write_all(fd, s->refcount_table, s->cluster_size) < 0)
+        goto out;
+
+    if (lseek(fd, s->refcount_block_offset, SEEK_SET) < 0)
+        goto out;
+    if (qcow_create_write_all(fd, s->refcount_block,
+                              ref_clusters * s->cluster_size) < 0)
+        goto out;
+
+    ret = 0;
+
+out:
+    qemu_free(s->refcount_table);
+    qemu_free(s->refcount_block);
+    /* close() can report deferred write errors */
+    if (close(fd) != 0 && ret == 0)
+        ret = -EIO;
+    return ret;
+}
+
+/*
+ * bdrv_create entry point: translate the generic creation options into
+ * the arguments of qcow_create2().
+ *
+ * Unset options keep their defaults: size 0, no backing file or format,
+ * no encryption and a 64 KiB cluster size (a cluster size option of 0 is
+ * ignored).
+ */
+static int qcow_create(const char *filename, QEMUOptionParameter *options)
+{
+    const char *base_file = NULL;
+    const char *base_fmt = NULL;
+    uint64_t nb_sectors = 0;
+    int create_flags = 0;
+    size_t cluster_size = 65536;
+    QEMUOptionParameter *opt;
+
+    /* Read out options; the list ends at the entry without a name */
+    for (opt = options; opt && opt->name; opt++) {
+        const char *name = opt->name;
+
+        if (strcmp(name, BLOCK_OPT_SIZE) == 0) {
+            /* option is in bytes, qcow_create2 expects sectors */
+            nb_sectors = opt->value.n / 512;
+        } else if (strcmp(name, BLOCK_OPT_BACKING_FILE) == 0) {
+            base_file = opt->value.s;
+        } else if (strcmp(name, BLOCK_OPT_BACKING_FMT) == 0) {
+            base_fmt = opt->value.s;
+        } else if (strcmp(name, BLOCK_OPT_ENCRYPT) == 0) {
+            if (opt->value.n) {
+                create_flags |= BLOCK_FLAG_ENCRYPT;
+            }
+        } else if (strcmp(name, BLOCK_OPT_CLUSTER_SIZE) == 0) {
+            if (opt->value.n) {
+                cluster_size = opt->value.n;
+            }
+        }
+    }
+
+    return qcow_create2(filename, nb_sectors, base_file, base_fmt,
+                        create_flags, cluster_size);
+}
+
+/*
+ * bdrv_make_empty hook for qcow2.
+ *
+ * The implementation below is compiled out (it is marked as not correct),
+ * so this function currently changes nothing and always returns 0.
+ */
+static int qcow_make_empty(BlockDriverState *bs)
+{
+#if 0
+    /* XXX: not correct */
+    BDRVQcowState *s = bs->opaque;
+    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+    int ret;
+
+    memset(s->l1_table, 0, l1_length);
+    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
+        return -1;
+    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
+    if (ret < 0)
+        return ret;
+
+    l2_cache_reset(bs);
+#endif
+    return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Write one cluster of data in compressed form.
+ *
+ * nb_sectors must be exactly one cluster (s->cluster_sectors), otherwise
+ * -EINVAL is returned. As a special case nb_sectors == 0 pads the
+ * underlying file to a 512-byte boundary and returns 0.
+ *
+ * The data is deflated as a raw stream (no zlib header). When it does not
+ * shrink below one cluster it is written uncompressed through bdrv_write()
+ * instead.
+ *
+ * Returns 0 on success and -1 on compression, allocation or I/O failure.
+ */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    z_stream strm;
+    int ret, out_len;
+    int result = -1;
+    uint8_t *out_buf;
+    uint64_t cluster_offset;
+
+    if (nb_sectors == 0) {
+        /* align end of file to a sector boundary to ease reading with
+           sector based I/Os */
+        cluster_offset = bdrv_getlength(s->hd);
+        cluster_offset = (cluster_offset + 511) & ~511;
+        bdrv_truncate(s->hd, cluster_offset);
+        return 0;
+    }
+
+    if (nb_sectors != s->cluster_sectors)
+        return -EINVAL;
+
+    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+    /* default compression level, small window, no zlib header */
+    memset(&strm, 0, sizeof(strm));
+    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                       Z_DEFLATED, -12,
+                       9, Z_DEFAULT_STRATEGY);
+    if (ret != 0)
+        goto out;
+
+    strm.avail_in = s->cluster_size;
+    strm.next_in = (uint8_t *)buf;
+    /* output is capped at one cluster: a larger result is not worth storing */
+    strm.avail_out = s->cluster_size;
+    strm.next_out = out_buf;
+
+    ret = deflate(&strm, Z_FINISH);
+    if (ret != Z_STREAM_END && ret != Z_OK) {
+        deflateEnd(&strm);
+        goto out;
+    }
+    out_len = strm.next_out - out_buf;
+
+    deflateEnd(&strm);
+
+    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+        /* could not compress: write normal cluster */
+        if (bdrv_write(bs, sector_num, buf, s->cluster_sectors) < 0)
+            goto out;
+    } else {
+        cluster_offset = qcow2_alloc_compressed_cluster_offset(bs,
+            sector_num << 9, out_len);
+        if (!cluster_offset)
+            goto out;
+        cluster_offset &= s->cluster_offset_mask;
+        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len)
+            goto out;
+    }
+
+    result = 0;
+
+out:
+    /* single exit so that out_buf is released on every path */
+    qemu_free(out_buf);
+    return result;
+}
+
+/* Flush the image by flushing the underlying block device. */
+static void qcow_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+    BlockDriverState *backend = state->hd;
+
+    bdrv_flush(backend);
+}
+
+/*
+ * Fill bdi with the image's cluster size and the offset of the VM state
+ * area, which is l1_vm_state_index shifted left by
+ * (cluster_bits + l2_bits). Always returns 0.
+ */
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVQcowState *state = bs->opaque;
+    int vm_state_shift = state->cluster_bits + state->l2_bits;
+
+    bdi->cluster_size = state->cluster_size;
+    /* widen before shifting so the offset is computed in 64 bits */
+    bdi->vm_state_offset = (int64_t)state->l1_vm_state_index << vm_state_shift;
+
+    return 0;
+}
+
+
+/* bdrv_check hook: returns the result of the refcount consistency check. */
+static int qcow_check(BlockDriverState *bs)
+{
+    int result = qcow2_check_refcounts(bs);
+
+    return result;
+}
+
+#if 0
+/*
+ * Debug helper (compiled out): print the refcounts of all clusters of the
+ * underlying file, one line per run of consecutive clusters that share the
+ * same refcount.
+ */
+static void dump_refcounts(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t nb_clusters, k, k1, size;
+    int refcount;
+
+    size = bdrv_getlength(s->hd);
+    nb_clusters = size_to_clusters(s, size);
+    for(k = 0; k < nb_clusters;) {
+        /* k1 is the first cluster of the run, k ends one past its last */
+        k1 = k;
+        refcount = get_refcount(bs, k);
+        k++;
+        while (k < nb_clusters && get_refcount(bs, k) == refcount)
+            k++;
+        /* NOTE(review): prints k (end of the run), not k1 (its start);
+         * possibly unintended - confirm before enabling this code */
+        printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1);
+    }
+}
+#endif
+
+/*
+ * bdrv_put_buffer hook: write size bytes from buf at byte position pos of
+ * the image.
+ *
+ * bs->growable is forced on for the duration of the write and restored
+ * afterwards.
+ *
+ * Returns the result of bdrv_pwrite(), so that a failed write is reported
+ * to the caller like it is in qcow_get_buffer(), instead of always
+ * claiming that size bytes were written.
+ */
+static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf,
+                           int64_t pos, int size)
+{
+    int growable = bs->growable;
+    int ret;
+
+    bs->growable = 1;
+    ret = bdrv_pwrite(bs, pos, buf, size);
+    bs->growable = growable;
+
+    return ret;
+}
+
+/*
+ * bdrv_get_buffer hook: read size bytes at byte position pos of the image
+ * into buf, with bs->growable forced on for the duration of the read.
+ * Returns the result of bdrv_pread().
+ */
+static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf,
+                           int64_t pos, int size)
+{
+    const int saved_growable = bs->growable;
+    int nread;
+
+    bs->growable = 1;
+    nread = bdrv_pread(bs, pos, buf, size);
+    /* put the flag back the way the caller had it */
+    bs->growable = saved_growable;
+
+    return nread;
+}
+
+/*
+ * Creation options of the qcow2 driver, parsed by qcow_create().
+ * The list is terminated by an entry without a name.
+ */
+static QEMUOptionParameter qcow_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FILE,
+        .type = OPT_STRING,
+        .help = "File name of a base image"
+    },
+    {
+        .name = BLOCK_OPT_BACKING_FMT,
+        .type = OPT_STRING,
+        .help = "Image format of the base image"
+    },
+    {
+        .name = BLOCK_OPT_ENCRYPT,
+        .type = OPT_FLAG,
+        .help = "Encrypt the image"
+    },
+    {
+        .name = BLOCK_OPT_CLUSTER_SIZE,
+        .type = OPT_SIZE,
+        .help = "qcow2 cluster size"
+    },
+    { NULL }
+};
+
+/*
+ * Driver description for the "qcow2" format: binds the block layer hooks
+ * to the qcow_* and qcow2_* functions. It is registered with the block
+ * layer by bdrv_qcow2_init().
+ */
+static BlockDriver bdrv_qcow2 = {
+    .format_name	= "qcow2",
+    .instance_size	= sizeof(BDRVQcowState),
+    .bdrv_probe		= qcow_probe,
+    .bdrv_open		= qcow_open,
+    .bdrv_close		= qcow_close,
+    .bdrv_create	= qcow_create,
+    .bdrv_flush		= qcow_flush,
+    .bdrv_is_allocated	= qcow_is_allocated,
+    .bdrv_set_key	= qcow_set_key,
+    .bdrv_make_empty	= qcow_make_empty,
+
+    .bdrv_aio_readv	= qcow_aio_readv,
+    .bdrv_aio_writev	= qcow_aio_writev,
+    .bdrv_write_compressed = qcow_write_compressed,
+
+    .bdrv_snapshot_create   = qcow2_snapshot_create,
+    .bdrv_snapshot_goto     = qcow2_snapshot_goto,
+    .bdrv_snapshot_delete   = qcow2_snapshot_delete,
+    .bdrv_snapshot_list     = qcow2_snapshot_list,
+    .bdrv_get_info	= qcow_get_info,
+
+    .bdrv_put_buffer    = qcow_put_buffer,
+    .bdrv_get_buffer    = qcow_get_buffer,
+
+    .create_options = qcow_create_options,
+    .bdrv_check = qcow_check,
+};
+
+/* Register the qcow2 driver with the block layer. */
+static void bdrv_qcow2_init(void)
+{
+    bdrv_register(&bdrv_qcow2);
+}
+
+/* Hook bdrv_qcow2_init into the block_init mechanism. */
+block_init(bdrv_qcow2_init);
diff --git a/block/qcow2.h b/block/qcow2.h
new file mode 100644
index 0000000..d734003
--- /dev/null
+++ b/block/qcow2.h
@@ -0,0 +1,203 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_QCOW2_H
+#define BLOCK_QCOW2_H
+
+#include "aes.h"
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 2
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES  1
+
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED     (1LL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1LL << 62)
+
+#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
+
+#define MIN_CLUSTER_BITS 9
+#define MAX_CLUSTER_BITS 16
+
+#define L2_CACHE_SIZE 16
+
+/*
+ * Image header as written at the start of the file by qcow_create2().
+ * The fields set there are converted with cpu_to_be32()/cpu_to_be64()
+ * before the header is written out.
+ */
+typedef struct QCowHeader {
+    uint32_t magic;               /* QCOW_MAGIC */
+    uint32_t version;             /* QCOW_VERSION */
+    uint64_t backing_file_offset; /* file offset of the backing file name */
+    uint32_t backing_file_size;   /* length of the backing file name */
+    uint32_t cluster_bits;        /* log2 of the cluster size */
+    uint64_t size; /* in bytes */
+    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
+    uint32_t l1_size; /* XXX: save number of clusters instead ? */
+    uint64_t l1_table_offset;     /* file offset of the L1 table */
+    uint64_t refcount_table_offset; /* file offset of the refcount table */
+    uint32_t refcount_table_clusters; /* clusters used by that table */
+    uint32_t nb_snapshots;
+    uint64_t snapshots_offset;
+} QCowHeader;
+
+/*
+ * Description of one snapshot, as kept in the snapshots array of
+ * BDRVQcowState.
+ * NOTE(review): the field meanings below are inferred from their names;
+ * the code using them is in qcow2-snapshot.c - confirm there.
+ */
+typedef struct QCowSnapshot {
+    uint64_t l1_table_offset; /* presumably the snapshot's own L1 table */
+    uint32_t l1_size;
+    char *id_str;
+    char *name;
+    uint32_t vm_state_size;
+    uint32_t date_sec;
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec;
+} QCowSnapshot;
+
+/*
+ * Per-image state of the qcow2 driver (the opaque pointer of the qcow2
+ * BlockDriverState).
+ */
+typedef struct BDRVQcowState {
+    BlockDriverState *hd; /* underlying device the image is stored on */
+    int cluster_bits;
+    int cluster_size;
+    int cluster_sectors; /* a power of two: used as a mask on sector numbers */
+    int l2_bits;
+    int l2_size;
+    int l1_size;
+    int l1_vm_state_index; /* shifted by cluster_bits + l2_bits to get the
+                              VM state offset */
+    int csize_shift;
+    int csize_mask;
+    uint64_t cluster_offset_mask; /* applied to compressed cluster offsets */
+    uint64_t l1_table_offset;
+    uint64_t *l1_table;
+    uint64_t *l2_cache;
+    uint64_t l2_cache_offsets[L2_CACHE_SIZE];
+    uint32_t l2_cache_counts[L2_CACHE_SIZE];
+    uint8_t *cluster_cache;
+    uint8_t *cluster_data;
+    uint64_t cluster_cache_offset; /* set to -1 to disable the compressed
+                                      cluster cache */
+
+    uint64_t *refcount_table;
+    uint64_t refcount_table_offset;
+    uint32_t refcount_table_size;
+    uint64_t refcount_block_cache_offset;
+    uint16_t *refcount_block_cache;
+    int64_t free_cluster_index;
+    int64_t free_byte_offset;
+
+    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+    uint32_t crypt_method_header;
+    AES_KEY aes_encrypt_key;
+    AES_KEY aes_decrypt_key;
+    uint64_t snapshots_offset;
+    int snapshots_size;
+    int nb_snapshots;
+    QCowSnapshot *snapshots;
+} BDRVQcowState;
+
+/* XXX: use std qcow open function ? */
+/*
+ * Scratch state used by qcow_create2() while laying out a new image and
+ * passed to qcow2_create_refcount_update().
+ */
+typedef struct QCowCreateState {
+    int cluster_size;              /* 1 << cluster_bits */
+    int cluster_bits;
+    uint16_t *refcount_block;      /* buffer holding all refcount blocks */
+    uint64_t *refcount_table;      /* one cluster of refcount block offsets */
+    int64_t l1_table_offset;       /* file offset of the L1 table */
+    int64_t refcount_table_offset; /* file offset of the refcount table */
+    int64_t refcount_block_offset; /* file offset of the first refcount block */
+} QCowCreateState;
+
+/* XXX This could be private for qcow2-cluster.c */
+/*
+ * Allocation bookkeeping handed from qcow2_alloc_cluster_offset() to
+ * qcow2_alloc_cluster_link_l2().
+ * NOTE(review): field meanings are not visible in this header; see
+ * qcow2-cluster.c.
+ */
+typedef struct QCowL2Meta
+{
+    uint64_t offset;
+    int n_start;
+    int nb_available;
+    int nb_clusters; /* also passed to qcow2_free_any_clusters() on failure */
+} QCowL2Meta;
+
+/* Number of clusters needed to hold size bytes (rounded up). */
+static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
+{
+    int64_t rounded_up = size + (s->cluster_size - 1);
+
+    return rounded_up >> s->cluster_bits;
+}
+
+/*
+ * Round offset up to the next multiple of n. The mask arithmetic only
+ * works when n is a power of two.
+ */
+static inline int64_t align_offset(int64_t offset, int n)
+{
+    const int64_t mask = ~(n - 1);
+
+    return (offset + n - 1) & mask;
+}
+
+
+// FIXME Need qcow2_ prefix to global functions
+
+/* qcow2.c functions */
+int qcow2_backing_read1(BlockDriverState *bs,
+                  int64_t sector_num, uint8_t *buf, int nb_sectors);
+
+/* qcow2-refcount.c functions */
+int qcow2_refcount_init(BlockDriverState *bs);
+void qcow2_refcount_close(BlockDriverState *bs);
+
+int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size);
+int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size);
+void qcow2_free_clusters(BlockDriverState *bs,
+    int64_t offset, int64_t size);
+void qcow2_free_any_clusters(BlockDriverState *bs,
+    uint64_t cluster_offset, int nb_clusters);
+
+void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
+    int64_t size);
+int qcow2_update_snapshot_refcount(BlockDriverState *bs,
+    int64_t l1_table_offset, int l1_size, int addend);
+
+int qcow2_check_refcounts(BlockDriverState *bs);
+
+/* qcow2-cluster.c functions */
+int qcow2_grow_l1_table(BlockDriverState *bs, int min_size);
+void qcow2_l2_cache_reset(BlockDriverState *bs);
+int qcow2_decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
+void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                     uint8_t *out_buf, const uint8_t *in_buf,
+                     int nb_sectors, int enc,
+                     const AES_KEY *key);
+
+uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
+    int *num);
+uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs,
+                              uint64_t offset,
+                              int n_start, int n_end,
+                              int *num, QCowL2Meta *m);
+uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
+                                         uint64_t offset,
+                                         int compressed_size);
+
+int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
+    QCowL2Meta *m);
+
+/* qcow2-snapshot.c functions */
+int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info);
+int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id);
+int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab);
+
+void qcow2_free_snapshots(BlockDriverState *bs);
+int qcow2_read_snapshots(BlockDriverState *bs);
+
+#endif
diff --git a/block/raw-posix.c b/block/raw-posix.c
new file mode 100644
index 0000000..fa1a394
--- /dev/null
+++ b/block/raw-posix.c
@@ -0,0 +1,1501 @@
+/*
+ * Block driver for RAW files (posix)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "qemu-char.h"
+#include "block_int.h"
+#include "module.h"
+#ifdef CONFIG_AIO
+#include "posix-aio-compat.h"
+#endif
+
+#ifdef CONFIG_COCOA
+#include <paths.h>
+#include <sys/param.h>
+#include <IOKit/IOKitLib.h>
+#include <IOKit/IOBSD.h>
+#include <IOKit/storage/IOMediaBSDClient.h>
+#include <IOKit/storage/IOMedia.h>
+#include <IOKit/storage/IOCDMedia.h>
+//#include <IOKit/storage/IOCDTypes.h>
+#include <CoreFoundation/CoreFoundation.h>
+#endif
+
+#ifdef __sun__
+#define _POSIX_PTHREAD_SEMANTICS 1
+#include <signal.h>
+#include <sys/dkio.h>
+#endif
+#ifdef __linux__
+#include <sys/ioctl.h>
+#include <linux/cdrom.h>
+#include <linux/fd.h>
+#endif
+#ifdef __FreeBSD__
+#include <signal.h>
+#include <sys/disk.h>
+#include <sys/cdio.h>
+#endif
+
+#ifdef __OpenBSD__
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <sys/dkio.h>
+#endif
+
+#ifdef __DragonFly__
+#include <sys/ioctl.h>
+#include <sys/diskslice.h>
+#endif
+
+//#define DEBUG_FLOPPY
+
+//#define DEBUG_BLOCK
+#if defined(DEBUG_BLOCK)
+#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
+    { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
+#else
+#define DEBUG_BLOCK_PRINT(formatCstr, ...)
+#endif
+
+/* OS X does not have O_DSYNC */
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
+#ifndef O_DIRECT
+#define O_DIRECT O_DSYNC
+#endif
+
+#define FTYPE_FILE   0
+#define FTYPE_CD     1
+#define FTYPE_FD     2
+
+#define ALIGNED_BUFFER_SIZE (32 * 512)
+
+/* if the FD is not accessed during that time (in ms), we try to
+   reopen it to see if the disk has been changed */
+#define FD_OPEN_TIMEOUT 1000
+
+/* Per-device state of the POSIX raw file driver. */
+typedef struct BDRVRawState {
+    int fd;                     /* open file descriptor, -1 when not open */
+    int type;                   /* one of the FTYPE_* values */
+    unsigned int lseek_err_cnt; /* consecutive lseek failures; limits logging */
+    int open_flags;             /* flags passed to open() */
+#if defined(__linux__)
+    /* linux floppy specific */
+    int64_t fd_open_time;
+    int64_t fd_error_time;
+    int fd_got_error;
+    int fd_media_changed;
+#endif
+    uint8_t* aligned_buf;       /* bounce buffer of ALIGNED_BUFFER_SIZE bytes,
+                                   allocated only for BDRV_O_NOCACHE */
+} BDRVRawState;
+
+static int posix_aio_init(void);
+
+static int fd_open(BlockDriverState *bs);
+
+#if defined(__FreeBSD__)
+static int cdrom_reopen(BlockDriverState *bs);
+#endif
+
+/*
+ * Open filename and initialise the raw driver state of bs.
+ *
+ * bdrv_flags: BDRV_O_* flags; select read-only vs read-write access and
+ *             the caching mode.
+ * open_flags: extra flags for open(); any access mode in them is replaced.
+ *
+ * Returns 0 on success or a negative errno value. On failure s->fd is -1.
+ */
+static int raw_open_common(BlockDriverState *bs, const char *filename,
+                           int bdrv_flags, int open_flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd, ret;
+
+    posix_aio_init();
+
+    s->lseek_err_cnt = 0;
+
+    s->open_flags = open_flags | O_BINARY;
+    s->open_flags &= ~O_ACCMODE;
+    if ((bdrv_flags & BDRV_O_ACCESS) == BDRV_O_RDWR) {
+        s->open_flags |= O_RDWR;
+    } else {
+        s->open_flags |= O_RDONLY;
+        bs->read_only = 1;
+    }
+
+    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+     * and O_DIRECT for no caching. */
+    if ((bdrv_flags & BDRV_O_NOCACHE))
+        s->open_flags |= O_DIRECT;
+    else if (!(bdrv_flags & BDRV_O_CACHE_WB))
+        s->open_flags |= O_DSYNC;
+
+    s->fd = -1;
+    fd = open(filename, s->open_flags, 0644);
+    if (fd < 0) {
+        ret = -errno;
+        /* a read-only filesystem is reported as an access error */
+        if (ret == -EROFS)
+            ret = -EACCES;
+        return ret;
+    }
+    s->fd = fd;
+    s->aligned_buf = NULL;
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        /* bounce buffer for requests that are not suitably aligned */
+        s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
+        if (s->aligned_buf == NULL) {
+            /* errno may be 0 here; never turn a failure into success */
+            ret = errno ? -errno : -ENOMEM;
+            close(fd);
+            /* do not leave a closed descriptor behind in the state */
+            s->fd = -1;
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Open a regular file as a raw image. With BDRV_O_CREAT the file is
+ * created and truncated.
+ */
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *state = bs->opaque;
+    int extra_flags;
+
+    state->type = FTYPE_FILE;
+
+    extra_flags = (flags & BDRV_O_CREAT) ? (O_CREAT | O_TRUNC) : 0;
+
+    return raw_open_common(bs, filename, flags, extra_flags);
+}
+
+/* XXX: use host sector size if necessary with:
+#ifdef DIOCGSECTORSIZE
+        {
+            unsigned int sectorsize = 512;
+            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
+                sectorsize > bufsize)
+                bufsize = sectorsize;
+        }
+#endif
+#ifdef CONFIG_COCOA
+        u_int32_t   blockSize = 512;
+        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
+            bufsize = blockSize;
+        }
+#endif
+*/
+
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ *
+ * A negative offset skips the seek and reads from the current file
+ * position.
+ *
+ * Returns the number of bytes read (possibly fewer than count), the
+ * negative result of fd_open() if the device could not be opened, -EIO if
+ * seeking failed, or -errno if the read failed.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+    if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
+        ++(s->lseek_err_cnt);
+        /* only log the first few consecutive failures */
+        if(s->lseek_err_cnt <= 10) {
+            DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                              "] lseek failed : %d = %s\n",
+                              s->fd, bs->filename, offset, buf, count,
+                              bs->total_sectors, errno, strerror(errno));
+        }
+        /* same error code as raw_pwrite_aligned; a bare -1 would read as
+         * -EPERM to callers expecting a negative errno */
+        return -EIO;
+    }
+    s->lseek_err_cnt=0;
+
+    ret = read(s->fd, buf, count);
+    if (ret == count)
+        goto label__raw_read__success;
+
+    DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                      "] read failed %d : %d = %s\n",
+                      s->fd, bs->filename, offset, buf, count,
+                      bs->total_sectors, ret, errno, strerror(errno));
+
+    /* Try harder for CDrom. */
+    if (bs->type == BDRV_TYPE_CDROM) {
+        lseek(s->fd, offset, SEEK_SET);
+        ret = read(s->fd, buf, count);
+        if (ret == count)
+            goto label__raw_read__success;
+        lseek(s->fd, offset, SEEK_SET);
+        ret = read(s->fd, buf, count);
+        if (ret == count)
+            goto label__raw_read__success;
+
+        DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                          "] retry read failed %d : %d = %s\n",
+                          s->fd, bs->filename, offset, buf, count,
+                          bs->total_sectors, ret, errno, strerror(errno));
+    }
+
+label__raw_read__success:
+
+    return  (ret < 0) ? -errno : ret;
+}
+
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ *
+ * A negative offset skips the seek and writes at the current file
+ * position.
+ *
+ * Returns the number of bytes written (possibly fewer than count), the
+ * negative result of fd_open() if the device could not be opened, -EIO if
+ * seeking failed, or -errno if the write failed.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    ret = fd_open(bs);
+    /* propagate fd_open's own result, as raw_pread_aligned does: errno is
+     * not guaranteed to be set (or non-zero) at this point */
+    if (ret < 0)
+        return ret;
+
+    if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
+        ++(s->lseek_err_cnt);
+        /* only log the first few consecutive failures */
+        if(s->lseek_err_cnt <= 10) {
+            DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%"
+                              PRId64 "] lseek failed : %d = %s\n",
+                              s->fd, bs->filename, offset, buf, count,
+                              bs->total_sectors, errno, strerror(errno));
+        }
+        return -EIO;
+    }
+    s->lseek_err_cnt = 0;
+
+    ret = write(s->fd, buf, count);
+    if (ret == count)
+        goto label__raw_write__success;
+
+    DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                      "] write failed %d : %d = %s\n",
+                      s->fd, bs->filename, offset, buf, count,
+                      bs->total_sectors, ret, errno, strerror(errno));
+
+label__raw_write__success:
+
+    return  (ret < 0) ? -errno : ret;
+}
+
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pread_aligned to do the actual read.
+ *
+ * When a bounce buffer exists (s->aligned_buf), unaligned requests are
+ * read into it in 512-byte multiples and copied out to buf.
+ *
+ * Returns the number of bytes read, which is less than count when the end
+ * of the file is reached, or a negative value if an underlying read
+ * failed.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    sum = 0;
+
+    if (s->aligned_buf != NULL)  {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+
+            shift = offset & 0x1ff;
+            size = (shift + count + 0x1ff) & ~0x1ff;
+            if (size > ALIGNED_BUFFER_SIZE)
+                size = ALIGNED_BUFFER_SIZE;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
+            if (ret < 0)
+                return ret;
+
+            /* short read that ends before the requested offset */
+            if (ret <= shift)
+                return sum;
+
+            /* copy the rest of the first sector, but never more than was
+             * asked for or than was actually read */
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            if (size > ret - shift)
+                size = ret - shift;
+            memcpy(buf, s->aligned_buf + shift, size);
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+
+            /* still unaligned: the read stopped inside the first sector */
+            if (offset & 0x1ff)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            /* read on aligned buffer */
+
+            while (count) {
+
+                size = (count + 0x1ff) & ~0x1ff;
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+
+                /* end of file: stop instead of looping forever */
+                if (ret == 0)
+                    break;
+
+                size = ret;
+                if (size > count)
+                    size = count;
+
+                memcpy(buf, s->aligned_buf, size);
+
+                buf += size;
+                offset += size;
+                count -= size;
+                sum += size;
+            }
+
+            return sum;
+        }
+    }
+
+    ret = raw_pread_aligned(bs, offset, buf, count);
+    /* do not fold an error code into the byte count */
+    if (ret < 0)
+        return ret;
+    return ret + sum;
+}
+
+/*
+ * Sector-based read on top of raw_pread(). Returns 0 when all nb_sectors
+ * sectors were read, otherwise the raw_pread() result as is.
+ */
+static int raw_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    const int nb_bytes = nb_sectors * 512;
+    int ret = raw_pread(bs, sector_num * 512, buf, nb_bytes);
+
+    return (ret == nb_bytes) ? 0 : ret;
+}
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pwrite_aligned to do the actual write.
+ *
+ * When a bounce buffer exists (s->aligned_buf), partial sectors at the
+ * start and end of the request are handled by reading the sector, patching
+ * it and writing it back; the sectors in between are copied through the
+ * bounce buffer.
+ *
+ * Returns the number of bytes written, or a negative value if an
+ * underlying read or write failed.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    sum = 0;
+
+    if (s->aligned_buf != NULL) {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+            shift = offset & 0x1ff;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(s->aligned_buf + shift, buf, size);
+
+            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            while ((size = (count & ~0x1ff)) != 0) {
+
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                memcpy(s->aligned_buf, buf, size);
+
+                ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+
+                /* no progress: report what was written instead of
+                 * looping forever */
+                if (ret == 0)
+                    return sum;
+
+                buf += ret;
+                offset += ret;
+                count -= ret;
+                sum += ret;
+            }
+            /* here, count < 512 because (count & ~0x1ff) == 0 */
+            if (count) {
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+                if (ret < 0)
+                    return ret;
+                memcpy(s->aligned_buf, buf, count);
+
+                ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+                if (ret < 0)
+                    return ret;
+                /* only count bytes belong to the caller's request */
+                if (count < ret)
+                    ret = count;
+
+                sum += ret;
+            }
+            return sum;
+        }
+    }
+    ret = raw_pwrite_aligned(bs, offset, buf, count);
+    /* do not fold an error code into the byte count */
+    if (ret < 0)
+        return ret;
+    return ret + sum;
+}
+
+/*
+ * Synchronous sector write: store nb_sectors 512-byte sectors from buf at
+ * sector_num by way of raw_pwrite().
+ *
+ * Returns 0 when exactly nb_sectors * 512 bytes were written; any other
+ * raw_pwrite() result is handed back unchanged.
+ */
+static int raw_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    const int wanted = nb_sectors * 512;
+    int done = raw_pwrite(bs, sector_num * 512, buf, wanted);
+
+    return (done == wanted) ? 0 : done;
+}
+
+#ifdef CONFIG_AIO
+/***********************************************************/
+/* Unix AIO using POSIX AIO */
+
+typedef struct RawAIOCB { /* one in-flight request of the POSIX AIO emulation */
+    BlockDriverAIOCB common; /* must stay first: raw_aio_cancel() casts a BlockDriverAIOCB * to RawAIOCB * */
+    struct qemu_paiocb aiocb; /* request descriptor handed to the qemu_paio_*() calls */
+    struct RawAIOCB *next; /* link in the PosixAioState.first_aio list */
+    int ret; /* NOTE(review): not referenced by the code visible in this part of the file */
+} RawAIOCB;
+
+/*
+ * Global bookkeeping for the POSIX AIO emulation: the notification pipe
+ * written by the signal handler and the list of requests still pending.
+ */
+typedef struct PosixAioState {
+    int rfd;                /* read end of the pipe, drained by posix_aio_read() */
+    int wfd;                /* write end of the pipe, poked by aio_signal_handler() */
+    RawAIOCB *first_aio;    /* head of the singly linked pending-request list */
+} PosixAioState;
+
+/*
+ * Read handler for the AIO notification pipe (opaque is the PosixAioState).
+ *
+ * First drains the pipe, then walks the pending list: cancelled requests
+ * are silently dropped, finished ones are unlinked and their completion
+ * callback is invoked with 0 on success or a negative errno value.
+ */
+static void posix_aio_read(void *opaque)
+{
+    PosixAioState *s = opaque;
+    RawAIOCB **link;
+    RawAIOCB *req;
+    char drain[16];
+    ssize_t n;
+    int status;
+
+    /* empty the signal pipe; retry on EINTR or when the buffer came back full */
+    do {
+        n = read(s->rfd, drain, sizeof(drain));
+    } while ((n == -1 && errno == EINTR) || n == sizeof(drain));
+
+rescan:
+    link = &s->first_aio;
+    while ((req = *link) != NULL) {
+        status = qemu_paio_error(&req->aiocb);
+        if (status == EINPROGRESS) {
+            /* still running: leave it queued and look at the next one */
+            link = &req->next;
+            continue;
+        }
+
+        /* cancelled or finished: either way it leaves the list */
+        *link = req->next;
+
+        if (status == ECANCELED) {
+            qemu_aio_release(req);
+            continue;
+        }
+
+        /* end of aio: translate the result for the callback */
+        if (status != 0) {
+            status = -status;
+        } else {
+            status = qemu_paio_return(&req->aiocb);
+            status = (status == req->aiocb.aio_nbytes) ? 0 : -EINVAL;
+        }
+
+        req->common.cb(req->common.opaque, status);
+        qemu_aio_release(req);
+
+        /*
+         * Start over from the head after every callback (presumably
+         * because the callback may itself alter the pending list).
+         */
+        goto rescan;
+    }
+}
+
+/*
+ * Flush predicate registered with the fd handler: returns 1 while at least
+ * one request is still on the pending list, 0 otherwise.
+ */
+static int posix_aio_flush(void *opaque)
+{
+    const PosixAioState *state = opaque;
+
+    return state->first_aio != NULL;
+}
+
+/* Singleton AIO state; remains NULL until posix_aio_init() has completed. */
+static PosixAioState *posix_aio_state;
+
+/*
+ * Handler installed for SIGUSR2.  Writes a single byte into the
+ * notification pipe (if the AIO state exists) and then calls
+ * qemu_service_io().  The result of write() is deliberately not checked.
+ */
+static void aio_signal_handler(int signum)
+{
+    PosixAioState *state = posix_aio_state;
+
+    if (state != NULL) {
+        char wakeup = 0;
+
+        write(state->wfd, &wakeup, sizeof(wakeup));
+    }
+
+    qemu_service_io();
+}
+
+/*
+ * One-time initialisation of the POSIX AIO emulation.
+ *
+ * Installs aio_signal_handler() for SIGUSR2, creates the non-blocking
+ * notification pipe, registers posix_aio_read()/posix_aio_flush() on its
+ * read end, starts the qemu_paio layer and finally publishes the state in
+ * posix_aio_state.
+ *
+ * Returns 0 on success or when already initialised, and -errno when the
+ * pipe cannot be created (in which case nothing is published and the
+ * partially built state is released).
+ */
+static int posix_aio_init(void)
+{
+    struct sigaction act;
+    PosixAioState *s;
+    int fds[2];
+    struct qemu_paioinit ai;
+
+    if (posix_aio_state)
+        return 0;
+
+    s = qemu_malloc(sizeof(PosixAioState));
+
+    sigfillset(&act.sa_mask);
+    act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
+    act.sa_handler = aio_signal_handler;
+    sigaction(SIGUSR2, &act, NULL);
+
+    s->first_aio = NULL;
+    if (pipe(fds) == -1) {
+        /* take errno before fprintf() gets a chance to overwrite it */
+        int err = errno;
+
+        fprintf(stderr, "failed to create pipe\n");
+        /* s has not been published anywhere yet, so it must be freed here */
+        qemu_free(s);
+        return -err;
+    }
+
+    s->rfd = fds[0];
+    s->wfd = fds[1];
+
+    /* neither the signal handler nor the drain loop may block on the pipe */
+    fcntl(s->rfd, F_SETFL, O_NONBLOCK);
+    fcntl(s->wfd, F_SETFL, O_NONBLOCK);
+
+    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
+
+    memset(&ai, 0, sizeof(ai));
+    ai.aio_threads = 64;
+    ai.aio_num = 64;
+    qemu_paio_init(&ai);
+
+    /* publish last: aio_signal_handler() only writes once this is non-NULL */
+    posix_aio_state = s;
+
+    return 0;
+}
+
+/*
+ * Unlink acb from the global pending list and release it.  If acb is not
+ * on the list a diagnostic is printed and nothing is released.
+ */
+static void raw_aio_remove(RawAIOCB *acb)
+{
+    RawAIOCB **link = &posix_aio_state->first_aio;
+
+    /* advance until the link that points at acb (or the list end) */
+    while (*link != NULL && *link != acb) {
+        link = &(*link)->next;
+    }
+
+    if (*link == NULL) {
+        fprintf(stderr, "raw_aio_remove: aio request not found!\n");
+        return;
+    }
+
+    *link = acb->next;
+    qemu_aio_release(acb);
+}
+
+/*
+ * Cancel hook of raw_aio_pool.  Asks the qemu_paio layer to cancel the
+ * request; if that is refused, busy-waits until the request is no longer
+ * in progress.  The request is then removed from the pending list.
+ */
+static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    RawAIOCB *acb = (RawAIOCB *)blockacb;
+
+    if (qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb)
+            == QEMU_PAIO_NOTCANCELED) {
+        /* fail safe: the aio could not be canceled, so wait for it */
+        while (qemu_paio_error(&acb->aiocb) == EINPROGRESS) {
+            /* spin */
+        }
+    }
+
+    raw_aio_remove(acb);
+}
+
+/* Pool descriptor passed to qemu_aio_get() for every RawAIOCB. */
+static AIOPool raw_aio_pool = {
+    .cancel     = raw_aio_cancel,
+    .aiocb_size = sizeof(RawAIOCB),
+};
+
+/*
+ * Build a RawAIOCB for a vectored transfer of nb_sectors sectors at
+ * sector_num and push it onto the head of the global pending list.  The
+ * request is not submitted here; that is left to the caller.
+ *
+ * Returns the new request, or NULL if fd_open() fails or no control block
+ * can be obtained.
+ */
+static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num,
+        QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+    RawAIOCB *req;
+
+    if (fd_open(bs) < 0) {
+        return NULL;
+    }
+
+    req = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
+    if (req == NULL) {
+        return NULL;
+    }
+
+    /* where to deliver the completion signal and which fd to operate on */
+    req->aiocb.aio_fildes = s->fd;
+    req->aiocb.ev_signo = SIGUSR2;
+
+    /* what to transfer */
+    req->aiocb.aio_iov = qiov->iov;
+    req->aiocb.aio_niov = qiov->niov;
+    req->aiocb.aio_nbytes = nb_sectors * 512;
+    req->aiocb.aio_offset = sector_num * 512;
+
+    /*
+     * With O_DIRECT the buffer has to be aligned on a sector boundary.
+     * The presence of the bounce buffer signals that mode, so ask the low
+     * level code to take care of the alignment if it is not there yet.
+     */
+    req->aiocb.aio_flags = (s->aligned_buf != NULL) ? QEMU_AIO_SECTOR_ALIGNED
+                                                    : 0;
+
+    req->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = req;
+
+    return req;
+}
+
+/*
+ * Start an asynchronous vectored read.  Returns the generic control block
+ * of the queued request, or NULL if it could not be set up or submitted
+ * (in the latter case the request is taken off the pending list again).
+ */
+static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *req = raw_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                  cb, opaque);
+
+    if (req == NULL) {
+        return NULL;
+    }
+
+    if (qemu_paio_read(&req->aiocb) >= 0) {
+        return &req->common;
+    }
+
+    raw_aio_remove(req);
+    return NULL;
+}
+
+/*
+ * Start an asynchronous vectored write.  Returns the generic control block
+ * of the queued request, or NULL if it could not be set up or submitted
+ * (in the latter case the request is taken off the pending list again).
+ */
+static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *req = raw_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                  cb, opaque);
+
+    if (req == NULL) {
+        return NULL;
+    }
+
+    if (qemu_paio_write(&req->aiocb) >= 0) {
+        return &req->common;
+    }
+
+    raw_aio_remove(req);
+    return NULL;
+}
+#else /* CONFIG_AIO */
+static int posix_aio_init(void)
+{
+    return 0; /* built without CONFIG_AIO: nothing to set up, always succeeds */
+}
+#endif /* CONFIG_AIO */
+
+
+/*
+ * Close hook shared by all drivers in this file: closes the descriptor if
+ * one is open and releases the bounce buffer.
+ *
+ * The buffer is released independently of the descriptor state, because
+ * floppy_open() and fd_open() can leave s->fd at -1 while the buffer is
+ * still allocated; tying the free to fd >= 0 would leak it in that case.
+ */
+static void raw_close(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->fd >= 0) {
+        close(s->fd);
+        s->fd = -1;
+    }
+
+    if (s->aligned_buf != NULL) {
+        qemu_free(s->aligned_buf);
+        /* guard against a second release should close be called again */
+        s->aligned_buf = NULL;
+    }
+}
+
+/*
+ * Resize the image to offset bytes.  Only supported for regular files
+ * (FTYPE_FILE); anything else yields -ENOTSUP.  Returns 0 on success or
+ * -errno if ftruncate() fails.
+ */
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVRawState *state = bs->opaque;
+
+    if (state->type != FTYPE_FILE) {
+        return -ENOTSUP;
+    }
+
+    return (ftruncate(state->fd, offset) < 0) ? -errno : 0;
+}
+
+#ifdef __OpenBSD__
+/*
+ * OpenBSD variant: size in bytes of the open file or device, -1 on error.
+ * For character/block devices the size is taken from the disklabel
+ * (sector size times partition size); for everything else from st_size.
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    struct disklabel label;
+    struct stat info;
+
+    if (fstat(s->fd, &info)) {
+        return -1;
+    }
+
+    if (!S_ISCHR(info.st_mode) && !S_ISBLK(info.st_mode)) {
+        return info.st_size;
+    }
+
+    if (ioctl(s->fd, DIOCGDINFO, &label)) {
+        return -1;
+    }
+
+    return (uint64_t)label.d_secsize *
+        label.d_partitions[DISKPART(info.st_rdev)].p_size;
+}
+#else /* !__OpenBSD__ */
+/*
+ * Size in bytes of the open file or device (all hosts except OpenBSD).
+ *
+ * Returns the fd_open() error if the descriptor cannot be made available;
+ * otherwise the size obtained from the platform specific ioctl where one
+ * applies, falling back to lseek(fd, 0, SEEK_END) (whose error result is
+ * passed through as is).
+ *
+ * The preprocessor layout is deliberate: each platform section ends in a
+ * dangling "if"/"else" whose body is supplied by the code that follows it.
+ */
+static int64_t  raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd;
+    int64_t size;
+#ifdef HOST_BSD
+    struct stat sb;
+#ifdef __FreeBSD__
+    int reopened = 0;
+#endif
+#endif
+#ifdef __sun__
+    struct dk_minfo minfo;
+    int rv;
+#endif
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+    /*
+     * Pick up the descriptor only now: on Linux fd_open() may close and
+     * reopen a floppy device, which replaces s->fd.  Reading it earlier
+     * would leave us with a stale (possibly -1) descriptor.
+     */
+    fd = s->fd;
+
+#ifdef HOST_BSD
+#ifdef __FreeBSD__
+again:
+#endif
+    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+#ifdef DIOCGMEDIASIZE
+        /* the statement that follows the #endif is the body of this "if" */
+        if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
+#elif defined(DIOCGPART)
+        {
+                struct partinfo pi;
+                if (ioctl(fd, DIOCGPART, &pi) == 0)
+                        size = pi.media_size;
+                else
+                        size = 0;
+        }
+        if (size == 0)
+#endif
+#ifdef CONFIG_COCOA
+        size = LONG_LONG_MAX;
+#else
+        size = lseek(fd, 0LL, SEEK_END);
+#endif
+#ifdef __FreeBSD__
+        switch(s->type) {
+        case FTYPE_CD:
+            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
+            if (size == 2048LL * (unsigned)-1)
+                size = 0;
+            /* XXX no disc?  maybe we need to reopen... */
+            if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) {
+                reopened = 1;
+                /* cdrom_reopen() closed the old descriptor and stored a new one */
+                fd = s->fd;
+                goto again;
+            }
+        }
+#endif
+    } else
+#endif
+#ifdef __sun__
+    /*
+     * use the DKIOCGMEDIAINFO ioctl to read the size.
+     */
+    rv = ioctl ( fd, DKIOCGMEDIAINFO, &minfo );
+    if ( rv != -1 ) {
+        size = minfo.dki_lbsize * minfo.dki_capacity;
+    } else /* there are reports that lseek on some devices
+              fails, but irc discussion said that contingency
+              on contingency was overkill */
+#endif
+    {
+        size = lseek(fd, 0, SEEK_END);
+    }
+    return size;
+}
+#endif
+
+/*
+ * Create (or truncate) a raw image file.
+ *
+ * options is an optional list terminated by an entry with a NULL name; the
+ * BLOCK_OPT_SIZE entry gives the image size in bytes, which is rounded
+ * down to a multiple of 512.  Without that entry the file is created empty.
+ *
+ * Returns 0 on success, -EIO if the file cannot be opened, or -errno if it
+ * cannot be resized to the requested length.
+ */
+static int raw_create(const char *filename, QEMUOptionParameter *options)
+{
+    int fd;
+    int ret = 0;
+    int64_t total_size = 0;
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / 512;
+        }
+        options++;
+    }
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (fd < 0)
+        return -EIO;
+
+    /* a failed resize must not be reported as a successfully created image */
+    if (ftruncate(fd, total_size * 512) < 0)
+        ret = -errno;   /* taken before close() can disturb errno */
+
+    close(fd);
+    return ret;
+}
+
+/* Flush hook: fsync() the underlying descriptor; the result is not reported. */
+static void raw_flush(BlockDriverState *bs)
+{
+    const BDRVRawState *state = bs->opaque;
+
+    fsync(state->fd);
+}
+
+
+/* Creation options understood by raw_create(); the list ends at the NULL name. */
+static QEMUOptionParameter raw_create_options[] = {
+    { .name = BLOCK_OPT_SIZE, .type = OPT_SIZE, .help = "Virtual disk size" },
+    { NULL }
+};
+
+/* Driver table for plain image files ("raw"). */
+static BlockDriver bdrv_raw = {
+    .format_name        = "raw",
+    .instance_size      = sizeof(BDRVRawState),
+    .create_options     = raw_create_options,
+
+    /* open / close / create */
+    .bdrv_probe         = NULL, /* no probe for protocols */
+    .bdrv_open          = raw_open,
+    .bdrv_close         = raw_close,
+    .bdrv_create        = raw_create,
+
+    /* synchronous I/O */
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_flush         = raw_flush,
+
+    /* size handling */
+    .bdrv_truncate      = raw_truncate,
+    .bdrv_getlength     = raw_getlength,
+
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+#endif
+};
+
+/***********************************************/
+/* host device */
+
+#ifdef CONFIG_COCOA
+static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
+static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
+
+/*
+ * Ask IOKit for all services of class kIOCDMediaClass that are marked
+ * ejectable and hand back an iterator over them in *mediaIterator.
+ *
+ * Failures of the individual steps are only reported with printf(); the
+ * function carries on regardless and returns the result of
+ * IOServiceGetMatchingServices().
+ */
+kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
+{
+    mach_port_t             port;
+    CFMutableDictionaryRef  matching;
+    kern_return_t           status;
+
+    status = IOMasterPort( MACH_PORT_NULL, &port );
+    if ( status != KERN_SUCCESS ) {
+        printf( "IOMasterPort returned %d\n", status );
+    }
+
+    matching = IOServiceMatching( kIOCDMediaClass );
+    if ( matching != NULL ) {
+        /* restrict the match to ejectable media */
+        CFDictionarySetValue( matching, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue );
+    } else {
+        printf( "IOServiceMatching returned a NULL dictionary.\n" );
+    }
+
+    status = IOServiceGetMatchingServices( port, matching, mediaIterator );
+    if ( status != KERN_SUCCESS ) {
+        printf( "IOServiceGetMatchingServices returned %d\n", status );
+    }
+
+    return status;
+}
+
+/*
+ * Take the next entry from mediaIterator and write the path of its raw
+ * device node (_PATH_DEV, then "r", then the BSD name) into bsdPath.
+ *
+ * bsdPath is always reset to the empty string first.  Returns KERN_SUCCESS
+ * only if the BSD name could be converted into the buffer; otherwise
+ * KERN_FAILURE.
+ */
+kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
+{
+    kern_return_t   status = KERN_FAILURE;
+    io_object_t     media;
+    CFTypeRef       bsdName;
+    size_t          prefixLength;
+
+    *bsdPath = '\0';
+
+    media = IOIteratorNext( mediaIterator );
+    if ( !media ) {
+        return status;
+    }
+
+    bsdName = IORegistryEntryCreateCFProperty( media, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
+    if ( bsdName ) {
+        strcpy( bsdPath, _PATH_DEV );
+        strcat( bsdPath, "r" );
+        prefixLength = strlen( bsdPath );
+        /* append the BSD name right behind the prefix */
+        if ( CFStringGetCString( bsdName, bsdPath + prefixLength, maxPathSize - prefixLength, kCFStringEncodingASCII ) ) {
+            status = KERN_SUCCESS;
+        }
+        CFRelease( bsdName );
+    }
+    IOObjectRelease( media );
+
+    return status;
+}
+
+#endif
+
+/*
+ * Probe score of the host_device driver for filename:
+ *   50  for names starting with "/dev/cdrom" (kept low so that a dedicated
+ *       CD-ROM driver can match with a higher priority),
+ *   100 for any existing character or block device,
+ *   0   otherwise.
+ */
+static int hdev_probe_device(const char *filename)
+{
+    struct stat info;
+
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        return 50;
+    }
+
+    if (stat(filename, &info) < 0) {
+        return 0;
+    }
+
+    return (S_ISCHR(info.st_mode) || S_ISBLK(info.st_mode)) ? 100 : 0;
+}
+
+/*
+ * Open hook of the host_device driver.
+ *
+ * On Cocoa hosts a filename starting with "/dev/cdrom" is replaced by the
+ * raw device node of the first ejectable CD medium found (slice "s0", or
+ * "s1" if "s0" cannot be opened).  On Linux with AIO, names starting with
+ * "/dev/sg" mark the device as a generic SCSI device (bs->sg).
+ *
+ * Returns the result of raw_open_common().
+ */
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+#ifdef CONFIG_COCOA
+    /*
+     * Function scope on purpose: filename may be redirected to this buffer
+     * and is still read by raw_open_common() at the end of the function, so
+     * the buffer must outlive the "/dev/cdrom" block below.
+     */
+    char bsdPath[ MAXPATHLEN ];
+
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        kern_return_t kernResult;
+        io_iterator_t mediaIterator;
+        int fd;
+
+        kernResult = FindEjectableCDMedia( &mediaIterator );
+        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
+
+        if ( bsdPath[ 0 ] != '\0' ) {
+            strcat(bsdPath,"s0");
+            /* some CDs don't have a partition 0 */
+            fd = open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE);
+            if (fd < 0) {
+                /* fall back to slice 1 by patching the trailing digit */
+                bsdPath[strlen(bsdPath)-1] = '1';
+            } else {
+                close(fd);
+            }
+            filename = bsdPath;
+        }
+
+        if ( mediaIterator )
+            IOObjectRelease( mediaIterator );
+    }
+#endif
+
+    s->type = FTYPE_FILE;
+#if defined(__linux__) && defined(CONFIG_AIO)
+    if (strstart(filename, "/dev/sg", NULL)) {
+        bs->sg = 1;
+    }
+#endif
+
+    return raw_open_common(bs, filename, flags, 0);
+}
+
+#if defined(__linux__)
+/*
+ * Linux: make sure the descriptor of a floppy device is open.
+ *
+ * There is no reliable way to detect whether a floppy is present, so the
+ * device is (re)opened on demand at every I/O and kept open for a few
+ * hundred milliseconds (FD_OPEN_TIMEOUT).  After a failed open, further
+ * attempts are suppressed for the same period.  For every type other than
+ * FTYPE_FD this is a no-op.
+ *
+ * Returns 0 if the descriptor is usable, -EIO otherwise.  As a side effect
+ * s->fd_media_changed is raised whenever the open/closed state flips.
+ */
+static int fd_open(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int had_media;
+
+    if (s->type != FTYPE_FD) {
+        return 0;
+    }
+
+    had_media = (s->fd >= 0);
+
+    /* drop a descriptor that has been held open for too long */
+    if (had_media &&
+        (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
+        close(s->fd);
+        s->fd = -1;
+#ifdef DEBUG_FLOPPY
+        printf("Floppy closed\n");
+#endif
+    }
+
+    if (s->fd < 0) {
+        /* a recent open already failed: do not hammer the drive */
+        if (s->fd_got_error &&
+            (qemu_get_clock(rt_clock) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
+#ifdef DEBUG_FLOPPY
+            printf("No floppy (open delayed)\n");
+#endif
+            return -EIO;
+        }
+
+        s->fd = open(bs->filename, s->open_flags & ~O_NONBLOCK);
+        if (s->fd < 0) {
+            s->fd_error_time = qemu_get_clock(rt_clock);
+            s->fd_got_error = 1;
+            if (had_media) {
+                s->fd_media_changed = 1;
+            }
+#ifdef DEBUG_FLOPPY
+            printf("No floppy\n");
+#endif
+            return -EIO;
+        }
+#ifdef DEBUG_FLOPPY
+        printf("Floppy opened\n");
+#endif
+    }
+
+    if (!had_media) {
+        s->fd_media_changed = 1;
+    }
+    s->fd_open_time = qemu_get_clock(rt_clock);
+    s->fd_got_error = 0;
+
+    return 0;
+}
+
+/* Synchronous pass-through: issue req with argument buf on the device fd. */
+static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    const BDRVRawState *state = bs->opaque;
+
+    return ioctl(state->fd, req, buf);
+}
+
+#ifdef CONFIG_AIO
+/*
+ * Asynchronous ioctl: queue a request that runs req with argument buf on
+ * the device descriptor and reports completion through cb(opaque, ...).
+ *
+ * Returns the generic control block, or NULL if the descriptor cannot be
+ * opened, no control block is available or the submission fails.
+ */
+static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+    RawAIOCB *job;
+
+    if (fd_open(bs) < 0) {
+        return NULL;
+    }
+
+    job = qemu_aio_get(&raw_aio_pool, bs, cb, opaque);
+    if (job == NULL) {
+        return NULL;
+    }
+
+    job->aiocb.aio_fildes = s->fd;
+    job->aiocb.ev_signo = SIGUSR2;
+    job->aiocb.aio_offset = 0;
+    job->aiocb.aio_flags = 0;
+    job->aiocb.aio_ioctl_buf = buf;
+    job->aiocb.aio_ioctl_cmd = req;
+
+    /* make the request visible to posix_aio_read() before submitting it */
+    job->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = job;
+
+    if (qemu_paio_ioctl(&job->aiocb) < 0) {
+        raw_aio_remove(job);
+        return NULL;
+    }
+
+    return &job->common;
+}
+#endif
+
+#elif defined(__FreeBSD__)
+/*
+ * FreeBSD: nothing is reopened here; this merely checks that s->fd is sane
+ * (it is called by the I/O operations).  Returns 0 if a descriptor is
+ * open, -EIO otherwise.
+ */
+static int fd_open(BlockDriverState *bs)
+{
+    const BDRVRawState *state = bs->opaque;
+
+    return (state->fd >= 0) ? 0 : -EIO;
+}
+#else /* !linux && !FreeBSD */
+
+static int fd_open(BlockDriverState *bs)
+{
+    return 0; /* neither Linux nor FreeBSD: no on-demand reopening, always succeeds */
+}
+
+#endif /* !linux && !FreeBSD */
+
+/*
+ * "Create" an image on an existing host device.  Nothing is written; the
+ * function only verifies that filename can be opened for writing, is a
+ * block or character device, and is at least as large as the requested
+ * "size" option (rounded down to a multiple of 512).
+ *
+ * Returns 0 if the device is suitable, -EIO if it cannot be opened or is
+ * not a device, -ENOSPC if it is too small.
+ */
+static int hdev_create(const char *filename, QEMUOptionParameter *options)
+{
+    struct stat info;
+    int64_t sectors = 0;
+    int result = 0;
+    int fd;
+
+    /* Read out options */
+    for (; options && options->name; options++) {
+        if (!strcmp(options->name, "size")) {
+            sectors = options->value.n / 512;
+        }
+    }
+
+    fd = open(filename, O_WRONLY | O_BINARY);
+    if (fd < 0) {
+        return -EIO;
+    }
+
+    if (fstat(fd, &info) < 0 ||
+        (!S_ISBLK(info.st_mode) && !S_ISCHR(info.st_mode))) {
+        result = -EIO;
+    } else if (lseek(fd, 0, SEEK_END) < sectors * 512) {
+        result = -ENOSPC;
+    }
+
+    close(fd);
+    return result;
+}
+
+/* Driver table for host block/character devices ("host_device"). */
+static BlockDriver bdrv_host_device = {
+    .format_name        = "host_device",
+    .instance_size      = sizeof(BDRVRawState),
+    .bdrv_probe_device  = hdev_probe_device,
+
+    /* open / close / create */
+    .bdrv_open          = hdev_open,
+    .bdrv_close         = raw_close,
+    .bdrv_create        = hdev_create,
+
+    /* synchronous I/O */
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_flush         = raw_flush,
+    .bdrv_getlength     = raw_getlength,
+
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+#endif
+
+    /* generic scsi device */
+#ifdef __linux__
+    .bdrv_ioctl         = hdev_ioctl,
+#ifdef CONFIG_AIO
+    .bdrv_aio_ioctl     = hdev_aio_ioctl,
+#endif
+#endif
+};
+
+#ifdef __linux__
+/*
+ * Open hook of the host_floppy driver.  The device is opened once with
+ * O_NONBLOCK (so that the open succeeds even without a disk) to validate
+ * it, then closed again; fd_open() reopens it whenever I/O needs it.
+ *
+ * Returns 0 on success or the non-zero result of raw_open_common().
+ */
+static int floppy_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int rc;
+
+    posix_aio_init();
+
+    s->type = FTYPE_FD;
+
+    rc = raw_open_common(bs, filename, flags, O_NONBLOCK);
+    if (rc != 0) {
+        return rc;
+    }
+
+    /* close fd so that we can reopen it as needed */
+    close(s->fd);
+    s->fd = -1;
+    s->fd_media_changed = 1;
+
+    return 0;
+}
+
+/* Probe score: 100 for names starting with "/dev/fd", 0 for anything else. */
+static int floppy_probe_device(const char *filename)
+{
+    return strstart(filename, "/dev/fd", NULL) ? 100 : 0;
+}
+
+
+/* A floppy counts as inserted (1) when fd_open() succeeds, else 0. */
+static int floppy_is_inserted(BlockDriverState *bs)
+{
+    if (fd_open(bs) < 0) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Report and clear the media-changed flag.
+ *
+ * XXX: there is no true media changed indication.  The flag is only
+ * updated by fd_open(), so a floppy swapped without any attempt to read it
+ * goes unnoticed.
+ */
+static int floppy_media_changed(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int changed;
+
+    /* result deliberately ignored: only its effect on the flag matters */
+    fd_open(bs);
+
+    changed = s->fd_media_changed;
+    s->fd_media_changed = 0;
+#ifdef DEBUG_FLOPPY
+    printf("Floppy changed=%d\n", changed);
+#endif
+    return changed;
+}
+
+/*
+ * Eject the floppy.  The cached descriptor is closed first; the device is
+ * then reopened non-blocking just long enough to issue FDEJECT.  Failures
+ * are reported with perror() at most; the function always returns 0.
+ * eject_flag is not consulted.
+ */
+static int floppy_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+    int ctl_fd;
+
+    if (s->fd >= 0) {
+        close(s->fd);
+        s->fd = -1;
+    }
+
+    ctl_fd = open(bs->filename, s->open_flags | O_NONBLOCK);
+    if (ctl_fd < 0) {
+        return 0;
+    }
+
+    if (ioctl(ctl_fd, FDEJECT, 0) < 0) {
+        perror("FDEJECT");
+    }
+    close(ctl_fd);
+
+    return 0;
+}
+
+/* Driver table for host floppy drives ("host_floppy", Linux only). */
+static BlockDriver bdrv_host_floppy = {
+    .format_name        = "host_floppy",
+    .instance_size      = sizeof(BDRVRawState),
+    .bdrv_probe_device  = floppy_probe_device,
+
+    /* open / close / create */
+    .bdrv_open          = floppy_open,
+    .bdrv_close         = raw_close,
+    .bdrv_create        = hdev_create,
+
+    /* synchronous I/O */
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_flush         = raw_flush,
+    .bdrv_getlength     = raw_getlength,
+
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+#endif
+
+    /* removable device support */
+    .bdrv_is_inserted   = floppy_is_inserted,
+    .bdrv_media_changed = floppy_media_changed,
+    .bdrv_eject         = floppy_eject,
+};
+
+/*
+ * Open hook of the Linux host_cdrom driver.  O_NONBLOCK is added so that
+ * the open does not fail when no CD is inserted.  Returns the result of
+ * raw_open_common().
+ */
+static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *state = bs->opaque;
+
+    state->type = FTYPE_CD;
+
+    return raw_open_common(bs, filename, flags, O_NONBLOCK);
+}
+
+/* Probe score: 100 for names starting with "/dev/cd", 0 for anything else. */
+static int cdrom_probe_device(const char *filename)
+{
+    return strstart(filename, "/dev/cd", NULL) ? 100 : 0;
+}
+
+/*
+ * Returns 1 if CDROM_DRIVE_STATUS reports CDS_DISC_OK for the current
+ * slot, 0 for every other status (including an ioctl failure).
+ */
+static int cdrom_is_inserted(BlockDriverState *bs)
+{
+    const BDRVRawState *state = bs->opaque;
+    int status = ioctl(state->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
+
+    return status == CDS_DISC_OK;
+}
+
+/*
+ * Open or close the CD-ROM tray.
+ *
+ * eject_flag non-zero ejects the medium (CDROMEJECT); zero closes the tray
+ * (CDROMCLOSETRAY).  A failing ioctl is reported with perror(), labelled
+ * with the request that actually failed.  Always returns 0.
+ */
+static int cdrom_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (eject_flag) {
+        if (ioctl(s->fd, CDROMEJECT, NULL) < 0)
+            perror("CDROMEJECT");
+    } else {
+        if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
+            perror("CDROMCLOSETRAY");
+    }
+
+    return 0;
+}
+
+/*
+ * Lock (locked != 0) or unlock the drive door with CDROM_LOCKDOOR.
+ *
+ * A failure is ignored on purpose and not even logged: it can happen when
+ * the distribution automatically mounts the CD-ROM.  Always returns 0.
+ */
+static int cdrom_set_locked(BlockDriverState *bs, int locked)
+{
+    const BDRVRawState *state = bs->opaque;
+
+    (void) ioctl(state->fd, CDROM_LOCKDOOR, locked);
+
+    return 0;
+}
+
+/* Driver table for host CD-ROM drives ("host_cdrom", Linux flavour). */
+static BlockDriver bdrv_host_cdrom = {
+    .format_name        = "host_cdrom",
+    .instance_size      = sizeof(BDRVRawState),
+    .bdrv_probe_device  = cdrom_probe_device,
+
+    /* open / close / create */
+    .bdrv_open          = cdrom_open,
+    .bdrv_close         = raw_close,
+    .bdrv_create        = hdev_create,
+
+    /* synchronous I/O */
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_flush         = raw_flush,
+    .bdrv_getlength     = raw_getlength,
+
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+#endif
+
+    /* removable device support */
+    .bdrv_is_inserted   = cdrom_is_inserted,
+    .bdrv_eject         = cdrom_eject,
+    .bdrv_set_locked    = cdrom_set_locked,
+
+    /* generic scsi device */
+    .bdrv_ioctl         = hdev_ioctl,
+#ifdef CONFIG_AIO
+    .bdrv_aio_ioctl     = hdev_aio_ioctl,
+#endif
+};
+#endif /* __linux__ */
+
+#ifdef __FreeBSD__
+/*
+ * Open hook of the FreeBSD host_cdrom driver.  After a successful open the
+ * door is unlocked with CDIOCALLOW (the ioctl result is not checked).
+ * Returns 0 on success or the non-zero result of raw_open_common().
+ */
+static int cdrom_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int rc;
+
+    s->type = FTYPE_CD;
+
+    rc = raw_open_common(bs, filename, flags, 0);
+    if (rc != 0) {
+        return rc;
+    }
+
+    /* make sure the door isn't locked at this time */
+    ioctl(s->fd, CDIOCALLOW);
+
+    return 0;
+}
+
+/*
+ * Probe score: 100 for names starting with "/dev/cd" or "/dev/acd",
+ * 0 for anything else.
+ */
+static int cdrom_probe_device(const char *filename)
+{
+    if (strstart(filename, "/dev/cd", NULL)) {
+        return 100;
+    }
+    if (strstart(filename, "/dev/acd", NULL)) {
+        return 100;
+    }
+    return 0;
+}
+
+/*
+ * Close and reopen the device to force a reread of a possibly changed or
+ * newly loaded disc (FreeBSD seems to not notice sometimes...).
+ *
+ * Returns 0 with a fresh descriptor in s->fd, or -EIO with s->fd == -1 if
+ * the device cannot be opened again.
+ */
+static int cdrom_reopen(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->fd >= 0) {
+        close(s->fd);
+    }
+
+    s->fd = open(bs->filename, s->open_flags, 0644);
+    if (s->fd < 0) {
+        s->fd = -1;
+        return -EIO;
+    }
+
+    /* make sure the door isn't locked at this time */
+    ioctl(s->fd, CDIOCALLOW);
+
+    return 0;
+}
+
+/* A disc counts as inserted (1) when raw_getlength() is positive, else 0. */
+static int cdrom_is_inserted(BlockDriverState *bs)
+{
+    int64_t length = raw_getlength(bs);
+
+    return length > 0;
+}
+
+/*
+ * Open (eject_flag != 0, CDIOCEJECT) or close (CDIOCCLOSE) the tray.  The
+ * door is unlocked first, and the device is reopened afterwards.
+ *
+ * Returns 0 on success, -ENOTSUP if no descriptor is open or the reopen
+ * fails.  A failing eject/close ioctl is only reported with perror().
+ */
+static int cdrom_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->fd < 0) {
+        return -ENOTSUP;
+    }
+
+    (void) ioctl(s->fd, CDIOCALLOW);
+
+    if (eject_flag) {
+        if (ioctl(s->fd, CDIOCEJECT) < 0) {
+            perror("CDIOCEJECT");
+        }
+    } else {
+        if (ioctl(s->fd, CDIOCCLOSE) < 0) {
+            perror("CDIOCCLOSE");
+        }
+    }
+
+    return (cdrom_reopen(bs) < 0) ? -ENOTSUP : 0;
+}
+
+/*
+ * Lock (CDIOCPREVENT) or unlock (CDIOCALLOW) the drive door.
+ *
+ * Returns -ENOTSUP if no descriptor is open, 0 otherwise.  A failing ioctl
+ * is ignored on purpose and not logged: it can happen when the
+ * distribution automatically mounts the CD-ROM.
+ */
+static int cdrom_set_locked(BlockDriverState *bs, int locked)
+{
+    const BDRVRawState *state = bs->opaque;
+
+    if (state->fd < 0) {
+        return -ENOTSUP;
+    }
+
+    (void) ioctl(state->fd, (locked ? CDIOCPREVENT : CDIOCALLOW));
+
+    return 0;
+}
+
+/* Driver table for host CD-ROM drives ("host_cdrom", FreeBSD flavour). */
+static BlockDriver bdrv_host_cdrom = {
+    .format_name        = "host_cdrom",
+    .instance_size      = sizeof(BDRVRawState),
+    .bdrv_probe_device  = cdrom_probe_device,
+
+    /* open / close / create */
+    .bdrv_open          = cdrom_open,
+    .bdrv_close         = raw_close,
+    .bdrv_create        = hdev_create,
+
+    /* synchronous I/O */
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_flush         = raw_flush,
+    .bdrv_getlength     = raw_getlength,
+
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+#endif
+
+    /* removable device support */
+    .bdrv_is_inserted   = cdrom_is_inserted,
+    .bdrv_eject         = cdrom_eject,
+    .bdrv_set_locked    = cdrom_set_locked,
+};
+#endif /* __FreeBSD__ */
+
+/*
+ * Register every driver defined in this file.  The order matters: the
+ * driver registered last is probed first, so the most specific drivers
+ * (floppy, CD-ROM) go in after the generic ones.
+ */
+static void bdrv_raw_init(void)
+{
+    bdrv_register(&bdrv_raw);
+    bdrv_register(&bdrv_host_device);
+#if defined(__linux__)
+    bdrv_register(&bdrv_host_floppy);
+#endif
+#if defined(__linux__) || defined(__FreeBSD__)
+    bdrv_register(&bdrv_host_cdrom);
+#endif
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/raw-win32.c b/block/raw-win32.c
new file mode 100644
index 0000000..72acad5
--- /dev/null
+++ b/block/raw-win32.c
@@ -0,0 +1,419 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "block_int.h"
+#include "module.h"
+#include <windows.h>
+#include <winioctl.h>
+
+/* Kinds of object a BDRVRawState can refer to (stored in its 'type' field). */
+#define FTYPE_FILE 0
+#define FTYPE_CD     1
+#define FTYPE_HARDDISK 2
+
+/* Per-image state used by both the "raw" and "host_device" drivers. */
+typedef struct BDRVRawState {
+    HANDLE hfile;        /* handle obtained from CreateFile() */
+    int type;            /* one of the FTYPE_* values */
+    char drive_path[16]; /* format: "d:\" */
+} BDRVRawState;
+
+/*
+ * Set the size of the file behind CRT descriptor 'fd' to 'length' bytes.
+ *
+ * The current file position is saved first and restored afterwards, so the
+ * position seen by the caller does not change.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int qemu_ftruncate64(int fd, int64_t length)
+{
+    LARGE_INTEGER li;
+    LONG high;
+    DWORD low;
+    HANDLE h;
+    BOOL res;
+
+    /* refuse lengths above 32 bits when GetVersion() has its top bit set */
+    if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0)
+        return -1;
+
+    h = (HANDLE)_get_osfhandle(fd);
+
+    /* get current position, ftruncate does not change position */
+    li.HighPart = 0;
+    li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT);
+    if (li.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+        return -1;
+
+    /*
+     * SetFilePointer returns the low 32 bits of the new position, so 0 is a
+     * perfectly valid result (length 0 or any multiple of 4 GiB).  Failure
+     * is signalled by 0xffffffff together with an error code, exactly as
+     * checked for the FILE_CURRENT call above.
+     */
+    high = length >> 32;
+    low = SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN);
+    if (low == 0xffffffffUL && GetLastError() != NO_ERROR)
+        return -1;
+    res = SetEndOfFile(h);
+
+    /* back to old position */
+    SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN);
+    return res ? 0 : -1;
+}
+
+/*
+ * Issue FSCTL_SET_SPARSE on the file behind CRT descriptor 'fd'.
+ * Returns the DeviceIoControl() result converted to int.
+ */
+static int set_sparse(int fd)
+{
+    HANDLE h = (HANDLE)_get_osfhandle(fd);
+    DWORD bytes_returned;
+    BOOL ok;
+
+    ok = DeviceIoControl(h, FSCTL_SET_SPARSE, NULL, 0, NULL, 0,
+                         &bytes_returned, NULL);
+    return (int) ok;
+}
+
+/*
+ * Open (or, with BDRV_O_CREAT, create) a plain image file.
+ *
+ * Returns 0 on success, -EACCES when CreateFile() reports
+ * ERROR_ACCESS_DENIED and -1 for any other failure.
+ */
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int access_flags = GENERIC_READ;
+    int create_flags = OPEN_EXISTING;
+    DWORD attrs = FILE_ATTRIBUTE_NORMAL;
+
+    s->type = FTYPE_FILE;
+
+    if ((flags & BDRV_O_ACCESS) == O_RDWR)
+        access_flags |= GENERIC_WRITE;
+    if (flags & BDRV_O_CREAT)
+        create_flags = CREATE_ALWAYS;
+
+    /* write-through is dropped only when BDRV_O_CACHE_WB is requested */
+    if (flags & BDRV_O_NOCACHE)
+        attrs |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        attrs |= FILE_FLAG_WRITE_THROUGH;
+
+    s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL,
+                          create_flags, attrs, NULL);
+    if (s->hfile != INVALID_HANDLE_VALUE)
+        return 0;
+
+    return GetLastError() == ERROR_ACCESS_DENIED ? -EACCES : -1;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Returns 0 when the whole request was read, the number of bytes actually
+ * read when the transfer was short, and -EIO when ReadFile() fails.
+ */
+static int raw_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    DWORD ret_count = 0;
+    int64_t offset = sector_num * 512;
+    int count = nb_sectors * 512;
+
+    /* the OVERLAPPED structure only carries the absolute file offset */
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = offset;
+    ov.OffsetHigh = offset >> 32;
+    if (!ReadFile(s->hfile, buf, count, &ret_count, &ov)) {
+        /*
+         * A read that starts at end of file keeps returning the byte count
+         * as before; any other failure must not be mistaken for the
+         * "0 == everything read" success value.
+         */
+        if (GetLastError() != ERROR_HANDLE_EOF)
+            return -EIO;
+        return ret_count;
+    }
+    if (ret_count == count)
+        ret_count = 0;
+    return ret_count;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors from buf starting at sector_num.
+ *
+ * Returns 0 when the whole request was written, the number of bytes
+ * actually written when the transfer was short, and -EIO when WriteFile()
+ * fails.
+ */
+static int raw_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    DWORD ret_count = 0;
+    int64_t offset = sector_num * 512;
+    int count = nb_sectors * 512;
+
+    /* the OVERLAPPED structure only carries the absolute file offset */
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = offset;
+    ov.OffsetHigh = offset >> 32;
+    /* a failed call must not be reported through the byte count (0 == ok) */
+    if (!WriteFile(s->hfile, buf, count, &ret_count, &ov))
+        return -EIO;
+    if (ret_count == count)
+        ret_count = 0;
+    return ret_count;
+}
+
+/* Call FlushFileBuffers() on the image's file handle; the result is ignored. */
+static void raw_flush(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+    HANDLE h = state->hfile;
+
+    FlushFileBuffers(h);
+}
+
+/* Close the image's file handle; the CloseHandle() result is ignored. */
+static void raw_close(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+    HANDLE h = state->hfile;
+
+    CloseHandle(h);
+}
+
+/*
+ * Resize the image file to 'offset' bytes.
+ *
+ * Returns 0 on success and -EIO when either the seek or SetEndOfFile()
+ * fails.  The file position is left at 'offset'.
+ */
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVRawState *s = bs->opaque;
+    LONG low, high;
+    DWORD pos;
+
+    low = offset;
+    high = offset >> 32;
+    /*
+     * SetFilePointer returns the low 32 bits of the new position: 0 is a
+     * valid result (offset 0 or a multiple of 4 GiB), while failure is
+     * 0xffffffff combined with an error code from GetLastError().
+     */
+    pos = SetFilePointer(s->hfile, low, &high, FILE_BEGIN);
+    if (pos == 0xffffffffUL && GetLastError() != NO_ERROR)
+        return -EIO;
+    if (!SetEndOfFile(s->hfile))
+        return -EIO;
+    return 0;
+}
+
+/*
+ * Return the size in bytes of the object behind the image, or -EIO.
+ *
+ * The query depends on s->type:
+ *   FTYPE_FILE     - GetFileSize() on the handle
+ *   FTYPE_CD       - total size reported by GetDiskFreeSpaceEx() for
+ *                    s->drive_path
+ *   FTYPE_HARDDISK - DiskSize from IOCTL_DISK_GET_DRIVE_GEOMETRY_EX
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    LARGE_INTEGER l;
+    ULARGE_INTEGER available, total, total_free;
+    DISK_GEOMETRY_EX dg;
+    DWORD count;
+    BOOL status;
+
+    switch(s->type) {
+    case FTYPE_FILE:
+        l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart);
+        if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+            return -EIO;
+        break;
+    case FTYPE_CD:
+        if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free))
+            return -EIO;
+        l.QuadPart = total.QuadPart;
+        break;
+    case FTYPE_HARDDISK:
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        /* without this check 'l' would be returned uninitialized */
+        if (!status)
+            return -EIO;
+        l = dg.DiskSize;
+        break;
+    default:
+        return -EIO;
+    }
+    return l.QuadPart;
+}
+
+/*
+ * Create an empty raw image named 'filename'.
+ *
+ * The size comes from the BLOCK_OPT_SIZE option and is rounded down to a
+ * multiple of 512 bytes.  The file is marked sparse before being extended.
+ * Returns 0 on success, -EIO if the file cannot be opened.
+ */
+static int raw_create(const char *filename, QEMUOptionParameter *options)
+{
+    QEMUOptionParameter *opt;
+    int64_t nb_sectors = 0;
+    int fd;
+
+    /* Read out options */
+    for (opt = options; opt && opt->name; opt++) {
+        if (strcmp(opt->name, BLOCK_OPT_SIZE) == 0)
+            nb_sectors = opt->value.n / 512;
+    }
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (fd < 0)
+        return -EIO;
+
+    set_sparse(fd);
+    ftruncate(fd, nb_sectors * 512);
+    close(fd);
+    return 0;
+}
+
+/*
+ * Creation options for the "raw" format, terminated by a NULL-named entry.
+ * raw_create() reads BLOCK_OPT_SIZE from the parsed option list.
+ */
+static QEMUOptionParameter raw_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+/*
+ * Driver table for plain image files ("raw").  It provides no probe
+ * callback and is the only table here that supports create and truncate.
+ */
+static BlockDriver bdrv_raw = {
+    .format_name	= "raw",
+    .instance_size	= sizeof(BDRVRawState),
+    .bdrv_open		= raw_open,
+    .bdrv_close		= raw_close,
+    .bdrv_create	= raw_create,
+    .bdrv_flush		= raw_flush,
+    .bdrv_read		= raw_read,
+    .bdrv_write		= raw_write,
+    .bdrv_truncate	= raw_truncate,
+    .bdrv_getlength	= raw_getlength,
+
+    .create_options = raw_create_options,
+};
+
+/***********************************************/
+/* host device */
+
+/*
+ * Look for the first logical drive whose type is DRIVE_CDROM and store its
+ * device name ("\\.\X:") in cdrom_name.
+ *
+ * Returns 0 when a drive was found, -1 otherwise.
+ */
+static int find_cdrom(char *cdrom_name, int cdrom_name_size)
+{
+    char drives[256];
+    char *drive;
+
+    /* the list is a sequence of NUL-terminated roots ended by an empty one */
+    memset(drives, 0, sizeof(drives));
+    GetLogicalDriveStrings(sizeof(drives), drives);
+
+    for (drive = drives; drive[0] != '\0'; drive += lstrlen(drive) + 1) {
+        if (GetDriveType(drive) == DRIVE_CDROM) {
+            snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", drive[0]);
+            return 0;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Classify 'filename' as FTYPE_FILE, FTYPE_CD or FTYPE_HARDDISK.
+ *
+ * Only names starting with "\\.\" or "//./" are treated as devices.  For
+ * those that are not "PhysicalDrive..." names, s->drive_path is filled
+ * with the "X:\" root built from the first character after the prefix and
+ * the answer comes from GetDriveType().
+ */
+static int find_device_type(BlockDriverState *bs, const char *filename)
+{
+    BDRVRawState *s = bs->opaque;
+    const char *rest;
+
+    if (!strstart(filename, "\\\\.\\", &rest) &&
+        !strstart(filename, "//./", &rest))
+        return FTYPE_FILE;
+
+    if (stristart(rest, "PhysicalDrive", NULL))
+        return FTYPE_HARDDISK;
+
+    snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", rest[0]);
+    switch (GetDriveType(s->drive_path)) {
+    case DRIVE_REMOVABLE:
+    case DRIVE_FIXED:
+        return FTYPE_HARDDISK;
+    case DRIVE_CDROM:
+        return FTYPE_CD;
+    default:
+        return FTYPE_FILE;
+    }
+}
+
+/*
+ * Probe score for the host_device driver: 100 for names starting with
+ * "/dev/cdrom" or accepted by is_windows_drive(), 0 for everything else.
+ */
+static int hdev_probe_device(const char *filename)
+{
+    if (strstart(filename, "/dev/cdrom", NULL) || is_windows_drive(filename))
+        return 100;
+    return 0;
+}
+
+/*
+ * Open a host device.
+ *
+ * "/dev/cdrom..." is mapped to the first CD-ROM drive found and a bare
+ * drive letter ("x:") to its "\\.\x:" device name; any other name is used
+ * unchanged.  The device is always opened with OPEN_EXISTING.
+ *
+ * Returns 0 on success, -ENOENT when no CD-ROM drive exists, -EACCES on
+ * ERROR_ACCESS_DENIED and -1 for any other CreateFile() failure.
+ */
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    char device_name[64];
+    int access_flags = GENERIC_READ;
+    DWORD attrs = FILE_ATTRIBUTE_NORMAL;
+    int is_letter;
+
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        if (find_cdrom(device_name, sizeof(device_name)) < 0)
+            return -ENOENT;
+        filename = device_name;
+    } else {
+        is_letter = (filename[0] >= 'a' && filename[0] <= 'z') ||
+                    (filename[0] >= 'A' && filename[0] <= 'Z');
+        /* transform drive letters into device name */
+        if (is_letter && filename[1] == ':' && filename[2] == '\0') {
+            snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]);
+            filename = device_name;
+        }
+    }
+    s->type = find_device_type(bs, filename);
+
+    if ((flags & BDRV_O_ACCESS) == O_RDWR)
+        access_flags |= GENERIC_WRITE;
+
+    /* write-through is dropped only when BDRV_O_CACHE_WB is requested */
+    if (flags & BDRV_O_NOCACHE)
+        attrs |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        attrs |= FILE_FLAG_WRITE_THROUGH;
+
+    s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL,
+                          OPEN_EXISTING, attrs, NULL);
+    if (s->hfile != INVALID_HANDLE_VALUE)
+        return 0;
+
+    return GetLastError() == ERROR_ACCESS_DENIED ? -EACCES : -1;
+}
+
+/*
+ * Disabled removable-media callbacks.  Nothing in this section is compiled
+ * and none of it is referenced by the driver tables below.
+ *
+ * NOTE(review): raw_eject() as written would not build if enabled: it uses
+ * 's' and 'lpBytesReturned' without declaring them, never uses 'ret_count',
+ * and has no return statement.
+ */
+#if 0
+/***********************************************/
+/* removable device additional commands */
+
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    return 1;
+}
+
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    DWORD ret_count;
+
+    if (s->type == FTYPE_FILE)
+        return -ENOTSUP;
+    if (eject_flag) {
+        DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA,
+                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
+    } else {
+        DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA,
+                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
+    }
+}
+
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    return -ENOTSUP;
+}
+#endif
+
+/*
+ * Driver table for host devices ("host_device").  Only probing and opening
+ * are device specific; the remaining callbacks are shared with "raw".
+ * No create or truncate callback is provided.
+ */
+static BlockDriver bdrv_host_device = {
+    .format_name	= "host_device",
+    .instance_size	= sizeof(BDRVRawState),
+    .bdrv_probe_device	= hdev_probe_device,
+    .bdrv_open		= hdev_open,
+    .bdrv_close		= raw_close,
+    .bdrv_flush		= raw_flush,
+
+    .bdrv_read		= raw_read,
+    .bdrv_write	        = raw_write,
+    .bdrv_getlength	= raw_getlength,
+};
+
+/* Register the win32 raw drivers: "raw" first, then "host_device". */
+static void bdrv_raw_init(void)
+{
+    BlockDriver *const drivers[] = { &bdrv_raw, &bdrv_host_device };
+    size_t i;
+
+    for (i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++)
+        bdrv_register(drivers[i]);
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/vmdk.c b/block/vmdk.c
new file mode 100644
index 0000000..f21f02b
--- /dev/null
+++ b/block/vmdk.c
@@ -0,0 +1,869 @@
+/*
+ * Block driver for the VMDK format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ * Copyright (c) 2005 Filip Navara
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/*
+ * Image signatures: the characters "COWD" and "KDMV" packed with the first
+ * character in the most significant byte.  They are compared against the
+ * first four bytes of the file after be32_to_cpu().
+ */
+#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
+#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
+
+/*
+ * Header that follows the 4-byte magic in a VMDK3 image.  vmdk_open()
+ * converts the fields it uses with le32_to_cpu(); only disk_sectors,
+ * granularity and l1dir_offset are read there.
+ */
+typedef struct {
+    uint32_t version;
+    uint32_t flags;
+    uint32_t disk_sectors;      /* becomes bs->total_sectors */
+    uint32_t granularity;       /* becomes the cluster size in sectors */
+    uint32_t l1dir_offset;      /* L1 table position, in 512-byte sectors */
+    uint32_t l1dir_size;
+    uint32_t file_sectors;
+    uint32_t cylinders;
+    uint32_t heads;
+    uint32_t sectors_per_track;
+} VMDK3Header;
+
+/*
+ * Header that follows the 4-byte magic in a VMDK4 image.  The structure is
+ * packed because it is read straight from the file.  vmdk_open() converts
+ * the fields it uses with le32_to_cpu()/le64_to_cpu().
+ */
+typedef struct {
+    uint32_t version;
+    uint32_t flags;
+    int64_t capacity;           /* becomes bs->total_sectors */
+    int64_t granularity;        /* becomes the cluster size in sectors */
+    int64_t desc_offset;
+    int64_t desc_size;
+    int32_t num_gtes_per_gte;   /* becomes the number of entries per L2 table */
+    int64_t rgd_offset;         /* in sectors; used as the primary L1 table */
+    int64_t gd_offset;          /* in sectors; used as the backup L1 table */
+    int64_t grain_offset;
+    char filler[1];
+    char check_bytes[4];
+} __attribute__((packed)) VMDK4Header;
+
+/* Number of L2 tables kept in memory per image. */
+#define L2_CACHE_SIZE 16
+
+/* Per-image state of the VMDK driver. */
+typedef struct BDRVVmdkState {
+    BlockDriverState *hd;               /* underlying file, from bdrv_file_open() */
+    int64_t l1_table_offset;            /* byte offset of the L1 table */
+    int64_t l1_backup_table_offset;     /* byte offset of the backup L1 table, 0 if none */
+    uint32_t *l1_table;
+    uint32_t *l1_backup_table;
+    unsigned int l1_size;               /* number of L1 entries */
+    uint32_t l1_entry_sectors;          /* l2_size * cluster_sectors */
+
+    unsigned int l2_size;               /* number of entries in one L2 table */
+    uint32_t *l2_cache;                 /* L2_CACHE_SIZE tables of l2_size entries each */
+    uint32_t l2_cache_offsets[L2_CACHE_SIZE];   /* L1 entry value each slot was loaded from */
+    uint32_t l2_cache_counts[L2_CACHE_SIZE];    /* hit counters used to choose a victim */
+
+    unsigned int cluster_sectors;       /* cluster (grain) size in 512-byte sectors */
+    uint32_t parent_cid;                /* "parentCID" value read from the descriptor */
+    int is_parent;                      /* set when opened while parent_open was set */
+} BDRVVmdkState;
+
+/*
+ * Description of a freshly allocated cluster, filled by
+ * get_cluster_offset() and consumed by vmdk_L2update().
+ */
+typedef struct VmdkMetaData {
+    uint32_t offset;            /* L2 entry value, already converted with cpu_to_le32() */
+    unsigned int l1_index;
+    unsigned int l2_index;
+    unsigned int l2_offset;     /* L1 entry value: position of the L2 table in sectors */
+    int valid;                  /* non-zero when the other fields must be written out */
+} VmdkMetaData;
+
+/*
+ * Image and cluster most recently allocated by get_cluster_offset();
+ * get_whole_cluster() writes the parent's data there.
+ */
+typedef struct ActiveBDRVState{
+    BlockDriverState *hd;            // active image handler
+    uint64_t cluster_offset;         // current write offset
+}ActiveBDRVState;
+
+/* File-wide instance shared by every open VMDK image. */
+static ActiveBDRVState activeBDRV;
+
+
+/*
+ * Probe callback: return 100 when the first four bytes of buf hold the
+ * VMDK3 or VMDK4 magic, 0 otherwise (including when fewer than four bytes
+ * are available).  'filename' is not used.
+ */
+static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    uint32_t magic;
+
+    if (buf_size < 4)
+        return 0;
+    /*
+     * Copy the bytes out instead of dereferencing a cast pointer: buf is a
+     * const byte buffer with no alignment guarantee.
+     */
+    memcpy(&magic, buf, sizeof(magic));
+    magic = be32_to_cpu(magic);
+    if (magic == VMDK3_MAGIC || magic == VMDK4_MAGIC)
+        return 100;
+    return 0;
+}
+
+/* When defined, vmdk_is_cid_valid() compares the parent's CID. */
+#define CHECK_CID 1
+
+#define SECTOR_SIZE 512
+/* parenthesized so the macro stays one operand inside larger expressions */
+#define DESC_SIZE (20 * SECTOR_SIZE)	// 20 sectors of 512 bytes each
+#define HEADER_SIZE 512   			// first sector of 512 bytes
+
+/*
+ * Read a content ID from the text descriptor stored at offset 0x200.
+ *
+ * parent != 0 selects the "parentCID" key, otherwise "CID" is used.
+ * Returns the parsed hexadecimal value, or 0 when the descriptor cannot be
+ * read, the key is missing or no number follows it.
+ */
+static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char desc[DESC_SIZE + 1];
+    uint32_t cid = 0;
+    const char *p_name, *cid_str;
+    size_t cid_str_size;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return 0;
+    /* the on-disk text need not be terminated; strstr() requires it */
+    desc[DESC_SIZE] = '\0';
+
+    if (parent) {
+        cid_str = "parentCID";
+        cid_str_size = sizeof("parentCID");
+    } else {
+        cid_str = "CID";
+        cid_str_size = sizeof("CID");
+    }
+
+    /*
+     * cid_str_size counts the terminating NUL, which here accounts for the
+     * '=' that follows the key.  Only skip it when it is really there.
+     */
+    p_name = strstr(desc, cid_str);
+    if (p_name != NULL && strlen(p_name) >= cid_str_size) {
+        p_name += cid_str_size;
+        if (sscanf(p_name, "%x", &cid) != 1)
+            cid = 0;
+    }
+
+    return cid;
+}
+
+/*
+ * Replace the "CID" value in the descriptor at offset 0x200 with 'cid'.
+ *
+ * The text from "parentCID" onwards is saved first and appended again
+ * after the new value, because the new number may differ in length.
+ *
+ * Returns 0 on success, -1 when the descriptor cannot be read or written
+ * or when it contains no "parentCID" entry.
+ */
+static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char desc[DESC_SIZE + 1], tmp_desc[DESC_SIZE + 1];
+    char *p_name, *tmp_str;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    /* the on-disk text need not be terminated; strstr() requires it */
+    desc[DESC_SIZE] = '\0';
+
+    tmp_str = strstr(desc,"parentCID");
+    if (tmp_str == NULL)
+        return -1;      /* nothing to re-append: leave the descriptor alone */
+    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+
+    p_name = strstr(desc,"CID");
+    /* sizeof("CID") also covers the '=' after the key; make sure it exists */
+    if (p_name != NULL && strlen(p_name) >= sizeof("CID")) {
+        p_name += sizeof("CID");
+        /* stay within the DESC_SIZE bytes that are written back below */
+        snprintf(p_name, DESC_SIZE - (p_name - desc), "%x\n", cid);
+        pstrcat(desc, DESC_SIZE, tmp_desc);
+    }
+
+    if (bdrv_pwrite(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    return 0;
+}
+
+/*
+ * Check that the backing image still carries the CID recorded as this
+ * image's parent CID.
+ *
+ * Returns 1 when there is no backing image, when the CIDs match, or when
+ * CHECK_CID is not defined; returns 0 on a mismatch.
+ */
+static int vmdk_is_cid_valid(BlockDriverState *bs)
+{
+#ifdef CHECK_CID
+    BDRVVmdkState *s = bs->opaque;
+    BlockDriverState *parent = s->hd->backing_hd;
+
+    if (parent != NULL && vmdk_read_cid(parent, 0) != s->parent_cid)
+        return 0;       /* CID not valid */
+#endif
+    return 1;           /* CID valid */
+}
+
+/*
+ * Create 'filename' as a VMDK4 snapshot on top of 'backing_file'.
+ *
+ * The parent's header sector and both grain directories (RGD and GD) are
+ * copied, the file is sized to the parent's grain_offset, and a new
+ * descriptor naming the parent is written at offset 0x200.
+ *
+ * Returns 0 on success and -1 on any failure; both descriptors are closed
+ * in either case.
+ *
+ * NOTE(review): the header fields are used as read from disk, without the
+ * le32/le64 conversions that vmdk_open() applies - confirm this is only
+ * meant for little-endian hosts.
+ */
+static int vmdk_snapshot_create(const char *filename, const char *backing_file)
+{
+    int snp_fd, p_fd;
+    uint32_t p_cid = 0;         /* stays 0 when the parent has no CID entry */
+    char *p_name, *gd_buf, *rgd_buf;
+    const char *real_filename, *temp_str;
+    VMDK4Header header;
+    uint32_t gde_entries, gd_size;
+    int64_t gd_offset, rgd_offset, capacity, gt_size;
+    char p_desc[DESC_SIZE + 1], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
+    static const char desc_template[] =
+    "# Disk DescriptorFile\n"
+    "version=1\n"
+    "CID=%x\n"
+    "parentCID=%x\n"
+    "createType=\"monolithicSparse\"\n"
+    "parentFileNameHint=\"%s\"\n"
+    "\n"
+    "# Extent description\n"
+    "RW %u SPARSE \"%s\"\n"
+    "\n"
+    "# The Disk Data Base \n"
+    "#DDB\n"
+    "\n";
+
+    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
+    if (snp_fd < 0)
+        return -1;
+    p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
+    if (p_fd < 0) {
+        close(snp_fd);
+        return -1;
+    }
+
+    /* read the header */
+    if (lseek(p_fd, 0x0, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    /* write the header */
+    if (lseek(snp_fd, 0x0, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, hdr, HEADER_SIZE) == -1)
+        goto fail;
+
+    memset(&header, 0, sizeof(header));
+    memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
+
+    ftruncate(snp_fd, header.grain_offset << 9);
+    /* the descriptor offset = 0x200 */
+    if (lseek(p_fd, 0x200, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE)
+        goto fail;
+    /* the on-disk text need not be terminated; strstr() requires it */
+    p_desc[DESC_SIZE] = '\0';
+
+    /* sizeof("CID") also covers the '=' after the key; make sure it exists */
+    p_name = strstr(p_desc,"CID");
+    if (p_name != NULL && strlen(p_name) >= sizeof("CID")) {
+        p_name += sizeof("CID");
+        if (sscanf(p_name,"%x",&p_cid) != 1)
+            p_cid = 0;
+    }
+
+    /* keep only the last path component of the snapshot name */
+    real_filename = filename;
+    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, '/')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, ':')) != NULL)
+        real_filename = temp_str + 1;
+
+    snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
+             (uint32_t)header.capacity, real_filename);
+
+    /* write the descriptor */
+    if (lseek(snp_fd, 0x200, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, s_desc, strlen(s_desc)) == -1)
+        goto fail;
+
+    gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
+    rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
+    capacity = header.capacity * SECTOR_SIZE;       // Extent size
+    /*
+     * Each GDE span 32M disk, means:
+     * 512 GTE per GT, each GTE points to grain
+     */
+    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
+    if (!gt_size)
+        goto fail;
+    gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
+    gd_size = gde_entries * sizeof(uint32_t);
+
+    /* write RGD */
+    rgd_buf = qemu_malloc(gd_size);
+    if (lseek(p_fd, rgd_offset, SEEK_SET) == -1)
+        goto fail_rgd;
+    if (read(p_fd, rgd_buf, gd_size) != gd_size)
+        goto fail_rgd;
+    if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1)
+        goto fail_rgd;
+    if (write(snp_fd, rgd_buf, gd_size) == -1)
+        goto fail_rgd;
+    qemu_free(rgd_buf);
+
+    /* write GD */
+    gd_buf = qemu_malloc(gd_size);
+    if (lseek(p_fd, gd_offset, SEEK_SET) == -1)
+        goto fail_gd;
+    if (read(p_fd, gd_buf, gd_size) != gd_size)
+        goto fail_gd;
+    if (lseek(snp_fd, gd_offset, SEEK_SET) == -1)
+        goto fail_gd;
+    if (write(snp_fd, gd_buf, gd_size) == -1)
+        goto fail_gd;
+    qemu_free(gd_buf);
+
+    close(p_fd);
+    close(snp_fd);
+    return 0;
+
+    fail_gd:
+    qemu_free(gd_buf);
+    /* rgd_buf was already released above: do not fall into fail_rgd */
+    goto fail;
+    fail_rgd:
+    qemu_free(rgd_buf);
+    fail:
+    close(p_fd);
+    close(snp_fd);
+    return -1;
+}
+
+/* Close the backing image of 'bs', if it has one. */
+static void vmdk_parent_close(BlockDriverState *bs)
+{
+    BlockDriverState *parent = bs->backing_hd;
+
+    if (parent != NULL)
+        bdrv_close(parent);
+}
+
+/*
+ * Non-zero only while vmdk_parent_open() is inside bdrv_open() for a
+ * backing image; vmdk_open() reads it to force read-only mode and to mark
+ * the image as a parent.
+ */
+static int parent_open = 0;
+
+/*
+ * Open the backing image named by "parentFileNameHint" in the descriptor
+ * at offset 0x200, if there is one.
+ *
+ * The name is stored in s->hd->backing_file.  When stat() cannot find it
+ * as given, it is combined with 'filename' through path_combine().
+ *
+ * Returns 0 when there is no hint or the parent was opened, -1 otherwise.
+ */
+static int vmdk_parent_open(BlockDriverState *bs, const char * filename)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char *p_name, *end_name;
+    char desc[DESC_SIZE + 1];
+    char parent_img_name[1024];
+    struct stat file_buf;
+    int ret;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    /* the on-disk text need not be terminated; strstr() requires it */
+    desc[DESC_SIZE] = '\0';
+
+    p_name = strstr(desc,"parentFileNameHint");
+    if (p_name == NULL)
+        return 0;
+
+    /* the key is followed by '=' and an opening quote: skip both */
+    if (strlen(p_name) < sizeof("parentFileNameHint") + 1)
+        return -1;
+    p_name += sizeof("parentFileNameHint") + 1;
+    if ((end_name = strchr(p_name,'\"')) == NULL)
+        return -1;
+    if ((end_name - p_name) > sizeof (s->hd->backing_file) - 1)
+        return -1;
+
+    pstrcpy(s->hd->backing_file, end_name - p_name + 1, p_name);
+    if (stat(s->hd->backing_file, &file_buf) != 0) {
+        path_combine(parent_img_name, sizeof(parent_img_name),
+                     filename, s->hd->backing_file);
+    } else {
+        pstrcpy(parent_img_name, sizeof(parent_img_name),
+                s->hd->backing_file);
+    }
+
+    s->hd->backing_hd = bdrv_new("");
+    if (!s->hd->backing_hd) {
+        bdrv_close(s->hd);
+        return -1;
+    }
+
+    /*
+     * The flag must be cleared on failure as well, otherwise every image
+     * opened afterwards would be treated as a read-only parent.
+     */
+    parent_open = 1;
+    ret = bdrv_open(s->hd->backing_hd, parent_img_name, BDRV_O_RDONLY);
+    parent_open = 0;
+    if (ret < 0) {
+        bdrv_close(s->hd);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Open a VMDK3 or VMDK4 image.
+ *
+ * Reads the header, derives the table geometry, opens the backing image
+ * (VMDK4 only), then loads the L1 table, the backup L1 table when there is
+ * one, and allocates the L2 cache.
+ *
+ * Returns 0 on success, the bdrv_file_open() error when the file cannot be
+ * opened, and -1 for any other failure.
+ */
+static int vmdk_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVmdkState *s = bs->opaque;
+    uint32_t magic;
+    int l1_size, i, ret;
+
+    if (parent_open)
+        // Parent must be opened as RO.
+        flags = BDRV_O_RDONLY;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic))
+        goto fail;
+
+    magic = be32_to_cpu(magic);
+    if (magic == VMDK3_MAGIC) {
+        VMDK3Header header;
+
+        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
+            goto fail;
+        s->cluster_sectors = le32_to_cpu(header.granularity);
+        s->l2_size = 1 << 9;
+        s->l1_size = 1 << 6;
+        bs->total_sectors = le32_to_cpu(header.disk_sectors);
+        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
+        s->l1_backup_table_offset = 0;
+        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        /* both values are used as divisors later: reject a zero granularity */
+        if (s->l1_entry_sectors == 0)
+            goto fail;
+    } else if (magic == VMDK4_MAGIC) {
+        VMDK4Header header;
+
+        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
+            goto fail;
+        bs->total_sectors = le64_to_cpu(header.capacity);
+        s->cluster_sectors = le64_to_cpu(header.granularity);
+        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
+        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        if (s->l1_entry_sectors <= 0)
+            goto fail;
+        /* round up so a partially covered last entry is counted */
+        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
+            / s->l1_entry_sectors;
+        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
+        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
+
+        if (parent_open)
+            s->is_parent = 1;
+        else
+            s->is_parent = 0;
+
+        // try to open parent images, if exist
+        if (vmdk_parent_open(bs, filename) != 0)
+            goto fail;
+        // write the CID once after the image creation
+        s->parent_cid = vmdk_read_cid(bs,1);
+    } else {
+        goto fail;
+    }
+
+    /* read the L1 table */
+    l1_size = s->l1_size * sizeof(uint32_t);
+    s->l1_table = qemu_malloc(l1_size);
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
+        goto fail;
+    for(i = 0; i < s->l1_size; i++) {
+        le32_to_cpus(&s->l1_table[i]);
+    }
+
+    if (s->l1_backup_table_offset) {
+        s->l1_backup_table = qemu_malloc(l1_size);
+        if (bdrv_pread(s->hd, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
+            goto fail;
+        for(i = 0; i < s->l1_size; i++) {
+            le32_to_cpus(&s->l1_backup_table[i]);
+        }
+    }
+
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+    return 0;
+ fail:
+    /* NOTE(review): assumes the tables are NULL until allocated - confirm */
+    qemu_free(s->l1_backup_table);
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+                                   uint64_t offset, int allocate);
+
+/*
+ * Copy the grain that covers 'offset' from the backing image into the
+ * cluster most recently allocated in the active image (see activeBDRV).
+ *
+ * Called on the first write to a grain that does not exist yet.  Nothing
+ * is copied when there is no backing image or when the parent does not
+ * have the grain either.  The 'cluster_offset' argument is not used; the
+ * destination comes from activeBDRV.
+ *
+ * Returns 0 on success and -1 when the parent CID check fails, when the
+ * parent's grain is larger than ours, or on an I/O error.
+ */
+static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
+                             uint64_t offset, int allocate)
+{
+    uint64_t parent_cluster_offset;
+    BDRVVmdkState *s = bs->opaque;
+    BDRVVmdkState *ps, *act_s;
+    unsigned int grain_size = s->cluster_sectors*512;
+    unsigned int parent_grain_size;
+    uint8_t *whole_grain;
+    int ret = -1;
+
+    // we will be here if it's first write on non-exist grain(cluster).
+    // try to read from parent image, if exist
+    if (!s->hd->backing_hd)
+        return 0;
+    ps = s->hd->backing_hd->opaque;
+
+    if (!vmdk_is_cid_valid(bs))
+        return -1;
+
+    parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, NULL, offset, allocate);
+    if (!parent_cluster_offset)
+        return 0;
+
+    /* the buffer holds one of our grains: a larger parent grain cannot fit */
+    parent_grain_size = ps->cluster_sectors*512;
+    if (parent_grain_size > grain_size)
+        return -1;
+
+    /*
+     * Heap buffer instead of a stack array whose size comes from the image;
+     * zeroed so a smaller parent grain leaves a defined tail.
+     */
+    whole_grain = qemu_malloc(grain_size);
+    memset(whole_grain, 0, grain_size);
+
+    act_s = activeBDRV.hd->opaque;
+
+    if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, parent_grain_size) != parent_grain_size)
+        goto out;
+
+    //Write grain only into the active image
+    if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, grain_size) != grain_size)
+        goto out;
+
+    ret = 0;
+ out:
+    qemu_free(whole_grain);
+    return ret;
+}
+
+/*
+ * Write m_data->offset into entry m_data->l2_index of the L2 table that
+ * starts at sector m_data->l2_offset of 'file'.  Returns 0 or -1.
+ */
+static int vmdk_write_l2_entry(BlockDriverState *file, VmdkMetaData *m_data)
+{
+    int64_t pos = ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(m_data->offset));
+
+    if (bdrv_pwrite(file, pos, &(m_data->offset), sizeof(m_data->offset)) != sizeof(m_data->offset))
+        return -1;
+    return 0;
+}
+
+/*
+ * Store a newly allocated cluster in the on-disk L2 table and, when the
+ * image has a backup L1 table, in the corresponding backup L2 table too.
+ *
+ * In the backup case m_data->l2_offset is overwritten with the backup
+ * table's position.  Returns 0 on success, -1 on a write error.
+ */
+static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
+{
+    BDRVVmdkState *s = bs->opaque;
+
+    /* update L2 table */
+    if (vmdk_write_l2_entry(s->hd, m_data) < 0)
+        return -1;
+
+    if (s->l1_backup_table_offset == 0)
+        return 0;
+
+    /* update backup L2 table */
+    m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
+    return vmdk_write_l2_entry(s->hd, m_data) < 0 ? -1 : 0;
+}
+
+/*
+ * Translate a guest byte offset into the byte offset of its cluster in the
+ * image file.
+ *
+ * bs       - VMDK image to look in
+ * m_data   - may be NULL; otherwise 'valid' is cleared on entry and the
+ *            structure is filled when a new cluster was set up, so the
+ *            caller can write the L2 entry out with vmdk_L2update()
+ * offset   - guest offset in bytes
+ * allocate - when non-zero, a missing cluster is allocated
+ *
+ * Returns 0 when the offset lies beyond the L1 table, the L1 entry is
+ * empty, the L2 table cannot be read, the cluster is missing and
+ * 'allocate' is 0, or copying from the parent fails.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+                                   uint64_t offset, int allocate)
+{
+    BDRVVmdkState *s = bs->opaque;
+    unsigned int l1_index, l2_offset, l2_index;
+    int min_index, i, j;
+    uint32_t min_count, *l2_table, tmp = 0;
+    uint64_t cluster_offset;
+
+    if (m_data)
+        m_data->valid = 0;
+
+    /* L1 lookup: one L1 entry covers l1_entry_sectors sectors */
+    l1_index = (offset >> 9) / s->l1_entry_sectors;
+    if (l1_index >= s->l1_size)
+        return 0;
+    l2_offset = s->l1_table[l1_index];
+    if (!l2_offset)
+        return 0;
+    /* look for the L2 table in the cache, keyed by its L1 entry value */
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (l2_offset == s->l2_cache_offsets[i]) {
+            /* increment the hit count */
+            if (++s->l2_cache_counts[i] == 0xffffffff) {
+                /* halve every counter so they never wrap */
+                for(j = 0; j < L2_CACHE_SIZE; j++) {
+                    s->l2_cache_counts[j] >>= 1;
+                }
+            }
+            l2_table = s->l2_cache + (i * s->l2_size);
+            goto found;
+        }
+    }
+    /* not found: load a new entry in the least used one */
+    min_index = 0;
+    min_count = 0xffffffff;
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (s->l2_cache_counts[i] < min_count) {
+            min_count = s->l2_cache_counts[i];
+            min_index = i;
+        }
+    }
+    l2_table = s->l2_cache + (min_index * s->l2_size);
+    /*
+     * NOTE(review): if this read fails, the slot's contents may already be
+     * overwritten while l2_cache_offsets[min_index] still names the
+     * previous table - confirm whether the slot should be invalidated.
+     */
+    if (bdrv_pread(s->hd, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
+                                                                        s->l2_size * sizeof(uint32_t))
+        return 0;
+
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+ found:
+    /* L2 lookup: cached entries are converted with le32_to_cpu on use */
+    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
+    cluster_offset = le32_to_cpu(l2_table[l2_index]);
+
+    if (!cluster_offset) {
+        if (!allocate)
+            return 0;
+        // Avoid the L2 tables update for the images that have snapshots.
+        if (!s->is_parent) {
+            /* append the new cluster at the current end of the file */
+            cluster_offset = bdrv_getlength(s->hd);
+            bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));
+
+            /* from here on cluster_offset is in sectors */
+            cluster_offset >>= 9;
+            tmp = cpu_to_le32(cluster_offset);
+            l2_table[l2_index] = tmp;
+            // Save the active image state
+            activeBDRV.cluster_offset = cluster_offset;
+            activeBDRV.hd = bs;
+        }
+        /* First of all we write the grain itself, to avoid a race condition
+         * that may corrupt the image.
+         * This problem may occur because of insufficient space on host disk
+         * or inappropriate VM shutdown.
+         */
+        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
+            return 0;
+
+        if (m_data) {
+            m_data->offset = tmp;
+            m_data->l1_index = l1_index;
+            m_data->l2_index = l2_index;
+            m_data->l2_offset = l2_offset;
+            m_data->valid = 1;
+        }
+    }
+    /* convert from sectors to bytes */
+    cluster_offset <<= 9;
+    return cluster_offset;
+}
+
+/*
+ * Tell whether the cluster holding sector_num exists in this image.
+ *
+ * *pnum receives the number of sectors, at most nb_sectors, from
+ * sector_num up to the end of that cluster.  Returns 1 when the cluster is
+ * allocated and 0 otherwise.
+ */
+static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int in_cluster, count;
+    uint64_t cluster_offset;
+
+    cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
+    in_cluster = sector_num % s->cluster_sectors;
+    count = s->cluster_sectors - in_cluster;
+    *pnum = (count > nb_sectors) ? nb_sectors : count;
+    return cluster_offset ? 1 : 0;
+}
+
+/*
+ * Read nb_sectors sectors starting at sector_num into buf, one cluster at
+ * a time.
+ *
+ * Clusters missing from this image are read from the backing image when
+ * there is one (after the parent CID check) and are zero-filled otherwise.
+ * Returns 0 on success and -1 on any failure.
+ */
+static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int in_cluster, n;
+    uint64_t cluster_offset;
+
+    for (; nb_sectors > 0; nb_sectors -= n, sector_num += n, buf += n * 512) {
+        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
+        in_cluster = sector_num % s->cluster_sectors;
+
+        /* never cross a cluster boundary in one step */
+        n = s->cluster_sectors - in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+
+        if (cluster_offset) {
+            if (bdrv_pread(s->hd, cluster_offset + in_cluster * 512, buf, n * 512) != n * 512)
+                return -1;
+        } else if (s->hd->backing_hd) {
+            // try to read from parent image, if exist
+            if (!vmdk_is_cid_valid(bs))
+                return -1;
+            if (bdrv_read(s->hd->backing_hd, sector_num, buf, n) < 0)
+                return -1;
+        } else {
+            memset(buf, 0, 512 * n);
+        }
+    }
+    return 0;
+}
+
+/*
+ * Write nb_sectors sectors from buf starting at sector_num.
+ *
+ * The request is processed cluster by cluster: the cluster is looked up with
+ * allocation enabled, the data is written, and the L2 tables are updated when
+ * get_cluster_offset() marked the metadata as valid.  After the first
+ * successful chunk the CID is rewritten once; the flag guarding this is a
+ * function-static, so it is shared by every call of this function.
+ *
+ * Returns 0 on success and -1 on a bad start sector or any failure.
+ */
+static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    static int cid_update = 0;
+    BDRVVmdkState *state = bs->opaque;
+    VmdkMetaData meta;
+
+    if (sector_num > bs->total_sectors) {
+        fprintf(stderr,
+                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
+                " total_sectors=0x%" PRIx64 "\n",
+                sector_num, bs->total_sectors);
+        return -1;
+    }
+
+    while (nb_sectors > 0) {
+        int first = sector_num & (state->cluster_sectors - 1);
+        int count = state->cluster_sectors - first;
+        uint64_t grain_pos;
+
+        /* stay inside both the cluster and the request */
+        if (count > nb_sectors)
+            count = nb_sectors;
+
+        grain_pos = get_cluster_offset(bs, &meta, sector_num << 9, 1);
+        if (grain_pos == 0)
+            return -1;
+
+        if (bdrv_pwrite(state->hd, grain_pos + first * 512, buf, count * 512) != count * 512)
+            return -1;
+
+        /* the data is on disk; now the L2 tables may point at it */
+        if (meta.valid && vmdk_L2update(bs, &meta) == -1)
+            return -1;
+
+        buf += count * 512;
+        sector_num += count;
+        nb_sectors -= count;
+
+        /* refresh the CID on the first write */
+        if (cid_update == 0) {
+            vmdk_write_cid(bs, time(NULL));
+            cid_update++;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Create a new, empty VMDK4 "monolithicSparse" image at filename.
+ *
+ * Recognized options:
+ *   BLOCK_OPT_SIZE          virtual size in bytes (stored as 512-byte sectors)
+ *   BLOCK_OPT_BACKING_FILE  hands the whole job to vmdk_snapshot_create()
+ *   BLOCK_OPT_COMPAT6       descriptor gets virtualHWVersion 6 instead of 4
+ *
+ * The layout is computed in host byte order and converted to little endian
+ * only when a value is stored, so the image is identical on any host.
+ *
+ * Returns 0 on success and -1 if the file cannot be created or any
+ * seek/write/truncate fails; with a backing file, returns whatever
+ * vmdk_snapshot_create() returns.
+ */
+static int vmdk_create(const char *filename, QEMUOptionParameter *options)
+{
+    static const char desc_template[] =
+        "# Disk DescriptorFile\n"
+        "version=1\n"
+        "CID=%x\n"
+        "parentCID=ffffffff\n"
+        "createType=\"monolithicSparse\"\n"
+        "\n"
+        "# Extent description\n"
+        "RW %" PRId64 " SPARSE \"%s\"\n"
+        "\n"
+        "# The Disk Data Base \n"
+        "#DDB\n"
+        "\n"
+        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
+        "ddb.geometry.heads = \"16\"\n"
+        "ddb.geometry.sectors = \"63\"\n"
+        "ddb.adapterType = \"ide\"\n";
+    VMDK4Header header;
+    char desc[1024];
+    const char *real_filename, *temp_str;
+    const char *backing_file = NULL;
+    int64_t total_size = 0;
+    /* layout values in host byte order, all counted in 512-byte sectors */
+    int64_t granularity, desc_offset, desc_size;
+    int64_t rgd_offset, gd_offset, grain_offset;
+    uint32_t num_gtes_per_gte;
+    uint32_t tmp, entry, magic, grains, gd_size, gt_size, gt_count;
+    uint32_t i;
+    int flags = 0;
+    int fd;
+
+    // Read out options
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / 512;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+            backing_file = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
+            flags |= options->value.n ? BLOCK_FLAG_COMPAT6: 0;
+        }
+        options++;
+    }
+
+    /* XXX: add support for backing file */
+    if (backing_file) {
+        return vmdk_snapshot_create(filename, backing_file);
+    }
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+              0644);
+    if (fd < 0)
+        return -1;
+
+    /* work out the layout before anything is byte-swapped */
+    granularity = 128;
+    num_gtes_per_gte = 512;
+
+    grains = (total_size + granularity - 1) / granularity;
+    gt_size = ((num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
+    gt_count = (grains + num_gtes_per_gte - 1) / num_gtes_per_gte;
+    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
+
+    desc_offset = 1;
+    desc_size = 20;
+    rgd_offset = desc_offset + desc_size;
+    gd_offset = rgd_offset + gd_size + (gt_size * gt_count);
+    /* grain data starts at the next granularity boundary after the tables */
+    grain_offset =
+       ((gd_offset + gd_size + (gt_size * gt_count) +
+         granularity - 1) / granularity) *
+        granularity;
+
+    /* fill in the on-disk (little-endian) header */
+    magic = cpu_to_be32(VMDK4_MAGIC);
+    memset(&header, 0, sizeof(header));
+    header.version = cpu_to_le32(1);
+    header.flags = cpu_to_le32(3); /* ?? */
+    header.capacity = cpu_to_le64(total_size);
+    header.granularity = cpu_to_le64(granularity);
+    header.num_gtes_per_gte = cpu_to_le32(num_gtes_per_gte);
+    header.desc_offset = cpu_to_le64(desc_offset);
+    header.desc_size = cpu_to_le64(desc_size);
+    header.rgd_offset = cpu_to_le64(rgd_offset);
+    header.gd_offset = cpu_to_le64(gd_offset);
+    header.grain_offset = cpu_to_le64(grain_offset);
+
+    header.check_bytes[0] = 0xa;
+    header.check_bytes[1] = 0x20;
+    header.check_bytes[2] = 0xd;
+    header.check_bytes[3] = 0xa;
+
+    /* magic and header go at the very start of the file */
+    if (write(fd, &magic, sizeof(magic)) != (ssize_t) sizeof(magic))
+        goto fail;
+    if (write(fd, &header, sizeof(header)) != (ssize_t) sizeof(header))
+        goto fail;
+
+    /* size the file so that it covers all metadata */
+    if (ftruncate(fd, grain_offset << 9) < 0)
+        goto fail;
+
+    /* grain directory at rgd_offset: one sector number per grain table */
+    if (lseek(fd, rgd_offset << 9, SEEK_SET) < 0)
+        goto fail;
+    for (i = 0, tmp = rgd_offset + gd_size;
+         i < gt_count; i++, tmp += gt_size) {
+        entry = cpu_to_le32(tmp);
+        if (write(fd, &entry, sizeof(entry)) != (ssize_t) sizeof(entry))
+            goto fail;
+    }
+
+    /* second grain directory at gd_offset */
+    if (lseek(fd, gd_offset << 9, SEEK_SET) < 0)
+        goto fail;
+    for (i = 0, tmp = gd_offset + gd_size;
+         i < gt_count; i++, tmp += gt_size) {
+        entry = cpu_to_le32(tmp);
+        if (write(fd, &entry, sizeof(entry)) != (ssize_t) sizeof(entry))
+            goto fail;
+    }
+
+    /* compose the descriptor; only the last path component is recorded */
+    real_filename = filename;
+    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, '/')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, ':')) != NULL)
+        real_filename = temp_str + 1;
+    snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
+             total_size, real_filename,
+             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+             total_size / (int64_t)(63 * 16));
+
+    /* write the descriptor */
+    if (lseek(fd, desc_offset << 9, SEEK_SET) < 0)
+        goto fail;
+    if (write(fd, desc, strlen(desc)) != (ssize_t) strlen(desc))
+        goto fail;
+
+    if (close(fd) < 0)
+        return -1;
+    return 0;
+
+fail:
+    /* do not leak the descriptor when the image could not be written */
+    close(fd);
+    return -1;
+}
+
+/*
+ * Release everything held for an open VMDK image: the cached L1/L2 tables,
+ * the parent image (through vmdk_parent_close()) and the image file itself.
+ */
+static void vmdk_close(BlockDriverState *bs)
+{
+    BDRVVmdkState *state = bs->opaque;
+
+    /* lookup tables are plain heap buffers */
+    qemu_free(state->l2_cache);
+    qemu_free(state->l1_table);
+
+    /* the parent, if any, goes first, then the image file */
+    vmdk_parent_close(state->hd);
+    bdrv_delete(state->hd);
+}
+
+/* Forward a flush request to the underlying image file. */
+static void vmdk_flush(BlockDriverState *bs)
+{
+    bdrv_flush(((BDRVVmdkState *)bs->opaque)->hd);
+}
+
+
+/* Options understood by vmdk_create(); the table ends with a NULL entry. */
+static QEMUOptionParameter vmdk_create_options[] = {
+    { .name = BLOCK_OPT_SIZE,
+      .type = OPT_SIZE,
+      .help = "Virtual disk size" },
+    { .name = BLOCK_OPT_BACKING_FILE,
+      .type = OPT_STRING,
+      .help = "File name of a base image" },
+    { .name = BLOCK_OPT_COMPAT6,
+      .type = OPT_FLAG,
+      .help = "VMDK version 6 image" },
+    { NULL }
+};
+
+/* Driver description registered by bdrv_vmdk_init(). */
+static BlockDriver bdrv_vmdk = {
+    .format_name       = "vmdk",
+    .instance_size     = sizeof(BDRVVmdkState),
+    .create_options    = vmdk_create_options,
+
+    /* image lifecycle */
+    .bdrv_probe        = vmdk_probe,
+    .bdrv_create       = vmdk_create,
+    .bdrv_open         = vmdk_open,
+    .bdrv_close        = vmdk_close,
+
+    /* data path */
+    .bdrv_read         = vmdk_read,
+    .bdrv_write        = vmdk_write,
+    .bdrv_flush        = vmdk_flush,
+    .bdrv_is_allocated = vmdk_is_allocated,
+};
+
+/* Register the VMDK driver description with the block layer. */
+static void bdrv_vmdk_init(void)
+{
+    bdrv_register(&bdrv_vmdk);
+}
+
+/* NOTE(review): block_init comes from module.h; presumably it schedules
+ * bdrv_vmdk_init to run during block-module initialization -- confirm. */
+block_init(bdrv_vmdk_init);
diff --git a/block/vpc.c b/block/vpc.c
new file mode 100644
index 0000000..ba482e9
--- /dev/null
+++ b/block/vpc.c
@@ -0,0 +1,623 @@
+/*
+ * Block driver for Conectix/Microsoft Virtual PC images
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_SIZE 512
+
+//#define CACHE
+
+// Values of the footer's `type` field. vpc_create() writes VHD_DYNAMIC;
+// the other two values are not referenced elsewhere in the visible code.
+enum vhd_type {
+    VHD_FIXED           = 2,
+    VHD_DYNAMIC         = 3,
+    VHD_DIFFERENCING    = 4,
+};
+
+// Seconds since Jan 1, 2000 0:00:00 (UTC)
+#define VHD_TIMESTAMP_BASE 946684800
+
+// On-disk Hard Disk Footer; every multi-byte field is stored big-endian.
+// vpc_open() reads it from the first HEADER_SIZE bytes of the image, and
+// vpc_create() writes one copy at offset 0 and a second one after the BAT.
+struct vhd_footer {
+    char        creator[8]; // "conectix"
+    uint32_t    features;
+    uint32_t    version;
+
+    // Offset of next header structure, 0xFFFFFFFF if none
+    // (vpc_open() reads the dynamic disk header from this offset)
+    uint64_t    data_offset;
+
+    // Seconds since Jan 1, 2000 0:00:00 (UTC)
+    uint32_t    timestamp;
+
+    char        creator_app[4]; // "vpc "
+    uint16_t    major;
+    uint16_t    minor;
+    char        creator_os[4]; // "Wi2k"
+
+    // Both sizes are in bytes; vpc_create() sets them to the same value
+    uint64_t    orig_size;
+    uint64_t    size;
+
+    // Geometry. vpc_open() takes cyls * heads * secs_per_cyl as the number
+    // of sectors of the disk, in preference to `size`.
+    uint16_t    cyls;
+    uint8_t     heads;
+    uint8_t     secs_per_cyl;
+
+    // One of enum vhd_type
+    uint32_t    type;
+
+    // Checksum of the Hard Disk Footer ("one's complement of the sum of all
+    // the bytes in the footer without the checksum field")
+    uint32_t    checksum;
+
+    // UUID used to identify a parent hard disk (backing file)
+    uint8_t     uuid[16];
+
+    uint8_t     in_saved_state;
+};
+
+// On-disk Dynamic Disk Header. vpc_open() locates it through the footer's
+// data_offset and converts its fields from big-endian; vpc_create() writes
+// it at byte offset 512.
+struct vhd_dyndisk_header {
+    char        magic[8]; // "cxsparse"
+
+    // Offset of next header structure, 0xFFFFFFFF if none
+    uint64_t    data_offset;
+
+    // Offset of the Block Allocation Table (BAT)
+    uint64_t    table_offset;
+
+    uint32_t    version;
+    uint32_t    max_table_entries; // 32bit/entry
+
+    // 2 MB by default, must be a power of two
+    uint32_t    block_size;
+
+    uint32_t    checksum;
+    uint8_t     parent_uuid[16];
+    uint32_t    parent_timestamp;
+    uint32_t    reserved;
+
+    // Backing file name (in UTF-16)
+    uint8_t     parent_name[512];
+
+    // Not read or written by any of the visible code
+    struct {
+        uint32_t    platform;
+        uint32_t    data_space;
+        uint32_t    data_length;
+        uint32_t    reserved;
+        uint64_t    data_offset;
+    } parent_locator[8];
+};
+
+// Per-image driver state, filled in by vpc_open().
+typedef struct BDRVVPCState {
+    // Underlying image file, opened with bdrv_file_open()
+    BlockDriverState *hd;
+
+    // Copy of the footer read from the start of the file; rewrite_footer()
+    // writes it back at free_data_block_offset
+    uint8_t footer_buf[HEADER_SIZE];
+    // Byte offset at which alloc_block() places the next block
+    uint64_t free_data_block_offset;
+    // Number of entries in pagetable
+    int max_table_entries;
+    // In-memory BAT in host byte order: sector offset of each block's
+    // bitmap, or 0xFFFFFFFF when the block is not allocated
+    uint32_t *pagetable;
+    // Byte offset of the BAT inside the image file
+    uint64_t bat_offset;
+    // Bitmap offset last filled with 0xff by get_sector_offset() for a write
+    uint64_t last_bitmap_offset;
+
+    // Bytes of data per block, and bytes of bitmap stored in front of it
+    uint32_t block_size;
+    uint32_t bitmap_size;
+
+    // Only present when CACHE is defined (the define is commented out above)
+#ifdef CACHE
+    uint8_t *pageentry_u8;
+    uint32_t *pageentry_u32;
+    uint16_t *pageentry_u16;
+
+    uint64_t last_bitmap;
+#endif
+} BDRVVPCState;
+
+/*
+ * Compute the checksum used by VHD headers: the one's complement of the
+ * sum of all size bytes of buf.
+ *
+ * The caller is expected to have zeroed the checksum field inside buf
+ * beforehand.  An empty buffer yields 0xFFFFFFFF.
+ */
+static uint32_t vpc_checksum(const uint8_t *buf, size_t size)
+{
+    uint32_t res = 0;
+    size_t i;   /* same type as size: no signed/unsigned comparison */
+
+    for (i = 0; i < size; i++)
+        res += buf[i];
+
+    return ~res;
+}
+
+
+/*
+ * Probe callback: score 100 when the buffer starts with the 8-byte
+ * "conectix" signature, 0 otherwise.  filename is not looked at.
+ */
+static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    if (buf_size < 8)
+        return 0;
+
+    return strncmp((char *)buf, "conectix", 8) == 0 ? 100 : 0;
+}
+
+/*
+ * Open a dynamic VHD image.
+ *
+ * Reads the footer copy at the start of the file and the dynamic disk header
+ * it points to, loads the Block Allocation Table into s->pagetable (host
+ * byte order) and works out where the next block would be placed.
+ *
+ * A wrong footer checksum is only reported on stderr.  The image is rejected
+ * when a signature is missing, when the block size is zero (it is used as a
+ * divisor later) or when the BAT entry count does not fit the size
+ * arithmetic below.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 on any other failure.
+ */
+static int vpc_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVPCState *s = bs->opaque;
+    int ret, i;
+    struct vhd_footer* footer;
+    struct vhd_dyndisk_header* dyndisk_header;
+    uint8_t buf[HEADER_SIZE];
+    uint32_t checksum;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+
+    if (bdrv_pread(s->hd, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    footer = (struct vhd_footer*) s->footer_buf;
+    if (strncmp(footer->creator, "conectix", 8))
+        goto fail;
+
+    // The checksum is computed with the checksum field itself set to zero
+    checksum = be32_to_cpu(footer->checksum);
+    footer->checksum = 0;
+    if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
+        fprintf(stderr, "block-vpc: The header checksum of '%s' is "
+            "incorrect.\n", filename);
+
+    // Put the stored value back: footer_buf is what rewrite_footer() writes
+    // to disk, and it must not end up with a zeroed checksum
+    footer->checksum = cpu_to_be32(checksum);
+
+    // The visible size of a image in Virtual PC depends on the geometry
+    // rather than on the size stored in the footer (the size in the footer
+    // is too large usually)
+    bs->total_sectors = (int64_t)
+        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
+
+    if (bdrv_pread(s->hd, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE)
+            != HEADER_SIZE)
+        goto fail;
+
+    dyndisk_header = (struct vhd_dyndisk_header*) buf;
+
+    if (strncmp(dyndisk_header->magic, "cxsparse", 8))
+        goto fail;
+
+    // block_size is a divisor in get_sector_offset() and alloc_block()
+    s->block_size = be32_to_cpu(dyndisk_header->block_size);
+    if (s->block_size == 0)
+        goto fail;
+    s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
+
+    // The entry count is multiplied by 4 in int arithmetic below
+    s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
+    if (s->max_table_entries < 0 || s->max_table_entries > 0x7fffffff / 4)
+        goto fail;
+    s->pagetable = qemu_malloc(s->max_table_entries * 4);
+
+    s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
+    if (bdrv_pread(s->hd, s->bat_offset, s->pagetable,
+            s->max_table_entries * 4) != s->max_table_entries * 4) {
+        // The table was already allocated; release it before bailing out
+        qemu_free(s->pagetable);
+        s->pagetable = NULL;
+        goto fail;
+    }
+
+    // First candidate for new blocks: the sector boundary right after the BAT
+    s->free_data_block_offset =
+        (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511;
+
+    // Convert the BAT and move the candidate past every allocated block
+    for (i = 0; i < s->max_table_entries; i++) {
+        be32_to_cpus(&s->pagetable[i]);
+        if (s->pagetable[i] != 0xFFFFFFFF) {
+            int64_t next = (512 * (int64_t) s->pagetable[i]) +
+                s->bitmap_size + s->block_size;
+
+            if (next> s->free_data_block_offset)
+                s->free_data_block_offset = next;
+        }
+    }
+
+    s->last_bitmap_offset = (int64_t) -1;
+
+#ifdef CACHE
+    s->pageentry_u8 = qemu_malloc(512);
+    s->pageentry_u32 = s->pageentry_u8;
+    s->pageentry_u16 = s->pageentry_u8;
+    s->last_pagetable = -1;
+#endif
+
+    return 0;
+ fail:
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Translate a sector number into an absolute byte offset inside the image
+ * file.  Returns -1 when the sector lies beyond the BAT or its block has no
+ * BAT entry yet.
+ *
+ * Pass write = 1 when the offset is going to be written to: the bitmap of
+ * the block is then filled with 0xff, unless it is the bitmap that was
+ * filled most recently.  Pass 0 for reads.
+ */
+static inline int64_t get_sector_offset(BlockDriverState *bs,
+    int64_t sector_num, int write)
+{
+    BDRVVPCState *s = bs->opaque;
+    const uint64_t offset = sector_num * 512;
+    const uint32_t pagetable_index = offset / s->block_size;
+    const uint32_t pageentry_index = (offset % s->block_size) / 512;
+    uint64_t bitmap_offset, block_offset;
+
+    // not allocated: outside the table ...
+    if (pagetable_index >= s->max_table_entries)
+        return -1;
+    // ... or inside it, but without a block
+    if (s->pagetable[pagetable_index] == 0xffffffff)
+        return -1;
+
+    // A block is laid out as its bitmap followed by the data sectors
+    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
+    block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
+
+    // Sectors marked as unused in the bitmap must never be written to.
+    // Rather than tracking single bits, the whole bitmap is set whenever a
+    // write moves on to a different block. Virtual PC may lose its sparse
+    // read optimization this way, but the image stays correct.
+    if (write && (bitmap_offset != s->last_bitmap_offset)) {
+        uint8_t bitmap[s->bitmap_size];
+
+        memset(bitmap, 0xff, s->bitmap_size);
+        s->last_bitmap_offset = bitmap_offset;
+        bdrv_pwrite(s->hd, bitmap_offset, bitmap, s->bitmap_size);
+    }
+
+//    printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n",
+//	sector_num, pagetable_index, pageentry_index,
+//	bitmap_offset, block_offset);
+
+// disabled by reason
+#if 0
+#ifdef CACHE
+    if (bitmap_offset != s->last_bitmap)
+    {
+	lseek(s->fd, bitmap_offset, SEEK_SET);
+
+	s->last_bitmap = bitmap_offset;
+
+	// Scary! Bitmap is stored as big endian 32bit entries,
+	// while we used to look it up byte by byte
+	read(s->fd, s->pageentry_u8, 512);
+	for (i = 0; i < 128; i++)
+	    be32_to_cpus(&s->pageentry_u32[i]);
+    }
+
+    if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1)
+	return -1;
+#else
+    lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET);
+
+    read(s->fd, &bitmap_entry, 1);
+
+    if ((bitmap_entry >> (pageentry_index % 8)) & 1)
+	return -1; // not allocated
+#endif
+#endif
+
+    return block_offset;
+}
+
+/*
+ * Write the in-memory footer at s->free_data_block_offset, i.e. at the new
+ * end of the image. Needed whenever the file grows, because the new block
+ * overwrites the old footer.
+ *
+ * Returns 0 on success and the negative bdrv_pwrite() result on error
+ */
+static int rewrite_footer(BlockDriverState* bs)
+{
+    BDRVVPCState *state = bs->opaque;
+    int64_t where = state->free_data_block_offset;
+    int rc = bdrv_pwrite(state->hd, where, state->footer_buf, HEADER_SIZE);
+
+    return (rc < 0) ? rc : 0;
+}
+
+/*
+ * Allocates a new block for the block containing sector_num. The block is
+ * placed at the old end of the image file (overwriting the old footer); its
+ * bitmap is initialized, a new footer is written behind it and the Block
+ * Allocation Table is updated in memory and on disk.
+ *
+ * Fails when sector_num is out of range, when its block lies outside the
+ * BAT, when the block is already allocated, or when a write fails. On a
+ * write failure the in-memory BAT entry and the free offset are rolled
+ * back, so the state keeps describing the image as it was before the call.
+ *
+ * Returns the sectors' offset in the image file on success and < 0 on error
+ */
+static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
+{
+    BDRVVPCState *s = bs->opaque;
+    int64_t bat_offset;
+    uint64_t old_free_offset;
+    uint32_t index, bat_value;
+    int ret;
+    uint8_t bitmap[s->bitmap_size];
+
+    // Check if sector_num is valid
+    if ((sector_num < 0) || (sector_num > bs->total_sectors))
+        return -1;
+
+    // The block must have a slot in the BAT and must not be allocated yet
+    index = (sector_num * 512) / s->block_size;
+    if (index >= (uint32_t) s->max_table_entries)
+        return -1;
+    if (s->pagetable[index] != 0xFFFFFFFF)
+        return -1;
+
+    // Write entry into in-memory BAT
+    old_free_offset = s->free_data_block_offset;
+    s->pagetable[index] = s->free_data_block_offset / 512;
+
+    // Initialize the block's bitmap
+    memset(bitmap, 0xff, s->bitmap_size);
+    ret = bdrv_pwrite(s->hd, s->free_data_block_offset, bitmap,
+        s->bitmap_size);
+    if (ret < 0)
+        goto fail;
+
+    // Write new footer (the old one will be overwritten)
+    s->free_data_block_offset += s->block_size + s->bitmap_size;
+    ret = rewrite_footer(bs);
+    if (ret < 0)
+        goto fail;
+
+    // Write BAT entry to disk (the on-disk table is big-endian)
+    bat_offset = s->bat_offset + (4 * index);
+    bat_value = cpu_to_be32(s->pagetable[index]);
+    ret = bdrv_pwrite(s->hd, bat_offset, &bat_value, 4);
+    if (ret < 0)
+        goto fail;
+
+    return get_sector_offset(bs, sector_num, 0);
+
+fail:
+    // Forget the half-finished allocation
+    s->pagetable[index] = 0xFFFFFFFF;
+    s->free_data_block_offset = old_free_offset;
+    return -1;
+}
+
+/*
+ * Read nb_sectors sectors starting at sector_num into buf, one sector at a
+ * time. Sectors for which get_sector_offset() returns -1 read as zeroes.
+ *
+ * Returns 0 on success and -1 as soon as a sector cannot be read in full.
+ */
+static int vpc_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVPCState *state = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        int64_t pos = get_sector_offset(bs, sector_num, 0);
+
+        if (pos == -1) {
+            // unallocated sector
+            memset(buf, 0, 512);
+            continue;
+        }
+
+        if (bdrv_pread(state->hd, pos, buf, 512) != 512)
+            return -1;
+    }
+    return 0;
+}
+
+/*
+ * Write nb_sectors sectors from buf starting at sector_num, one sector at a
+ * time. A sector without an offset gets its block allocated first through
+ * alloc_block().
+ *
+ * Returns 0 on success and -1 when an allocation or a write fails.
+ */
+static int vpc_write(BlockDriverState *bs, int64_t sector_num,
+    const uint8_t *buf, int nb_sectors)
+{
+    BDRVVPCState *state = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        int64_t pos = get_sector_offset(bs, sector_num, 1);
+
+        if (pos == -1) {
+            // nothing there yet: grow the image by one block
+            pos = alloc_block(bs, sector_num);
+            if (pos < 0)
+                return -1;
+        }
+
+        if (bdrv_pwrite(state->hd, pos, buf, 512) != 512)
+            return -1;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Derives a CHS geometry (cylinders, heads, sectors per cylinder) from a
+ * number of sectors, following the algorithm of the VHD specification.
+ *
+ * The resulting geometry does not necessarily cover exactly total_sectors.
+ * The outputs are only written on success.
+ *
+ * Returns 0 on success, -EFBIG if total_sectors exceeds 65535 * 16 * 255
+ */
+static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
+    uint8_t* heads, uint8_t* secs_per_cyl)
+{
+    uint32_t cyls_times_heads;
+    uint8_t nheads;
+    uint8_t nsecs;
+
+    if (total_sectors > 65535 * 16 * 255)
+        return -EFBIG;
+
+    if (total_sectors > 65535 * 16 * 63) {
+        // Largest geometry class
+        nsecs = 255;
+        nheads = 16;
+        cyls_times_heads = total_sectors / nsecs;
+    } else {
+        // Start small and move on to 31 and then 63 sectors while the
+        // cylinder count would not fit
+        nsecs = 17;
+        cyls_times_heads = total_sectors / nsecs;
+        nheads = (cyls_times_heads + 1023) / 1024;
+
+        if (nheads < 4)
+            nheads = 4;
+
+        if (cyls_times_heads >= (nheads * 1024) || nheads > 16) {
+            nsecs = 31;
+            nheads = 16;
+            cyls_times_heads = total_sectors / nsecs;
+        }
+
+        if (cyls_times_heads >= (nheads * 1024)) {
+            nsecs = 63;
+            nheads = 16;
+            cyls_times_heads = total_sectors / nsecs;
+        }
+    }
+
+    *secs_per_cyl = nsecs;
+    *heads = nheads;
+
+    // Note: Rounding up deviates from the Virtual PC behaviour
+    // However, we need this to avoid truncating images in qemu-img convert
+    *cyls = (cyls_times_heads + nheads - 1) / nheads;
+
+    return 0;
+}
+
+/*
+ * Create an empty dynamic VHD image at filename.
+ *
+ * The only option looked at is BLOCK_OPT_SIZE (bytes). The size is turned
+ * into a geometry with calculate_geometry() and the image size becomes
+ * cyls * heads * secs_per_cyl sectors.
+ *
+ * File layout written here: footer copy at offset 0, dynamic disk header at
+ * offset 512 (1024 bytes), BAT at offset 3 * 512 with every entry set to
+ * 0xFFFFFFFF, and the footer again right after the BAT.
+ *
+ * Returns 0 on success, -EFBIG when the size is rejected by
+ * calculate_geometry(), and -EIO when the file cannot be created or written.
+ * The file descriptor is closed on every path.
+ */
+static int vpc_create(const char *filename, QEMUOptionParameter *options)
+{
+    uint8_t buf[1024];
+    struct vhd_footer* footer = (struct vhd_footer*) buf;
+    struct vhd_dyndisk_header* dyndisk_header =
+        (struct vhd_dyndisk_header*) buf;
+    int fd;
+    int ret = -EIO;
+    size_t i;
+    uint16_t cyls;
+    uint8_t heads;
+    uint8_t secs_per_cyl;
+    size_t block_size, num_bat_entries, bat_bytes;
+    int64_t total_sectors = 0;
+
+    // Read out options
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_sectors = options->value.n / 512;
+        }
+        options++;
+    }
+
+    // Create the file
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -EIO;
+
+    // Calculate matching total_size and geometry
+    if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl)) {
+        ret = -EFBIG;
+        goto out;
+    }
+    total_sectors = (int64_t) cyls * heads * secs_per_cyl;
+
+    // Prepare the Hard Disk Footer
+    memset(buf, 0, 1024);
+
+    // Fixed-width tags, stored without a terminating NUL
+    memcpy(footer->creator, "conectix", 8);
+    // TODO Check if "qemu" creator_app is ok for VPC
+    memcpy(footer->creator_app, "qemu", 4);
+    memcpy(footer->creator_os, "Wi2k", 4);
+
+    footer->features = cpu_to_be32(0x02);
+    footer->version = cpu_to_be32(0x00010000);
+    footer->data_offset = cpu_to_be64(HEADER_SIZE);
+    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
+
+    // Version of Virtual PC 2007
+    footer->major = cpu_to_be16(0x0005);
+    footer->minor = cpu_to_be16(0x0003);
+
+    footer->orig_size = cpu_to_be64(total_sectors * 512);
+    footer->size = cpu_to_be64(total_sectors * 512);
+
+    footer->cyls = cpu_to_be16(cyls);
+    footer->heads = heads;
+    footer->secs_per_cyl = secs_per_cyl;
+
+    footer->type = cpu_to_be32(VHD_DYNAMIC);
+
+    // TODO uuid is missing
+
+    // The checksum field is still zero at this point, as required
+    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
+
+    // Write the footer (twice: at the beginning and at the end)
+    block_size = 0x200000;
+    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
+    // Size of the BAT on disk, rounded up to whole sectors
+    bat_bytes = (num_bat_entries * 4 + 511) & ~511;
+
+    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE)
+        goto out;
+
+    if (lseek(fd, 1536 + bat_bytes, SEEK_SET) < 0)
+        goto out;
+    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE)
+        goto out;
+
+    // Write the initial BAT
+    if (lseek(fd, 3 * 512, SEEK_SET) < 0)
+        goto out;
+
+    memset(buf, 0xFF, 512);
+    for (i = 0; i < bat_bytes / 512; i++) {
+        if (write(fd, buf, 512) != 512)
+            goto out;
+    }
+
+    // Prepare the Dynamic Disk Header
+    memset(buf, 0, 1024);
+
+    memcpy(dyndisk_header->magic, "cxsparse", 8);
+
+    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFF);
+    dyndisk_header->table_offset = cpu_to_be64(3 * 512);
+    dyndisk_header->version = cpu_to_be32(0x00010000);
+    dyndisk_header->block_size = cpu_to_be32(block_size);
+    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
+
+    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
+
+    // Write the header
+    if (lseek(fd, 512, SEEK_SET) < 0)
+        goto out;
+    if (write(fd, buf, 1024) != 1024)
+        goto out;
+
+    ret = 0;
+
+out:
+    // Single exit point so that the descriptor is never leaked
+    if (close(fd) < 0 && ret == 0)
+        ret = -EIO;
+    return ret;
+}
+
+/*
+ * Free the buffers allocated by vpc_open() and drop the underlying
+ * image file.
+ */
+static void vpc_close(BlockDriverState *bs)
+{
+    BDRVVPCState *state = bs->opaque;
+
+#ifdef CACHE
+    qemu_free(state->pageentry_u8);
+#endif
+    qemu_free(state->pagetable);
+
+    bdrv_delete(state->hd);
+}
+
+/* Options understood by vpc_create(); the table ends with a NULL entry. */
+static QEMUOptionParameter vpc_create_options[] = {
+    { .name = BLOCK_OPT_SIZE,
+      .type = OPT_SIZE,
+      .help = "Virtual disk size" },
+    { NULL }
+};
+
+/* Driver description registered by bdrv_vpc_init(). */
+static BlockDriver bdrv_vpc = {
+    .format_name    = "vpc",
+    .instance_size  = sizeof(BDRVVPCState),
+    .create_options = vpc_create_options,
+
+    /* image lifecycle */
+    .bdrv_probe     = vpc_probe,
+    .bdrv_create    = vpc_create,
+    .bdrv_open      = vpc_open,
+    .bdrv_close     = vpc_close,
+
+    /* data path */
+    .bdrv_read      = vpc_read,
+    .bdrv_write     = vpc_write,
+};
+
+/* Register the VPC (VHD) driver description with the block layer. */
+static void bdrv_vpc_init(void)
+{
+    bdrv_register(&bdrv_vpc);
+}
+
+/* NOTE(review): block_init comes from module.h; presumably it schedules
+ * bdrv_vpc_init to run during block-module initialization -- confirm. */
+block_init(bdrv_vpc_init);
diff --git a/block/vvfat.c b/block/vvfat.c
new file mode 100644
index 0000000..1e37b9f
--- /dev/null
+++ b/block/vvfat.c
@@ -0,0 +1,2861 @@
+/* vim:set shiftwidth=4 ts=8: */
+/*
+ * QEMU Block driver for virtual VFAT (shadows a local directory)
+ *
+ * Copyright (c) 2004,2005 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <sys/stat.h>
+#include <dirent.h>
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#ifndef S_IWGRP
+#define S_IWGRP 0
+#endif
+#ifndef S_IWOTH
+#define S_IWOTH 0
+#endif
+
+/* TODO: add ":bootsector=blabla.img:" */
+/* LATER TODO: add automatic boot sector generation from
+    BOOTEASY.ASM and Ranish Partition Manager
+    Note that DOS assumes the system files to be the first files in the
+    file system (test if the boot sector still relies on that fact)! */
+/* MAYBE TODO: write block-visofs.c */
+/* TODO: call try_commit() only after a timeout */
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+
+#define DLOG(a) a /* debug build: DLOG(x) expands to x itself */
+
+#undef stderr
+#define stderr STDERR /* every later use of stderr in this file names the STDERR variable below */
+FILE* stderr = NULL; /* opened as "vvfat.log" by vvfat_open() while it is still NULL */
+
+static void checkpoint(void);
+
+#ifdef __MINGW32__
+void nonono(const char* file, int line, const char* msg) { /* prints the failed assertion and exits with status -5 */
+    fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg);
+    exit(-5);
+}
+#undef assert
+#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0)
+#endif
+
+#else
+
+#define DLOG(a) /* non-debug build: DLOG(x) expands to nothing */
+
+#endif
+
+/* dynamic array functions */
+typedef struct array_t {
+    char* pointer; /* start of the item storage; NULL after array_init() until the first allocation */
+    unsigned int size,next,item_size; /* size: allocated bytes; next: number of items in use; item_size: bytes per item */
+} array_t;
+
+/* Put an array into its empty state: no storage, no items, and the given
+ * per-item size in bytes. */
+static inline void array_init(array_t* array,unsigned int item_size)
+{
+    array->item_size = item_size;
+    array->next = 0;
+    array->size = 0;
+    array->pointer = NULL;
+}
+
+/* Release the storage of an array and mark it empty.  The pointer is cleared
+ * as well, so that a later array_ensure_allocated()/array_insert() on the
+ * same array does not hand an already freed block to qemu_realloc(). */
+static inline void array_free(array_t* array)
+{
+    free(array->pointer); /* free(NULL) is a no-op, no guard needed */
+    array->pointer = NULL;
+    array->size=array->next=0;
+}
+
+/* Return the address of item `index`.  The array is never grown here, so
+ * the index must already be below array->next (asserted). */
+static inline void* array_get(array_t* array,unsigned int index) {
+    char* item;
+
+    assert(index < array->next);
+    item = array->pointer + index * array->item_size;
+    return item;
+}
+
+/* Make sure the storage is large enough to hold item `index`.
+ * When the storage has to grow, room for index + 32 items is requested and
+ * array->next is set to index + 1; when it is already large enough nothing
+ * (including array->next) is touched.
+ * Returns 0 on success, -1 if the reallocation yields NULL. */
+static inline int array_ensure_allocated(array_t* array, int index)
+{
+    int new_size;
+
+    if ((index + 1) * array->item_size <= array->size)
+        return 0;
+
+    new_size = (index + 32) * array->item_size;
+    array->pointer = qemu_realloc(array->pointer, new_size);
+    if (array->pointer == NULL)
+        return -1;
+
+    array->size = new_size;
+    array->next = index + 1;
+    return 0;
+}
+
+/* Append one item to the array and return its address, or NULL if the
+ * storage could not be grown.  The new item's contents are not initialized
+ * here. */
+static inline void* array_get_next(array_t* array) {
+    unsigned int slot = array->next;
+
+    if (array_ensure_allocated(array, slot) < 0)
+        return NULL;
+
+    array->next = slot + 1;
+    return array_get(array, slot);
+}
+
+/* Open a gap of `count` items at position `index`, shifting the items from
+ * `index` onwards up.  The storage grows by exactly count items when needed.
+ * Returns the address of the gap, or NULL if the reallocation yields NULL. */
+static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) {
+    char* gap;
+    char* shifted;
+
+    if ((array->next + count) * array->item_size > array->size) {
+        int extra = count * array->item_size;
+
+        array->pointer = qemu_realloc(array->pointer, array->size + extra);
+        if (array->pointer == NULL)
+            return NULL;
+        array->size += extra;
+    }
+
+    gap = array->pointer + index * array->item_size;
+    shifted = array->pointer + (index + count) * array->item_size;
+    memmove(shifted, gap, (array->next - index) * array->item_size);
+    array->next += count;
+    return gap;
+}
+
+/* this performs a "roll", so that the element which was at index_from becomes
+ * index_to, but the order of all other elements is preserved.
+ *
+ * `count` consecutive items starting at index_from are moved so that they
+ * start at index_to.  Returns -1 if array is NULL or either start index is
+ * outside [0, array->next), 0 otherwise.
+ *
+ * NOTE(review): only the two start indices are range-checked; neither
+ * index_from+count nor index_to+count is compared against array->next, so
+ * the caller has to guarantee that both slices lie inside the array. */
+static inline int array_roll(array_t* array,int index_to,int index_from,int count)
+{
+    char* buf;
+    char* from;
+    char* to;
+    int is;
+
+    if(!array ||
+	    index_to<0 || index_to>=array->next ||
+	    index_from<0 || index_from>=array->next)
+	return -1;
+
+    if(index_to==index_from)
+	return 0;
+
+    is=array->item_size;
+    from=array->pointer+index_from*is;
+    to=array->pointer+index_to*is;
+    buf=qemu_malloc(is*count); /* temporary copy of the slice being moved */
+    memcpy(buf,from,is*count);
+
+    if(index_to<index_from)
+	memmove(to+is*count,to,from-to); /* shift the items in [index_to, index_from) up by count */
+    else
+	memmove(from,from+is*count,to-from); /* shift the items after the slice down by count */
+
+    memcpy(to,buf,is*count);
+
+    free(buf);
+
+    return 0;
+}
+
+/* Remove `count` items starting at `index`, closing the gap by moving the
+ * items behind the slice down.  Always returns 0; invalid arguments are
+ * caught by the assertions.
+ *
+ * The gap is closed with a single memmove() instead of rolling the slice to
+ * position array->next-1: that destination is only valid for count == 1,
+ * for larger counts the roll reads and writes past the last item. */
+static inline int array_remove_slice(array_t* array,int index, int count)
+{
+    assert(index >=0);
+    assert(count > 0);
+    assert(index + count <= array->next);
+
+    memmove(array->pointer + index * array->item_size,
+            array->pointer + (index + count) * array->item_size,
+            (array->next - index - count) * array->item_size);
+
+    array->next -= count;
+    return 0;
+}
+
+static int array_remove(array_t* array,int index) /* removes the single item at index */
+{
+    return array_remove_slice(array, index, 1);
+}
+
+/* Translate a pointer to an item of the array back into the item's index.
+ * The pointer must sit on an item boundary and below array->next (both
+ * asserted). */
+static int array_index(array_t* array, void* pointer)
+{
+    size_t byte_offset = (char*)pointer - array->pointer;
+    size_t idx;
+
+    assert((byte_offset % array->item_size) == 0);
+    idx = byte_offset / array->item_size;
+    assert(idx < array->next);
+    return idx;
+}
+
+/* These structures are used to fake a disk and the VFAT filesystem.
+ * For this reason we need to use __attribute__((packed)). */
+
+typedef struct bootsector_t { /* on-disk layout of the boot sector filled in by init_directories() */
+    uint8_t jump[3];
+    uint8_t name[8];
+    uint16_t sector_size;
+    uint8_t sectors_per_cluster;
+    uint16_t reserved_sectors;
+    uint8_t number_of_fats;
+    uint16_t root_entries;
+    uint16_t total_sectors16; /* init_directories() stores 0 here when the count exceeds 0xffff */
+    uint8_t media_type; /* media descriptor; also copied into the first FAT byte */
+    uint16_t sectors_per_fat;
+    uint16_t sectors_per_track;
+    uint16_t number_of_heads;
+    uint32_t hidden_sectors;
+    uint32_t total_sectors; /* init_directories() stores 0 here when the count fits total_sectors16 */
+    union {
+        struct {
+	    uint8_t drive_number;
+	    uint8_t current_head;
+	    uint8_t signature;
+	    uint32_t id;
+	    uint8_t volume_label[11];
+	} __attribute__((packed)) fat16; /* the only variant init_directories() fills in */
+	struct {
+	    uint32_t sectors_per_fat;
+	    uint16_t flags;
+	    uint8_t major,minor;
+	    uint32_t first_cluster_of_root_directory;
+	    uint16_t info_sector;
+	    uint16_t backup_boot_sector;
+	    uint16_t ignored;
+	} __attribute__((packed)) fat32;
+    } u;
+    uint8_t fat_type[8]; /* "FAT12   ", "FAT16   " or "FAT32   " */
+    uint8_t ignored[0x1c0];
+    uint8_t magic[2]; /* set to 0x55, 0xaa */
+} __attribute__((packed)) bootsector_t;
+
+typedef struct { /* CHS address as written by sector2CHS() */
+    uint8_t head;
+    uint8_t sector; /* sector number + 1, OR-ed with ((cylinder >> 8) << 6) */
+    uint8_t cylinder; /* cylinder number truncated to 8 bits */
+} mbr_chs_t;
+
+typedef struct partition_t { /* one partition table entry of mbr_t */
+    uint8_t attributes; /* 0x80 = bootable */
+    mbr_chs_t start_CHS;
+    uint8_t   fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */
+    mbr_chs_t end_CHS;
+    uint32_t start_sector_long; /* stored little-endian by init_mbr() */
+    uint32_t length_sector_long; /* stored little-endian by init_mbr() */
+} __attribute__((packed)) partition_t;
+
+typedef struct mbr_t { /* layout of the master boot record written by init_mbr() */
+    uint8_t ignored[0x1b8];
+    uint32_t nt_id; /* Win NT disk signature */
+    uint8_t ignored2[2];
+    partition_t partition[4]; /* init_mbr() only fills in partition[0] */
+    uint8_t magic[2]; /* set to 0x55, 0xaa */
+} __attribute__((packed)) mbr_t;
+
+typedef struct direntry_t { /* one FAT directory entry */
+    uint8_t name[8]; /* name[0] of 0xe5 or 0x00 marks a free entry (see is_free) */
+    uint8_t extension[3];
+    uint8_t attributes; /* 0x10 bit = directory, 0xf = long-name entry, 0x28 = volume label */
+    uint8_t reserved[2]; /* reserved[1] carries the short-name checksum in long-name entries */
+    uint16_t ctime;
+    uint16_t cdate;
+    uint16_t adate;
+    uint16_t begin_hi; /* upper 16 bits of the first cluster, little-endian */
+    uint16_t mtime;
+    uint16_t mdate;
+    uint16_t begin; /* lower 16 bits of the first cluster, little-endian */
+    uint32_t size; /* file size, little-endian; read_directory() stores 0 for directories */
+} __attribute__((packed)) direntry_t;
+
+/* this structure is used to transparently access the files */
+
+typedef struct mapping_t {
+    /* begin is the first cluster, end is the last+1 */
+    uint32_t begin,end;
+    /* as s->directory is growable, no pointer may be used here */
+    unsigned int dir_index;
+    /* the clusters of a file may be in any order; this points to the first */
+    int first_mapping_index;
+    union {
+	/* offset is
+	 * - the offset in the file (in clusters) for a file, or
+	 * - the next cluster of the directory for a directory, and
+	 * - the address of the buffer for a faked entry
+	 */
+	struct {
+	    uint32_t offset;
+	} file;
+	struct {
+	    int parent_mapping_index; /* index into s->mapping of the parent directory; -1 for the root */
+	    int first_dir_index; /* index into s->directory of this directory's first entry */
+	} dir;
+    } info;
+    /* path contains the full path, i.e. it always starts with s->path */
+    char* path;
+
+    enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2,
+	MODE_DIRECTORY = 4, MODE_FAKED = 8,
+	MODE_DELETED = 16, MODE_RENAMED = 32 } mode; /* values are distinct bits; tested with '&' as well as '==' */
+    int read_only; /* read_directory() sets this when the host file has no write permission bit */
+} mapping_t;
+
+#ifdef DEBUG
+static void print_direntry(const struct direntry_t*);
+static void print_mapping(const struct mapping_t* mapping);
+#endif
+
+/* here begins the real VVFAT driver */
+
+typedef struct BDRVVVFATState {
+    BlockDriverState* bs; /* pointer to parent */
+    unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */
+    unsigned char first_sectors[0x40*0x200]; /* holds the MBR (when init_mbr() runs) and the boot sector */
+
+    int fat_type; /* 12, 16 or 32 */
+    array_t fat,directory,mapping; /* directory holds direntry_t items, mapping holds mapping_t items */
+
+    unsigned int cluster_size; /* sectors_per_cluster * 0x200 bytes */
+    unsigned int sectors_per_cluster;
+    unsigned int sectors_per_fat;
+    unsigned int sectors_of_root_directory;
+    uint32_t last_cluster_of_root_directory;
+    unsigned int faked_sectors; /* how many sectors are faked before file data */
+    uint32_t sector_count; /* total number of sectors of the partition */
+    uint32_t cluster_count; /* total number of clusters of this partition */
+    uint32_t max_fat_value; /* 0xfff, 0xffff or 0x0fffffff depending on fat_type */
+
+    int current_fd; /* 0 is treated as "no file open" by vvfat_close_current_file() */
+    mapping_t* current_mapping;
+    unsigned char* cluster; /* points to current cluster */
+    unsigned char* cluster_buffer; /* points to a buffer to hold temp data */
+    unsigned int current_cluster;
+
+    /* write support */
+    BlockDriverState* write_target;
+    char* qcow_filename;
+    BlockDriverState* qcow;
+    void* fat2;
+    char* used_clusters;
+    array_t commits;
+    const char* path; /* path of the root mapping, set by init_directories() */
+    int downcase_short_names;
+} BDRVVVFATState;
+
+/* Convert the linear sector position spos into a Cylinder/Head/Sector triple
+ * using the geometry stored in bs (bs->secs, bs->heads, bs->cyls).
+ * Returns 0 on success.  If the position lies outside that geometry, all
+ * three CHS bytes are set to 0xFF and 1 is returned to signal the overflow.
+ */
+static int sector2CHS(BlockDriverState* bs, mbr_chs_t * chs, int spos){
+    int sector = spos % (bs->secs);
+    int head;
+
+    spos /= bs->secs;
+    head = spos % (bs->heads);
+    spos /= bs->heads;
+    /* from here on spos is the cylinder number */
+
+    if (spos < bs->cyls) {
+        chs->head     = (uint8_t)head;
+        chs->sector   = (uint8_t)( (sector+1) | ((spos>>8)<<6) );
+        chs->cylinder = (uint8_t)spos;
+        return 0;
+    }
+
+    /* Overflow: this happens when 32bit sector positions are used, because
+       CHS is only 24bit.  Windows/Dos is said to take 1023/255/63 as
+       nonrepresentable CHS */
+    chs->head     = 0xFF;
+    chs->sector   = 0xFF;
+    chs->cylinder = 0xFF;
+    return 1;
+}
+
+static void init_mbr(BDRVVVFATState* s)
+{ /* writes a master boot record with one bootable partition into the first 512 bytes of s->first_sectors */
+    /* TODO: if the files mbr.img and bootsect.img exist, use them */
+    mbr_t* real_mbr=(mbr_t*)s->first_sectors;
+    partition_t* partition=&(real_mbr->partition[0]);
+    int lba;
+
+    memset(s->first_sectors,0,512);
+
+    /* Win NT Disk Signature */
+    real_mbr->nt_id= cpu_to_le32(0xbe1afdfa);
+
+    partition->attributes=0x80; /* bootable */
+
+    /* LBA is used when partition is outside the CHS geometry */
+    lba = sector2CHS(s->bs, &partition->start_CHS, s->first_sectors_number-1);
+    lba|= sector2CHS(s->bs, &partition->end_CHS,   s->sector_count); /* NOTE(review): passes sector_count, not sector_count-1 -- confirm the intended end sector */
+
+    /*LBA partitions are identified only by start/length_sector_long not by CHS*/
+    partition->start_sector_long =cpu_to_le32(s->first_sectors_number-1);
+    partition->length_sector_long=cpu_to_le32(s->sector_count - s->first_sectors_number+1);
+
+    /* FAT12/FAT16/FAT32 */
+    /* DOS uses different types when partition is LBA,
+       probably to prevent older versions from using CHS on them */
+    partition->fs_type= s->fat_type==12 ? 0x1:
+                        s->fat_type==16 ? (lba?0xe:0x06):
+                         /*fat_type==32*/ (lba?0xc:0x0b);
+
+    real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa;
+}
+
+/* direntry functions */
+
+/* Expands src into two-byte characters (each source byte followed by a zero
+ * byte), copying at most 129 characters, then appends a two-byte zero
+ * terminator and pads with 0xff bytes up to the next multiple of 26.
+ * Returns the number of bytes written for the characters alone (twice the
+ * number of characters copied), excluding terminator and padding.
+ *
+ * NOTE: dest must hold at least 260 bytes, not 258: for names of 117 or more
+ * characters the terminator and padding reach up to index 259. */
+static inline int short2long_name(char* dest,const char* src)
+{
+    int i;
+    int len;
+    for(i=0;i<129 && src[i];i++) {
+        dest[2*i]=src[i];
+	dest[2*i+1]=0;
+    }
+    len=2*i;
+    dest[2*i]=dest[2*i+1]=0;
+    for(i=2*i+2;(i%26);i++)
+	dest[i]=0xff;
+    return len;
+}
+
+/* Append the long-name directory entries for `filename` to s->directory and
+ * return a pointer to the first of them.  Each entry takes 26 bytes of the
+ * expanded name; the entries are numbered downwards and the first one
+ * appended additionally carries the 0x40 flag.  The checksum byte
+ * (reserved[1]) is not set here; create_short_and_long_name() fills it in. */
+static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename)
+{
+    /* short2long_name() can write up to 260 bytes: 129 two-byte characters,
+     * a two-byte terminator, and 0xff padding up to a multiple of 26.
+     * A 258 byte buffer is overrun by two bytes for names of 117 or more
+     * characters. */
+    char buffer[260];
+    int length=short2long_name(buffer,filename),
+        number_of_entries=(length+25)/26,i;
+    direntry_t* entry;
+
+    for(i=0;i<number_of_entries;i++) {
+        entry=array_get_next(&(s->directory));
+        entry->attributes=0xf;
+        entry->reserved[0]=0;
+        entry->begin=0;
+        entry->name[0]=(number_of_entries-i)|(i==0?0x40:0);
+    }
+    for(i=0;i<26*number_of_entries;i++) {
+        /* map the position within a 26 byte chunk to the three name areas
+         * of a long-name entry (byte offsets 1..10, 14..25 and 28..31) */
+        int offset=(i%26);
+        if(offset<10) offset=1+offset;
+        else if(offset<22) offset=14+offset-10;
+        else offset=28+offset-22;
+        /* chunk 0 of the name goes into the last entry appended */
+        entry=array_get(&(s->directory),s->directory.next-1-(i/26));
+        entry->name[offset]=buffer[i];
+    }
+    return array_get(&(s->directory),s->directory.next-number_of_entries);
+}
+
+/* An entry is free when the first name byte is 0xe5 or 0x00. */
+static char is_free(const direntry_t* direntry)
+{
+    uint8_t first = direntry->name[0];
+
+    return first == 0xe5 || first == 0x00;
+}
+
+static char is_volume_label(const direntry_t* direntry) /* true only for the exact attribute value 0x28 */
+{
+    return direntry->attributes == 0x28;
+}
+
+static char is_long_name(const direntry_t* direntry) /* true for the attribute value 0xf that create_long_filename() assigns */
+{
+    return direntry->attributes == 0xf;
+}
+
+/* A short-name entry is one that is neither a volume label, nor a long-name
+ * entry, nor free. */
+static char is_short_name(const direntry_t* direntry)
+{
+    if (is_volume_label(direntry))
+        return 0;
+    if (is_long_name(direntry))
+        return 0;
+    return !is_free(direntry);
+}
+
+/* True when the directory attribute bit (0x10) is set and the first name
+ * byte is not 0xe5. */
+static char is_directory(const direntry_t* direntry)
+{
+    if (direntry->name[0] == 0xe5)
+        return 0;
+    return (direntry->attributes & 0x10) != 0;
+}
+
+/* True for a short-name entry whose name starts with '.'. */
+static inline char is_dot(const direntry_t* direntry)
+{
+    if (direntry->name[0] != '.')
+        return 0;
+    return is_short_name(direntry) != 0;
+}
+
+/* True for a short-name entry that is not a directory. */
+static char is_file(const direntry_t* direntry)
+{
+    if (is_directory(direntry))
+        return 0;
+    return is_short_name(direntry) != 0;
+}
+
+/* Return the first cluster recorded in a directory entry, combining the
+ * little-endian low (begin) and high (begin_hi) 16 bit halves.
+ * The halves are widened to uint32_t before shifting: shifting the promoted
+ * (signed int) 16 bit value left by 16 is undefined once bit 15 is set. */
+static inline uint32_t begin_of_direntry(const direntry_t* direntry)
+{
+    uint32_t low = le16_to_cpu(direntry->begin);
+    uint32_t high = le16_to_cpu(direntry->begin_hi);
+
+    return low | (high << 16);
+}
+
+static inline uint32_t filesize_of_direntry(const direntry_t* direntry) /* size field converted from little-endian */
+{
+    return le32_to_cpu(direntry->size);
+}
+
+/* Store the first cluster `begin` into a directory entry, split into its
+ * low and high 16 bit halves, each in little-endian order. */
+static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin)
+{
+    uint16_t low_half = begin & 0xffff;
+    uint16_t high_half = (begin >> 16) & 0xffff;
+
+    direntry->begin = cpu_to_le16(low_half);
+    direntry->begin_hi = cpu_to_le16(high_half);
+}
+
+/* fat functions */
+
+/* Compute the checksum of a short (8.3) name that
+ * create_short_and_long_name() stores in the long-name entries: for each of
+ * the 11 name/extension bytes the running sum is rotated right by one bit
+ * and the byte is added. */
+static inline uint8_t fat_chksum(const direntry_t* entry)
+{
+    uint8_t chksum=0;
+    int i;
+
+    for(i=0;i<11;i++) {
+        unsigned char c;
+
+        /* name[] has only 8 bytes (indices 0..7); the extension starts at
+         * i == 8, so the test must be i < 8 to stay inside the array */
+        c = (i < 8) ? entry->name[i] : entry->extension[i-8];
+        chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c;
+    }
+
+    return chksum;
+}
+
+/* Pack the local time for `time` into a 16 bit little-endian FAT field.
+ * With return_time != 0 the result is the time (seconds/2, minutes, hours),
+ * otherwise the date (day, month, years since 1980). */
+static uint16_t fat_datetime(time_t time,int return_time) {
+    struct tm* t;
+    int packed;
+#ifdef _WIN32
+    t=localtime(&time); /* this is not thread safe */
+#else
+    struct tm t1;
+    t=&t1;
+    localtime_r(&time,t);
+#endif
+    if (return_time)
+        packed = (t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11);
+    else
+        packed = (t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9);
+    return cpu_to_le16(packed);
+}
+
+/* Store `value` as the FAT entry of `cluster`, using the entry width that
+ * matches s->fat_type: 32 bit, 16 bit, or packed 12 bit for anything else. */
+static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value)
+{
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* entry32 = array_get(&(s->fat), cluster);
+        *entry32 = cpu_to_le32(value);
+        break;
+    }
+    case 16: {
+        uint16_t* entry16 = array_get(&(s->fat), cluster);
+        *entry16 = cpu_to_le16(value & 0xffff);
+        break;
+    }
+    default: {
+        /* 12 bit entries: two entries share three bytes */
+        int offset = (cluster*3/2);
+        unsigned char* p = array_get(&(s->fat), offset);
+
+        if (cluster & 1) {
+            p[0] = (p[0]&0xf) | ((value&0xf)<<4);
+            p[1] = (value>>4);
+        } else {
+            p[0] = value&0xff;
+            p[1] = (p[1]&0xf0) | ((value>>8)&0xf);
+        }
+        break;
+    }
+    }
+}
+
+/* Read the FAT entry of `cluster`, using the entry width that matches
+ * s->fat_type: 32 bit, 16 bit, or packed 12 bit for anything else. */
+static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster)
+{
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* entry32 = array_get(&(s->fat), cluster);
+        return le32_to_cpu(*entry32);
+    }
+    case 16: {
+        uint16_t* entry16 = array_get(&(s->fat), cluster);
+        return le16_to_cpu(*entry16);
+    }
+    default: {
+        /* 12 bit entries: odd clusters use the upper 12 bits of the pair */
+        const uint8_t* pair = (uint8_t*)(s->fat.pointer) + cluster*3/2;
+        int both = pair[0] | (pair[1] << 8);
+        int shift = (cluster & 1) ? 4 : 0;
+
+        return (both >> shift) & 0x0fff;
+    }
+    }
+}
+
+/* Returns -1 when fat_entry is above s->max_fat_value - 8 (the end-of-chain
+ * range), 0 otherwise. */
+static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry)
+{
+    return (fat_entry > s->max_fat_value - 8) ? -1 : 0;
+}
+
+/* Allocate and zero the FAT array for s->sectors_per_fat sectors and set
+ * s->max_fat_value for the FAT type.  For FAT12 the array is byte-wise
+ * (item size 1), otherwise one item per entry (2 or 4 bytes). */
+static inline void init_fat(BDRVVVFATState* s)
+{
+    int last_index;
+
+    if (s->fat_type == 12) {
+        array_init(&(s->fat), 1);
+        last_index = s->sectors_per_fat * 0x200 * 3 / 2 - 1;
+    } else {
+        array_init(&(s->fat), (s->fat_type==32?4:2));
+        last_index = s->sectors_per_fat * 0x200 / s->fat.item_size - 1;
+    }
+    array_ensure_allocated(&(s->fat), last_index);
+    memset(s->fat.pointer, 0, s->fat.size);
+
+    if (s->fat_type == 12)
+        s->max_fat_value = 0xfff;
+    else if (s->fat_type == 16)
+        s->max_fat_value = 0xffff;
+    else if (s->fat_type == 32)
+        s->max_fat_value = 0x0fffffff;
+    else
+        s->max_fat_value = 0; /* error... */
+}
+
+/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */
+/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled!
+ *
+ * Appends the directory entries for `filename` to s->directory and returns
+ * the short-name entry.  For "." and ".." (is_dot != 0) only a space-padded
+ * short entry is created.  Otherwise the long-name entries are appended
+ * first, then an 8.3 short name is derived (upper-cased, unwanted characters
+ * replaced by '_'), made unique among the entries from directory_start on,
+ * and its checksum is written into the long-name entries. */
+static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
+	unsigned int directory_start, const char* filename, int is_dot)
+{
+    int i,j,long_index=s->directory.next;
+    direntry_t* entry = NULL;
+    direntry_t* entry_long = NULL;
+
+    if(is_dot) {
+	entry=array_get_next(&(s->directory));
+	memset(entry->name,0x20,11);
+	memcpy(entry->name,filename,strlen(filename));
+	return entry;
+    }
+
+    entry_long=create_long_filename(s,filename);
+
+    i = strlen(filename);
+    for(j = i - 1; j>0  && filename[j]!='.';j--); /* j: position of the last '.', or <= 0 if there is no usable one */
+    if (j > 0)
+	i = (j > 8 ? 8 : j);
+    else if (i > 8)
+	i = 8;
+
+    entry=array_get_next(&(s->directory));
+    memset(entry->name,0x20,11); /* covers name[8] and extension[3], which are adjacent in the packed struct */
+    memcpy(entry->name, filename, i);
+
+    if(j > 0)
+	for (i = 0; i < 3 && filename[j+1+i]; i++)
+	    entry->extension[i] = filename[j+1+i];
+
+    /* upcase & remove unwanted characters */
+    for(i=10;i>=0;i--) {
+	if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--);
+	if(entry->name[i]<=' ' || entry->name[i]>0x7f
+		|| strchr(".*?<>|\":/\\[];,+='",entry->name[i]))
+	    entry->name[i]='_';
+        else if(entry->name[i]>='a' && entry->name[i]<='z')
+            entry->name[i]+='A'-'a';
+    }
+
+    /* mangle duplicates */
+    while(1) {
+	direntry_t* entry1=array_get(&(s->directory),directory_start);
+	int j; /* shadows the outer j */
+
+	for(;entry1<entry;entry1++)
+	    if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11))
+		break; /* found dupe */
+	if(entry1==entry) /* no dupe found */
+	    break;
+
+	/* use all 8 characters of name */
+	if(entry->name[7]==' ') {
+	    int j;
+	    for(j=6;j>0 && entry->name[j]==' ';j--)
+		entry->name[j]='~';
+	}
+
+	/* increment number */
+	for(j=7;j>0 && entry->name[j]=='9';j--)
+	    entry->name[j]='0';
+	if(j>0) {
+	    if(entry->name[j]<'0' || entry->name[j]>'9')
+	        entry->name[j]='0';
+	    else
+	        entry->name[j]++;
+	}
+    }
+
+    /* calculate checksum; propagate to long name */
+    if(entry_long) {
+        uint8_t chksum=fat_chksum(entry);
+
+	/* calculate anew, because realloc could have taken place */
+	entry_long=array_get(&(s->directory),long_index);
+	while(entry_long<entry && is_long_name(entry_long)) {
+	    entry_long->reserved[1]=chksum;
+	    entry_long++;
+	}
+    }
+
+    return entry;
+}
+
+/*
+ * Read a directory. (the index of the corresponding mapping must be passed).
+ *
+ * Each host directory entry that can be stat()ed gets directory entries in
+ * s->directory ("." and ".." are skipped while first_cluster is 0);
+ * sub-directories and non-empty files additionally get a mapping appended
+ * to s->mapping, which takes ownership of the allocated path string.
+ *
+ * Returns 0 on success, -1 if the directory cannot be opened, and -2 if a
+ * file is larger than 2GB.
+ */
+static int read_directory(BDRVVVFATState* s, int mapping_index)
+{
+    mapping_t* mapping = array_get(&(s->mapping), mapping_index);
+    direntry_t* direntry;
+    const char* dirname = mapping->path;
+    int first_cluster = mapping->begin;
+    int parent_index = mapping->info.dir.parent_mapping_index;
+    mapping_t* parent_mapping = (mapping_t*)
+        (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL);
+    int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1;
+
+    DIR* dir=opendir(dirname);
+    struct dirent* entry;
+    int i;
+
+    assert(mapping->mode & MODE_DIRECTORY);
+
+    if(!dir) {
+        mapping->end = mapping->begin;
+        return -1;
+    }
+
+    i = mapping->info.dir.first_dir_index =
+        first_cluster == 0 ? 0 : s->directory.next;
+
+    /* actually read the directory, and allocate the mappings */
+    while((entry=readdir(dir))) {
+        unsigned int length=strlen(dirname)+2+strlen(entry->d_name);
+        char* buffer;
+        direntry_t* file_entry;
+        struct stat st;
+        int is_dot=!strcmp(entry->d_name,".");
+        int is_dotdot=!strcmp(entry->d_name,"..");
+
+        if(first_cluster == 0 && (is_dotdot || is_dot))
+            continue;
+
+        buffer=(char*)qemu_malloc(length);
+        snprintf(buffer,length,"%s/%s",dirname,entry->d_name);
+
+        if(stat(buffer,&st)<0) {
+            free(buffer);
+            continue;
+        }
+
+        /* create directory entry for this file */
+        file_entry=create_short_and_long_name(s, i, entry->d_name,
+                is_dot || is_dotdot);
+        file_entry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20);
+        file_entry->reserved[0]=file_entry->reserved[1]=0;
+        file_entry->ctime=fat_datetime(st.st_ctime,1);
+        file_entry->cdate=fat_datetime(st.st_ctime,0);
+        file_entry->adate=fat_datetime(st.st_atime,0);
+        file_entry->begin_hi=0;
+        file_entry->mtime=fat_datetime(st.st_mtime,1);
+        file_entry->mdate=fat_datetime(st.st_mtime,0);
+        if(is_dotdot)
+            set_begin_of_direntry(file_entry, first_cluster_of_parent);
+        else if(is_dot)
+            set_begin_of_direntry(file_entry, first_cluster);
+        else
+            file_entry->begin=0; /* do that later */
+        if (st.st_size > 0x7fffffff) {
+            fprintf(stderr, "File %s is larger than 2GB\n", buffer);
+            free(buffer);
+            closedir(dir); /* do not leak the directory stream on this error path */
+            return -2;
+        }
+        file_entry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size);
+
+        /* create mapping for this file */
+        if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) {
+            s->current_mapping=(mapping_t*)array_get_next(&(s->mapping));
+            s->current_mapping->begin=0;
+            s->current_mapping->end=st.st_size;
+            /*
+             * we get the direntry of the most recent direntry, which
+             * contains the short name and all the relevant information.
+             */
+            s->current_mapping->dir_index=s->directory.next-1;
+            s->current_mapping->first_mapping_index = -1;
+            if (S_ISDIR(st.st_mode)) {
+                s->current_mapping->mode = MODE_DIRECTORY;
+                s->current_mapping->info.dir.parent_mapping_index =
+                    mapping_index;
+            } else {
+                s->current_mapping->mode = MODE_UNDEFINED;
+                s->current_mapping->info.file.offset = 0;
+            }
+            s->current_mapping->path=buffer;
+            s->current_mapping->read_only =
+                (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0;
+        } else {
+            /* ".", ".." and empty files get no mapping, so nothing took
+             * ownership of the path: release it here */
+            free(buffer);
+        }
+    }
+    closedir(dir);
+
+    /* fill with zeroes up to the end of the cluster */
+    while(s->directory.next%(0x10*s->sectors_per_cluster)) {
+        direntry_t* padding=array_get_next(&(s->directory));
+        memset(padding,0,sizeof(direntry_t));
+    }
+
+/* TODO: if there are more entries, bootsector has to be adjusted! */
+#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster)
+    if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) {
+        /* root directory */
+        int cur = s->directory.next;
+        array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1);
+        memset(array_get(&(s->directory), cur), 0,
+                (ROOT_ENTRIES - cur) * sizeof(direntry_t));
+    }
+
+     /* reget the mapping, since s->mapping was possibly realloc()ed */
+    mapping = (mapping_t*)array_get(&(s->mapping), mapping_index);
+    first_cluster += (s->directory.next - mapping->info.dir.first_dir_index)
+        * 0x20 / s->cluster_size;
+    mapping->end = first_cluster;
+
+    direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index);
+    set_begin_of_direntry(direntry, mapping->begin);
+
+    return 0;
+}
+
+static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num) /* cluster index, counted from the end of the faked sectors */
+{
+    return (sector_num-s->faked_sectors)/s->sectors_per_cluster;
+}
+
+static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) /* first sector of a cluster; inverse of sector2cluster() */
+{
+    return s->faked_sectors + s->sectors_per_cluster * cluster_num;
+}
+
+static inline uint32_t sector_offset_in_cluster(BDRVVVFATState* s,off_t sector_num) /* sector index within its cluster */
+{
+    return (sector_num-s->first_sectors_number-2*s->sectors_per_fat)%s->sectors_per_cluster; /* the subtracted sum equals faked_sectors as computed in init_directories() */
+}
+
+#ifdef DBG
+/* Return the directory entry a mapping refers to (by its dir_index), or 0
+ * for a mapping whose mode is MODE_UNDEFINED. */
+static direntry_t* get_direntry_for_mapping(BDRVVVFATState* s,mapping_t* mapping)
+{
+    direntry_t* entries = (direntry_t*)s->directory.pointer;
+
+    if (mapping->mode == MODE_UNDEFINED)
+        return 0;
+    return entries + mapping->dir_index;
+}
+#endif
+
+static int init_directories(BDRVVVFATState* s,
+	const char* dirname)
+{ /* builds the FAT, the directory entries and the mappings for dirname, then fills in the boot sector; returns 0 on success, non-zero on failure */
+    bootsector_t* bootsector;
+    mapping_t* mapping;
+    unsigned int i;
+    unsigned int cluster;
+
+    memset(&(s->first_sectors[0]),0,0x40*0x200);
+
+    s->cluster_size=s->sectors_per_cluster*0x200;
+    s->cluster_buffer=qemu_malloc(s->cluster_size);
+
+    /*
+     * The formula: sc = spf+1+spf*spc*(512*8/fat_type),
+     * where sc is sector_count,
+     * spf is sectors_per_fat,
+     * spc is sectors_per_clusters, and
+     * fat_type = 12, 16 or 32.
+     */
+    i = 1+s->sectors_per_cluster*0x200*8/s->fat_type;
+    s->sectors_per_fat=(s->sector_count+i)/i; /* round up */
+
+    array_init(&(s->mapping),sizeof(mapping_t));
+    array_init(&(s->directory),sizeof(direntry_t));
+
+    /* add volume label */
+    {
+	direntry_t* entry=array_get_next(&(s->directory));
+	entry->attributes=0x28; /* archive | volume label */
+	snprintf((char*)entry->name,11,"QEMU VVFAT"); /* NOTE(review): writes 10 characters plus a NUL, so the 11th byte is 0 rather than the space used in the boot-sector label below */
+    }
+
+    /* Now build FAT, and write back information into directory */
+    init_fat(s);
+
+    s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2; /* first sectors plus two FAT copies */
+    s->cluster_count=sector2cluster(s, s->sector_count);
+
+    mapping = array_get_next(&(s->mapping)); /* mapping 0 is the root directory */
+    mapping->begin = 0;
+    mapping->dir_index = 0;
+    mapping->info.dir.parent_mapping_index = -1;
+    mapping->first_mapping_index = -1;
+    mapping->path = strdup(dirname);
+    i = strlen(mapping->path);
+    if (i > 0 && mapping->path[i - 1] == '/')
+	mapping->path[i - 1] = '\0';
+    mapping->mode = MODE_DIRECTORY;
+    mapping->read_only = 0;
+    s->path = mapping->path;
+
+    for (i = 0, cluster = 0; i < s->mapping.next; i++) { /* s->mapping.next grows while read_directory() appends mappings */
+	/* MS-DOS expects the FAT to be 0 for the root directory
+	 * (except for the media byte). */
+	/* LATER TODO: still true for FAT32? */
+	int fix_fat = (i != 0);
+	mapping = array_get(&(s->mapping), i);
+
+        if (mapping->mode & MODE_DIRECTORY) {
+	    mapping->begin = cluster;
+	    if(read_directory(s, i)) {
+		fprintf(stderr, "Could not read directory %s\n",
+			mapping->path);
+		return -1;
+	    }
+	    mapping = array_get(&(s->mapping), i); /* re-fetch: read_directory() may have reallocated s->mapping */
+	} else {
+	    assert(mapping->mode == MODE_UNDEFINED);
+	    mapping->mode=MODE_NORMAL;
+	    mapping->begin = cluster;
+	    if (mapping->end > 0) { /* end still holds the file size in bytes set by read_directory() */
+		direntry_t* direntry = array_get(&(s->directory),
+			mapping->dir_index);
+
+		mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size;
+		set_begin_of_direntry(direntry, mapping->begin);
+	    } else {
+		mapping->end = cluster + 1;
+		fix_fat = 0;
+	    }
+	}
+
+	assert(mapping->begin < mapping->end);
+
+	/* next free cluster */
+	cluster = mapping->end;
+
+	if(cluster > s->cluster_count) {
+	    fprintf(stderr,"Directory does not fit in FAT%d (capacity %s)\n",
+		    s->fat_type,
+		    s->fat_type == 12 ? s->sector_count == 2880 ? "1.44 MB"
+								: "2.88 MB"
+				      : "504MB");
+	    return -EINVAL;
+	}
+
+	/* fix fat for entry */
+	if (fix_fat) {
+	    int j;
+	    for(j = mapping->begin; j < mapping->end - 1; j++)
+		fat_set(s, j, j+1);
+	    fat_set(s, mapping->end - 1, s->max_fat_value);
+	}
+    }
+
+    mapping = array_get(&(s->mapping), 0);
+    s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster;
+    s->last_cluster_of_root_directory = mapping->end;
+
+    /* the FAT signature */
+    fat_set(s,0,s->max_fat_value);
+    fat_set(s,1,s->max_fat_value);
+
+    s->current_mapping = NULL;
+
+    bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200); /* the last of the first sectors */
+    bootsector->jump[0]=0xeb;
+    bootsector->jump[1]=0x3e;
+    bootsector->jump[2]=0x90;
+    memcpy(bootsector->name,"QEMU    ",8);
+    bootsector->sector_size=cpu_to_le16(0x200);
+    bootsector->sectors_per_cluster=s->sectors_per_cluster;
+    bootsector->reserved_sectors=cpu_to_le16(1);
+    bootsector->number_of_fats=0x2; /* number of FATs */
+    bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10);
+    bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count);
+    bootsector->media_type=(s->fat_type!=12?0xf8:s->sector_count==5760?0xf9:0xf8); /* media descriptor */
+    s->fat.pointer[0] = bootsector->media_type; /* overwrites the low byte of FAT entry 0 set above */
+    bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat);
+    bootsector->sectors_per_track=cpu_to_le16(s->bs->secs);
+    bootsector->number_of_heads=cpu_to_le16(s->bs->heads);
+    bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f);
+    bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0);
+
+    /* LATER TODO: if FAT32, this is wrong */
+    bootsector->u.fat16.drive_number=s->fat_type==12?0:0x80; /* assume this is hda (TODO) */
+    bootsector->u.fat16.current_head=0;
+    bootsector->u.fat16.signature=0x29;
+    bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
+
+    memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+    memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12   ":s->fat_type==16?"FAT16   ":"FAT32   "),8);
+    bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
+
+    return 0;
+}
+
+#ifdef DEBUG
+static BDRVVVFATState *vvv = NULL;
+#endif
+
+static int enable_write_target(BDRVVVFATState *s);
+static int is_consistent(BDRVVVFATState *s);
+
+static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags)
+{ /* opens a "fat:[options:]<directory>" name; options seen below are :floppy:, :32:, :16:, :12: and :rw:; returns 0 on success, -1 on failure */
+    BDRVVVFATState *s = bs->opaque;
+    int floppy = 0;
+    int i;
+
+#ifdef DEBUG
+    vvv = s;
+#endif
+
+DLOG(if (stderr == NULL) {
+    stderr = fopen("vvfat.log", "a");
+    setbuf(stderr, NULL);
+})
+
+    s->bs = bs;
+
+    s->fat_type=16;
+    /* LATER TODO: if FAT32, adjust */
+    s->sectors_per_cluster=0x10;
+    /* 504MB disk*/
+    bs->cyls=1024; bs->heads=16; bs->secs=63;
+
+    s->current_cluster=0xffffffff;
+
+    s->first_sectors_number=0x40;
+    /* read only is the default for safety */
+    bs->read_only = 1;
+    s->qcow = s->write_target = NULL;
+    s->qcow_filename = NULL;
+    s->fat2 = NULL;
+    s->downcase_short_names = 1;
+
+    if (!strstart(dirname, "fat:", NULL))
+	return -1;
+
+    if (strstr(dirname, ":floppy:")) {
+	floppy = 1;
+	s->fat_type = 12;
+	s->first_sectors_number = 1; /* no partition table, so init_mbr() is skipped below */
+	s->sectors_per_cluster=2;
+	bs->cyls = 80; bs->heads = 2; bs->secs = 36;
+    }
+
+    s->sector_count=bs->cyls*bs->heads*bs->secs;
+
+    if (strstr(dirname, ":32:")) {
+	fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n");
+	s->fat_type = 32;
+    } else if (strstr(dirname, ":16:")) {
+	s->fat_type = 16;
+    } else if (strstr(dirname, ":12:")) {
+	s->fat_type = 12;
+	s->sector_count=2880;
+    }
+
+    if (strstr(dirname, ":rw:")) {
+	if (enable_write_target(s))
+	    return -1;
+	bs->read_only = 0;
+    }
+
+    i = strrchr(dirname, ':') - dirname; /* offset of the last ':'; one always exists because of the "fat:" prefix */
+    assert(i >= 3);
+    if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1]))
+	/* workaround for DOS drive names */
+	dirname += i-1;
+    else
+	dirname += i+1;
+
+    bs->total_sectors=bs->cyls*bs->heads*bs->secs;
+
+    if(init_directories(s, dirname))
+	return -1; /* NOTE(review): nothing set up by enable_write_target()/init_directories() is released on this path */
+
+    s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count;
+
+    if(s->first_sectors_number==0x40)
+	init_mbr(s);
+
+    /* for some reason or other, MS-DOS does not like to know about CHS... */
+    if (floppy)
+	bs->heads = bs->cyls = bs->secs = 0;
+
+    //    assert(is_consistent(s));
+    return 0;
+}
+
+/*
+ * Drop the cached "current" mapping, closing its file descriptor when one
+ * is recorded (a value of 0 means "no descriptor"), and invalidate the
+ * cached cluster number.
+ */
+static inline void vvfat_close_current_file(BDRVVVFATState *s)
+{
+    if (s->current_mapping != NULL) {
+        s->current_mapping = NULL;
+        if (s->current_fd != 0) {
+            close(s->current_fd);
+            s->current_fd = 0;
+        }
+    }
+
+    s->current_cluster = -1;
+}
+
+/* mappings between index1 and index2-1 are supposed to be ordered
+ * return value is the index of the last mapping for which end>cluster_num
+ *
+ * Binary search over s->mapping: every iteration probes the midpoint index3
+ * of index1 and index2 and moves one of the two bounds onto it, until the
+ * midpoint coincides with a bound.
+ * NOTE(review): the final return yields index1 when that mapping's end is
+ * still > cluster_num and index2 otherwise, which reads like the *first*
+ * mapping with end>cluster_num rather than the last -- confirm the wording.
+ * The callers visible in this file pass index1 = 0, index2 = s->mapping.next
+ * and treat a result >= s->mapping.next as "no such mapping".
+ */
+static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2)
+{
+    int index3=index1+1; /* placeholder: recomputed at the top of every iteration */
+    while(1) {
+	mapping_t* mapping;
+	index3=(index1+index2)/2; /* probe the midpoint */
+	mapping=array_get(&(s->mapping),index3);
+	assert(mapping->begin < mapping->end);
+	if(mapping->begin>=cluster_num) { /* probe starts at or after cluster_num: pull the upper bound down */
+	    assert(index2!=index3 || index2==0);
+	    if(index2==index3) /* per the assert above this only happens with index2==0 */
+		return index1;
+	    index2=index3;
+	} else { /* probe starts before cluster_num: push the lower bound up */
+	    if(index1==index3) /* the range cannot shrink any further */
+		return mapping->end<=cluster_num ? index2 : index1;
+	    index1=index3;
+	}
+	assert(index1<=index2);
+	DLOG(mapping=array_get(&(s->mapping),index1);
+	assert(mapping->begin<=cluster_num);
+	assert(index2 >= s->mapping.next ||
+		((mapping = array_get(&(s->mapping),index2)) &&
+		mapping->end>cluster_num)));
+    }
+}
+
+/*
+ * Return the mapping whose cluster range [begin, end) contains cluster_num,
+ * or NULL when no mapping covers that cluster.
+ */
+static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num)
+{
+    mapping_t* found;
+    int pos = find_mapping_for_cluster_aux(s, cluster_num, 0, s->mapping.next);
+
+    /* the search ran off the end of the array */
+    if (pos >= s->mapping.next) {
+        return NULL;
+    }
+
+    found = array_get(&(s->mapping), pos);
+    /* the closest candidate starts after the requested cluster: a gap */
+    if (found->begin > cluster_num) {
+        return NULL;
+    }
+
+    assert(found->begin<=cluster_num && found->end>cluster_num);
+    return found;
+}
+
+/*
+ * Linear search for the mapping whose path equals the given path.
+ * Only mappings with a negative first_mapping_index are considered.
+ * The mappings are ordered by cluster, not by path, hence the O(n) scan.
+ * Returns NULL when nothing matches.
+ */
+static inline mapping_t* find_mapping_for_path(BDRVVVFATState* s,
+	const char* path)
+{
+    int idx = 0;
+
+    while (idx < s->mapping.next) {
+        mapping_t* candidate = array_get(&(s->mapping), idx);
+
+        if (candidate->first_mapping_index < 0 &&
+                strcmp(path, candidate->path) == 0) {
+            return candidate;
+        }
+        idx++;
+    }
+
+    return NULL;
+}
+
+/*
+ * Make the file behind mapping the currently open file.
+ * Nothing is reopened when the current mapping already has the same path.
+ * Returns 0 on success, -1 if mapping is NULL or the file cannot be opened
+ * (in which case the previously open file is left untouched).
+ */
+static int open_file(BDRVVVFATState* s,mapping_t* mapping)
+{
+    int new_fd;
+
+    if (mapping == NULL) {
+        return -1;
+    }
+
+    /* same path as the file we already have open: nothing to do */
+    if (s->current_mapping != NULL &&
+            strcmp(s->current_mapping->path, mapping->path) == 0) {
+        return 0;
+    }
+
+    /* open file */
+    new_fd = open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE);
+    if (new_fd < 0) {
+        return -1;
+    }
+
+    vvfat_close_current_file(s);
+    s->current_fd = new_fd;
+    s->current_mapping = mapping;
+    return 0;
+}
+
+static inline int read_cluster(BDRVVVFATState *s,int cluster_num)
+{   /*
+     * Make s->cluster point at the contents of cluster_num and remember it
+     * in s->current_cluster; nothing is done when it is already current.
+     * Directory clusters are served from s->directory, file clusters are
+     * read from the host file into s->cluster_buffer.
+     *
+     * Returns 0 on success, -2 if open_file() fails (including the case
+     * where no mapping covers the cluster), -3 if lseek() fails and -1 if
+     * read() fails.
+     */
+    if(s->current_cluster != cluster_num) {
+	int result=0;
+	off_t offset;
+	assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY));
+	if(!s->current_mapping
+		|| s->current_mapping->begin>cluster_num
+		|| s->current_mapping->end<=cluster_num) {
+	    /* the cached mapping does not cover cluster_num:
+	     * binary search of mappings for file */
+	    mapping_t* mapping=find_mapping_for_cluster(s,cluster_num);
+
+	    assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end));
+
+	    if (mapping && mapping->mode & MODE_DIRECTORY) {
+		vvfat_close_current_file(s);
+		s->current_mapping = mapping;
+read_cluster_directory: /* also entered by goto when the cached mapping is a directory */
+		offset = s->cluster_size*(cluster_num-s->current_mapping->begin);
+		s->cluster = (unsigned char*)s->directory.pointer+offset
+			+ 0x20*s->current_mapping->info.dir.first_dir_index; /* 0x20: presumably the size of one direntry -- confirm */
+		assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0);
+		assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size);
+		s->current_cluster = cluster_num;
+		return 0;
+	    }
+
+	    if(open_file(s,mapping)) /* open_file() rejects a NULL mapping, so an unmapped cluster ends here */
+		return -2;
+	} else if (s->current_mapping->mode & MODE_DIRECTORY)
+	    goto read_cluster_directory;
+
+	assert(s->current_fd);
+
+	/* NOTE(review): info.file.offset is added to a byte offset here, while
+	 * commit_mappings() advances it by a number of clusters -- confirm units */
+	offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset;
+	if(lseek(s->current_fd, offset, SEEK_SET)!=offset)
+	    return -3;
+	s->cluster=s->cluster_buffer;
+	result=read(s->current_fd,s->cluster,s->cluster_size); /* a short read (result >= 0) is accepted as success */
+	if(result<0) {
+	    s->current_cluster = -1;
+	    return -1;
+	}
+	s->current_cluster = cluster_num;
+    }
+    return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Debug helper: dump len bytes starting at address to stderr, 16 bytes per
+ * row, as a hex column followed by a printable-character column.
+ */
+static void hexdump(const void* address, uint32_t len)
+{
+    const unsigned char* bytes = address;
+    int row, col;
+
+    for (row = 0; row < len; row += 16) {
+        /* hex column, padded with blanks on a short last row */
+        for (col = 0; col < 16; col++) {
+            if (row + col < len)
+                fprintf(stderr, "%02x ", bytes[row + col]);
+            else
+                fprintf(stderr, "   ");
+        }
+        fprintf(stderr, " ");
+        /* character column: bytes outside ' '..0x7f are shown as '.' */
+        for (col = 0; col < 16 && row + col < len; col++) {
+            unsigned char ch = bytes[row + col];
+            fprintf(stderr, "%c", (ch < ' ' || ch > 0x7f) ? '.' : ch);
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+/*
+ * Debug helper: print one directory entry to stderr.
+ *
+ * For a long-name entry the low byte of each stored character is printed;
+ * for any other entry the 11 name bytes are printed together with the
+ * attributes, first cluster and size.  A NULL direntry only prints the
+ * prefix (without a newline).
+ */
+static void print_direntry(const direntry_t* direntry)
+{
+    int j = 0;
+    char buffer[1024];
+
+    /* %p: casting the pointer to int truncates it on 64-bit hosts */
+    fprintf(stderr, "direntry %p: ", (const void*)direntry);
+    if(!direntry)
+        return;
+
+/* append one character to buffer; anything comparing below ' ' becomes 0xb0 */
+#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;}
+
+    if(is_long_name(direntry)) {
+        const unsigned char* c=(const unsigned char*)direntry;
+        int i;
+        /* the three character runs of a long-name entry; each run stops at
+         * the first 0x00 or 0xff byte */
+        for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2)
+            ADD_CHAR(c[i]);
+        for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2)
+            ADD_CHAR(c[i]);
+        for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2)
+            ADD_CHAR(c[i]);
+        buffer[j] = 0;
+        fprintf(stderr, "%s\n", buffer);
+    } else {
+        int i;
+        for(i=0;i<11;i++)
+            ADD_CHAR(direntry->name[i]);
+        buffer[j] = 0;
+        fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n",
+                buffer,
+                direntry->attributes,
+                begin_of_direntry(direntry),le32_to_cpu(direntry->size));
+    }
+}
+
+/*
+ * Debug helper: print the fields of one mapping to stderr, followed by the
+ * directory-specific or file-specific part of mapping->info depending on
+ * MODE_DIRECTORY.
+ */
+static void print_mapping(const mapping_t* mapping)
+{
+    /* %p: casting the pointer to int truncates it on 64-bit hosts */
+    fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, first_mapping_index = %d, name = %s, mode = 0x%x, " , (const void*)mapping, mapping->begin, mapping->end, mapping->dir_index, mapping->first_mapping_index, mapping->path, mapping->mode);
+    if (mapping->mode & MODE_DIRECTORY)
+        fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index);
+    else
+        fprintf(stderr, "offset = %d\n", mapping->info.file.offset);
+}
+#endif
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Each sector comes from the first source that applies:
+ *  - the qcow overlay, when one exists and reports the sector allocated
+ *    (a whole allocated run is read at once),
+ *  - the faked area (first sectors, then the two FAT copies, both served
+ *    from s->fat),
+ *  - the cluster data provided by read_cluster(); a sector whose cluster
+ *    cannot be read is returned zero-filled.
+ *
+ * Returns 0 on success, -1 when a sector lies beyond s->sector_count or the
+ * overlay read fails.
+ */
+static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVVFATState *s = bs->opaque;
+    int i;
+
+    for (i = 0; i < nb_sectors; i++, sector_num++) {
+        uint8_t *dest;
+
+        if (sector_num >= s->sector_count) {
+            return -1;
+        }
+
+        if (s->qcow) {
+            int n;
+            if (s->qcow->drv->bdrv_is_allocated(s->qcow,
+                        sector_num, nb_sectors - i, &n)) {
+DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n));
+                if (s->qcow->drv->bdrv_read(s->qcow, sector_num, buf + i * 0x200, n)) {
+                    return -1;
+                }
+                /* skip the run just read; the loop header adds the final +1 */
+                i += n - 1;
+                sector_num += n - 1;
+                continue;
+            }
+DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
+        }
+
+        dest = buf + i * 0x200;
+
+        if (sector_num >= s->faked_sectors) {
+            uint32_t sector = sector_num - s->faked_sectors;
+            uint32_t sector_offset_in_cluster = sector % s->sectors_per_cluster;
+            uint32_t cluster_num = sector / s->sectors_per_cluster;
+
+            if (read_cluster(s, cluster_num) != 0) {
+                /* LATER TODO: strict: return -1; */
+                memset(dest, 0, 0x200);
+                continue;
+            }
+            memcpy(dest, s->cluster + sector_offset_in_cluster * 0x200, 0x200);
+        } else if (sector_num < s->first_sectors_number) {
+            memcpy(dest, &(s->first_sectors[sector_num * 0x200]), 0x200);
+        } else if (sector_num - s->first_sectors_number < s->sectors_per_fat) {
+            /* first FAT copy */
+            memcpy(dest, &(s->fat.pointer[(sector_num - s->first_sectors_number) * 0x200]), 0x200);
+        } else if (sector_num - s->first_sectors_number - s->sectors_per_fat < s->sectors_per_fat) {
+            /* second FAT copy, served from the same in-memory table */
+            memcpy(dest, &(s->fat.pointer[(sector_num - s->first_sectors_number - s->sectors_per_fat) * 0x200]), 0x200);
+        }
+    }
+    return 0;
+}
+
+/* LATER TODO: statify all functions */
+
+/*
+ * Idea of the write support (use snapshot):
+ *
+ * 1. check if all data is consistent, recording renames, modifications,
+ *    new files and directories (in s->commits).
+ *
+ * 2. if the data is not consistent, stop committing
+ *
+ * 3. handle renames, and create new files and directories (do not yet
+ *    write their contents)
+ *
+ * 4. walk the directories, fixing the mapping and direntries, and marking
+ *    the handled mappings as not deleted
+ *
+ * 5. commit the contents of the files
+ *
+ * 6. handle deleted files and directories
+ *
+ */
+
+typedef struct commit_t {
+    char* path; /* NULL for ACTION_WRITEOUT; otherwise a heap string that clear_commits() frees */
+    union {
+	struct { uint32_t cluster; } rename; /* filled by schedule_rename() */
+	struct { int dir_index; uint32_t modified_offset; } writeout; /* filled by schedule_writeout() */
+	struct { uint32_t first_cluster; } new_file; /* filled by schedule_new_file() */
+	struct { uint32_t cluster; } mkdir; /* filled by schedule_mkdir() */
+    } param; /* the schedule_*() helpers set the member matching action */
+    /* DELETEs and RMDIRs are handled differently: see handle_deletes() */
+    enum {
+	ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR
+    } action;
+} commit_t;
+
+/*
+ * Empty the list of scheduled commits, freeing the path owned by every
+ * entry.  ACTION_WRITEOUT entries carry no path (asserted).
+ */
+static void clear_commits(BDRVVVFATState* s)
+{
+    int n;
+DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next));
+    for (n = 0; n < s->commits.next; n++) {
+        commit_t* entry = array_get(&(s->commits), n);
+
+        assert(entry->path || entry->action == ACTION_WRITEOUT);
+        if (entry->action == ACTION_WRITEOUT) {
+            assert(entry->path == NULL);
+            continue;
+        }
+        assert(entry->path);
+        free(entry->path);
+    }
+    s->commits.next = 0;
+}
+
+/*
+ * Queue an ACTION_RENAME commit for the entry starting at cluster.
+ * new_path is stored as-is; clear_commits() frees it later.
+ */
+static void schedule_rename(BDRVVVFATState* s,
+	uint32_t cluster, char* new_path)
+{
+    commit_t* entry = array_get_next(&(s->commits));
+
+    entry->action = ACTION_RENAME;
+    entry->param.rename.cluster = cluster;
+    entry->path = new_path;
+}
+
+/*
+ * Queue an ACTION_WRITEOUT commit for the directory entry dir_index,
+ * recording modified_offset.  Write-out commits carry no path.
+ */
+static void schedule_writeout(BDRVVVFATState* s,
+	int dir_index, uint32_t modified_offset)
+{
+    commit_t* entry = array_get_next(&(s->commits));
+
+    entry->action = ACTION_WRITEOUT;
+    entry->param.writeout.dir_index = dir_index;
+    entry->param.writeout.modified_offset = modified_offset;
+    entry->path = NULL;
+}
+
+/*
+ * Queue an ACTION_NEW_FILE commit for a file whose chain starts at
+ * first_cluster.  path is stored as-is; clear_commits() frees it later.
+ */
+static void schedule_new_file(BDRVVVFATState* s,
+	char* path, uint32_t first_cluster)
+{
+    commit_t* entry = array_get_next(&(s->commits));
+
+    entry->action = ACTION_NEW_FILE;
+    entry->param.new_file.first_cluster = first_cluster;
+    entry->path = path;
+}
+
+/*
+ * Queue an ACTION_MKDIR commit for a directory starting at cluster.
+ * path is stored as-is; clear_commits() frees it later.
+ */
+static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path)
+{
+    commit_t* entry = array_get_next(&(s->commits));
+
+    entry->action = ACTION_MKDIR;
+    entry->param.mkdir.cluster = cluster;
+    entry->path = path;
+}
+
+typedef struct {
+    /*
+     * Since the sequence number is at most 0x3f, and the filename
+     * length is at most 13 times the sequence number, the maximal
+     * filename length is 0x3f * 13 bytes.
+     * The extra byte holds the terminating NUL.
+     */
+    unsigned char name[0x3f * 13 + 1];
+    int checksum, len; /* checksum: lfn_init() stores 0x100, which no checksum byte can equal; len: length of name */
+    int sequence_number; /* sequence number of the long-name entry parsed last (see parse_long_name()) */
+} long_file_name;
+
+/*
+ * Reset lfn to "no long name collected": zero length and sequence number,
+ * and a checksum value (0x100) that lies outside the range of a byte.
+ */
+static void lfn_init(long_file_name* lfn)
+{
+    lfn->len = 0;
+    lfn->sequence_number = 0;
+    lfn->checksum = 0x100;
+}
+
+/*
+ * Feed one directory entry into the long file name being assembled in lfn.
+ *
+ * Long-name entries arrive highest sequence number first; the entry with
+ * bit 0x40 set in its first byte starts a new name and fixes the sequence
+ * number and checksum that the following entries must match.
+ *
+ * return 0 if parsed successfully, > 0 if no long name, < 0 if error:
+ *   -1  unexpected (or zero) sequence number
+ *   -2  checksum differs from the one of the first entry
+ *   -3  a field that must be zero is not
+ *   -4  a character with a non-zero high byte that is not valid padding
+ */
+static int parse_long_name(long_file_name* lfn,
+	const direntry_t* direntry)
+{
+    int i, j, offset;
+    const unsigned char* pointer = (const unsigned char*)direntry;
+
+    if (!is_long_name(direntry))
+        return 1;
+
+    if (pointer[0] & 0x40) {
+        /* first entry of a new name: (re)initialize the state */
+        lfn->sequence_number = pointer[0] & 0x3f;
+        lfn->checksum = pointer[13];
+        lfn->name[0] = 0;
+        lfn->name[lfn->sequence_number * 13] = 0;
+    } else if ((pointer[0] & 0x3f) != --lfn->sequence_number)
+        return -1;
+    else if (pointer[13] != lfn->checksum)
+        return -2;
+    else if (pointer[12] || pointer[26] || pointer[27])
+        return -3;
+
+    /*
+     * Sequence numbers start at 1.  The entry comes from guest-written
+     * data, and a value of 0 would make the offset below negative and
+     * write in front of lfn->name.
+     */
+    if (lfn->sequence_number < 1)
+        return -1;
+
+    /* each entry contributes 13 characters at its slot in the name */
+    offset = 13 * (lfn->sequence_number - 1);
+    for (i = 0, j = 1; i < 13; i++, j+=2) {
+        /* the characters live in three runs: bytes 1-10, 14-25 and 28-31 */
+        if (j == 11)
+            j = 14;
+        else if (j == 26)
+            j = 28;
+
+        if (pointer[j+1] == 0)
+            lfn->name[offset + i] = pointer[j];
+        else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0)
+            return -4;
+        else
+            /* 0xff high byte is only accepted in the first entry */
+            lfn->name[offset + i] = 0;
+    }
+
+    if (pointer[0] & 0x40)
+        lfn->len = offset + strlen((char*)lfn->name + offset);
+
+    return 0;
+}
+
+/*
+ * Build "name.ext" in lfn->name from the 8.3 fields of direntry, dropping
+ * the blank padding and lower-casing when s->downcase_short_names is set.
+ *
+ * returns 0 if successful, >0 if no short_name, and <0 on error
+ * (-1: invalid character in the name, -2: invalid character in the
+ * extension).
+ */
+static int parse_short_name(BDRVVVFATState* s,
+	long_file_name* lfn, direntry_t* direntry)
+{
+    int pos, base_last, ext_last;
+
+    if (!is_short_name(direntry)) {
+        return 1;
+    }
+
+    /* index of the last non-blank character of the base name, -1 if none */
+    base_last = 7;
+    while (base_last >= 0 && direntry->name[base_last] == ' ') {
+        base_last--;
+    }
+
+    for (pos = 0; pos <= base_last; pos++) {
+        if (direntry->name[pos] <= ' ' || direntry->name[pos] > 0x7f) {
+            return -1;
+        }
+        if (s->downcase_short_names) {
+            lfn->name[pos] = qemu_tolower(direntry->name[pos]);
+        } else {
+            lfn->name[pos] = direntry->name[pos];
+        }
+    }
+
+    /* same for the extension */
+    ext_last = 2;
+    while (ext_last >= 0 && direntry->extension[ext_last] == ' ') {
+        ext_last--;
+    }
+
+    if (ext_last < 0) {
+        /* no extension: terminate right after the base name */
+        lfn->name[pos] = '\0';
+    } else {
+        lfn->name[pos++] = '.';
+        lfn->name[pos + ext_last + 1] = '\0';
+        /* the extension is copied back to front */
+        for (; ext_last >= 0; ext_last--) {
+            if (direntry->extension[ext_last] <= ' ' ||
+                    direntry->extension[ext_last] > 0x7f) {
+                return -2;
+            }
+            if (s->downcase_short_names) {
+                lfn->name[pos + ext_last] = qemu_tolower(direntry->extension[ext_last]);
+            } else {
+                lfn->name[pos + ext_last] = direntry->extension[ext_last];
+            }
+        }
+    }
+
+    lfn->len = strlen((char*)lfn->name);
+
+    return 0;
+}
+
+/*
+ * Return the successor of cluster according to the modified FAT (s->fat2).
+ *
+ * Clusters below s->last_cluster_of_root_directory are not looked up: they
+ * are reported as one consecutive chain ending in s->max_fat_value.
+ * Otherwise the entry is decoded according to s->fat_type (32, 16, or
+ * 12-bit packed for any other value).
+ */
+static inline uint32_t modified_fat_get(BDRVVVFATState* s,
+	unsigned int cluster)
+{
+    if (cluster < s->last_cluster_of_root_directory) {
+        if (cluster + 1 != s->last_cluster_of_root_directory) {
+            return cluster + 1;
+        }
+        return s->max_fat_value;
+    }
+
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* entry = ((uint32_t*)s->fat2) + cluster;
+        return le32_to_cpu(*entry);
+    }
+    case 16: {
+        uint16_t* entry = ((uint16_t*)s->fat2) + cluster;
+        return le16_to_cpu(*entry);
+    }
+    default: {
+        /* two 12-bit entries share three bytes; odd clusters use the high bits */
+        const uint8_t* x = s->fat2 + cluster * 3 / 2;
+        return ((x[0] | (x[1] << 8)) >> (cluster & 1 ? 4 : 0)) & 0x0fff;
+    }
+    }
+}
+
+/*
+ * Report whether any sector of cluster_num is allocated in the qcow
+ * overlay.  Returns 0 when there is no overlay or no sector is allocated,
+ * otherwise the non-zero result of the first bdrv_is_allocated() hit.
+ */
+static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
+{
+    int sector, dummy;
+
+    if (s->qcow == NULL) {
+        return 0;
+    }
+
+    for (sector = 0; sector < s->sectors_per_cluster; sector++) {
+        int allocated = s->qcow->drv->bdrv_is_allocated(s->qcow,
+                cluster2sector(s, cluster_num) + sector, 1, &dummy);
+        if (allocated) {
+            return allocated;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Return the part of path after its last '/', or path itself when it
+ * contains no '/'.  The result points into path.
+ */
+static const char* get_basename(const char* path)
+{
+    const char* last_slash = strrchr(path, '/');
+
+    return last_slash != NULL ? last_slash + 1 /* strip '/' */ : path;
+}
+
+/*
+ * The array s->used_clusters holds the states of the clusters. If it is
+ * part of a file, it has bit 2 set, in case of a directory, bit 1. If it
+ * was modified, bit 3 is set.
+ * If any cluster is allocated, but not part of a file or directory, this
+ * driver refuses to commit.
+ *
+ * The bits above are counted from 1, i.e. they are the values 1, 2 and 4.
+ * USED_ANY is the mask USED_DIRECTORY | USED_FILE; is_consistent() clears
+ * it for every cluster before walking the directories.
+ */
+typedef enum {
+     USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4
+} used_t;
+
+/*
+ * get_cluster_count_for_direntry() not only determines how many clusters
+ * are occupied by direntry, but also if it was renamed or modified.
+ *
+ * A file is thought to be renamed *only* if there already was a file with
+ * exactly the same first cluster, but a different name.
+ *
+ * Further, the files/directories handled by this function are
+ * assumed to be *not* deleted (and *only* those).
+ *
+ * With write support (s->qcow set) the function schedules rename,
+ * new-file and write-out commits as a side effect, and marks every
+ * cluster of the chain as USED_FILE in s->used_clusters.
+ *
+ * Returns the number of clusters in the chain.  0 is returned for the
+ * root directory (first cluster 0) and when a cluster of the chain is
+ * already marked as used.  Errors are returned as -1 / -2 converted to
+ * uint32_t: -1 if the chain leaves the valid cluster range or a sector
+ * cannot be read, -2 if a sector cannot be written to the qcow.
+ */
+static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
+	direntry_t* direntry, const char* path)
+{
+    /*
+     * This is a little bit tricky:
+     * IF the guest OS just inserts a cluster into the file chain,
+     * and leaves the rest alone, (i.e. the original file had clusters
+     * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens:
+     *
+     * - do_commit will write the cluster into the file at the given
+     *   offset, but
+     *
+     * - the cluster which is overwritten should be moved to a later
+     *   position in the file.
+     *
+     * I am not aware that any OS does something as braindead, but this
+     * situation could happen anyway when not committing for a long time.
+     * Just to be sure that this does not bite us, detect it, and copy the
+     * contents of the clusters to-be-overwritten into the qcow.
+     */
+    int copy_it = 0;
+    int was_modified = 0;
+    int32_t ret = 0;
+
+    uint32_t cluster_num = begin_of_direntry(direntry);
+    uint32_t offset = 0;   /* byte offset of cluster_num within the chain */
+    int first_mapping_index = -1;
+    mapping_t* mapping = NULL;
+    const char* basename2 = NULL;
+
+    vvfat_close_current_file(s);
+
+    /* the root directory */
+    if (cluster_num == 0)
+        return 0;
+
+    /* write support */
+    if (s->qcow) {
+        basename2 = get_basename(path);
+
+        mapping = find_mapping_for_cluster(s, cluster_num);
+
+        if (mapping) {
+            const char* basename;
+
+            /* still present: undo the "deleted" mark set by is_consistent() */
+            assert(mapping->mode & MODE_DELETED);
+            mapping->mode &= ~MODE_DELETED;
+
+            basename = get_basename(mapping->path);
+
+            assert(mapping->mode & MODE_NORMAL);
+
+            /* rename */
+            if (strcmp(basename, basename2))
+                schedule_rename(s, cluster_num, strdup(path));
+        } else if (is_file(direntry))
+            /* new file */
+            schedule_new_file(s, strdup(path), cluster_num);
+        else {
+            assert(0);
+            return 0;
+        }
+    }
+
+    while(1) {
+        if (s->qcow) {
+            if (!copy_it && cluster_was_modified(s, cluster_num)) {
+                /* refresh the mapping only when it no longer covers cluster_num */
+                if (mapping == NULL ||
+                        mapping->begin > cluster_num ||
+                        mapping->end <= cluster_num) {
+                    mapping = find_mapping_for_cluster(s, cluster_num);
+                }
+
+                if (mapping &&
+                        (mapping->mode & MODE_DIRECTORY) == 0) {
+
+                    /* was modified in qcow */
+                    if (offset != mapping->info.file.offset + s->cluster_size
+                            * (cluster_num - mapping->begin)) {
+                        /* offset of this cluster in file chain has changed */
+                        assert(0);
+                        copy_it = 1;
+                    } else if (offset == 0) {
+                        const char* basename = get_basename(mapping->path);
+
+                        if (strcmp(basename, basename2))
+                            copy_it = 1;
+                        first_mapping_index = array_index(&(s->mapping), mapping);
+                    }
+
+                    if (mapping->first_mapping_index != first_mapping_index
+                            && mapping->info.file.offset > 0) {
+                        assert(0);
+                        copy_it = 1;
+                    }
+
+                    /* need to write out? */
+                    if (!was_modified && is_file(direntry)) {
+                        was_modified = 1;
+                        schedule_writeout(s, mapping->dir_index, offset);
+                    }
+                }
+            }
+
+            if (copy_it) {
+                int i, dummy;
+                /*
+                 * This is horribly inefficient, but that is okay, since
+                 * it is rarely executed, if at all.
+                 */
+                int64_t first_sector = cluster2sector(s, cluster_num);
+
+                vvfat_close_current_file(s);
+                for (i = 0; i < s->sectors_per_cluster; i++) {
+                    /*
+                     * Copy exactly the sector that was found unallocated.
+                     * The allocation test already looked at
+                     * first_sector + i, so the read and the write must use
+                     * the same sector rather than always the first one.
+                     */
+                    int64_t sector = first_sector + i;
+
+                    if (!s->qcow->drv->bdrv_is_allocated(s->qcow,
+                                sector, 1, &dummy)) {
+                        if (vvfat_read(s->bs,
+                                    sector, s->cluster_buffer, 1))
+                            return -1;
+                        if (s->qcow->drv->bdrv_write(s->qcow,
+                                    sector, s->cluster_buffer, 1))
+                            return -2;
+                    }
+                }
+            }
+        }
+
+        ret++;
+        /* a cluster may belong to one chain only */
+        if (s->used_clusters[cluster_num] & USED_ANY)
+            return 0;
+        s->used_clusters[cluster_num] = USED_FILE;
+
+        cluster_num = modified_fat_get(s, cluster_num);
+
+        if (fat_eof(s, cluster_num))
+            return ret;
+        else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16)
+            return -1;
+
+        offset += s->cluster_size;
+    }
+}
+
+/*
+ * This function looks at the modified data (qcow).
+ * It returns 0 upon inconsistency or error, and the number of clusters
+ * used by the directory, its subdirectories and their files.
+ *
+ * cluster_num is the first cluster of the directory (0 for the root) and
+ * path its host path.  As a side effect the function schedules rename and
+ * mkdir commits, clears MODE_DELETED on the directory's mapping and marks
+ * the directory's clusters as USED_DIRECTORY.  It recurses into
+ * subdirectories.
+ */
+static int check_directory_consistency(BDRVVVFATState *s,
+	int cluster_num, const char* path)
+{
+    int ret = 0;
+    unsigned char* cluster = qemu_malloc(s->cluster_size);
+    direntry_t* direntries = (direntry_t*)cluster;
+    mapping_t* mapping = find_mapping_for_cluster(s, cluster_num);
+
+    long_file_name lfn;
+    int path_len = strlen(path);
+    char path2[PATH_MAX];
+
+    /* path2 is "path/" followed by the name of the entry being checked */
+    assert(path_len < PATH_MAX); /* len was tested before! */
+    pstrcpy(path2, sizeof(path2), path);
+    path2[path_len] = '/';
+    path2[path_len + 1] = '\0';
+
+    if (mapping) {
+        const char* basename = get_basename(mapping->path);
+        const char* basename2 = get_basename(path);
+
+        assert(mapping->mode & MODE_DIRECTORY);
+
+        /* still present: undo the "deleted" mark set by is_consistent() */
+        assert(mapping->mode & MODE_DELETED);
+        mapping->mode &= ~MODE_DELETED;
+
+        if (strcmp(basename, basename2))
+            schedule_rename(s, cluster_num, strdup(path));
+    } else
+        /* new directory */
+        schedule_mkdir(s, cluster_num, strdup(path));
+
+    lfn_init(&lfn);
+    do {
+        int i;
+        int subret = 0;
+
+        ret++;
+
+        if (s->used_clusters[cluster_num] & USED_ANY) {
+            fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num);
+            /* leave through the common exit so that the buffer is freed */
+            goto fail;
+        }
+        s->used_clusters[cluster_num] = USED_DIRECTORY;
+
+DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num)));
+        subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster,
+                s->sectors_per_cluster);
+        if (subret) {
+            fprintf(stderr, "Error fetching direntries\n");
+            goto fail;
+        }
+
+        /* 0x10 directory entries per sector */
+        for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) {
+            int cluster_count = 0;
+
+DLOG(fprintf(stderr, "check direntry %d: \n", i); print_direntry(direntries + i));
+            if (is_volume_label(direntries + i) || is_dot(direntries + i) ||
+                    is_free(direntries + i))
+                continue;
+
+            subret = parse_long_name(&lfn, direntries + i);
+            if (subret < 0) {
+                fprintf(stderr, "Error in long name\n");
+                goto fail;
+            }
+            /* a long-name part was consumed; the short entry follows later */
+            if (subret == 0 || is_free(direntries + i))
+                continue;
+
+            /* no matching long name collected: fall back to the 8.3 name */
+            if (fat_chksum(direntries+i) != lfn.checksum) {
+                subret = parse_short_name(s, &lfn, direntries + i);
+                if (subret < 0) {
+                    fprintf(stderr, "Error in short name (%d)\n", subret);
+                    goto fail;
+                }
+                if (subret > 0 || !strcmp((char*)lfn.name, ".")
+                        || !strcmp((char*)lfn.name, ".."))
+                    continue;
+            }
+            lfn.checksum = 0x100; /* cannot use long name twice */
+
+            if (path_len + 1 + lfn.len >= PATH_MAX) {
+                fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name);
+                goto fail;
+            }
+            pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1,
+                    (char*)lfn.name);
+
+            if (is_directory(direntries + i)) {
+                if (begin_of_direntry(direntries + i) == 0) {
+                    DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i));
+                    goto fail;
+                }
+                cluster_count = check_directory_consistency(s,
+                        begin_of_direntry(direntries + i), path2);
+                if (cluster_count == 0) {
+                    DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i));
+                    goto fail;
+                }
+            } else if (is_file(direntries + i)) {
+                /* check file size with FAT */
+                cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2);
+                if (cluster_count !=
+                        (le32_to_cpu(direntries[i].size) + s->cluster_size
+                         - 1) / s->cluster_size) {
+                    DLOG(fprintf(stderr, "Cluster count mismatch\n"));
+                    goto fail;
+                }
+            } else
+                assert(0); /* cluster_count = 0; */
+
+            ret += cluster_count;
+        }
+
+        cluster_num = modified_fat_get(s, cluster_num);
+    } while(!fat_eof(s, cluster_num));
+
+    free(cluster);
+    return ret;
+
+fail:
+    free(cluster);
+    return 0;
+}
+
+/*
+ * Check that the modified image (FAT plus directory tree) is consistent.
+ * Returns 0 on any inconsistency or error, otherwise the (non-zero) number
+ * of used clusters.
+ */
+static int is_consistent(BDRVVVFATState* s)
+{
+    int c;
+    int read_error;
+    int fat_count;
+    int used_clusters_count = 0;
+
+DLOG(checkpoint());
+    /*
+     * - get modified FAT
+     * - compare the two FATs (TODO)
+     * - get buffer for marking used clusters
+     * - recurse direntries from root (using bs->bdrv_read to make
+     *    sure to get the new data)
+     *   - check that the FAT agrees with the size
+     *   - count the number of clusters occupied by this directory and
+     *     its files
+     * - check that the cumulative used cluster count agrees with the
+     *   FAT
+     * - if all is fine, return number of used clusters
+     */
+    if (s->fat2 == NULL) {
+        int size = 0x200 * s->sectors_per_fat;
+        s->fat2 = qemu_malloc(size);
+        memcpy(s->fat2, s->fat.pointer, size);
+    }
+    read_error = vvfat_read(s->bs,
+            s->first_sectors_number, s->fat2, s->sectors_per_fat);
+    if (read_error) {
+        fprintf(stderr, "Could not copy fat\n");
+        return 0;
+    }
+
+    /* forget the file/directory marks of the previous run */
+    assert (s->used_clusters);
+    for (c = 0; c < sector2cluster(s, s->sector_count); c++) {
+        s->used_clusters[c] &= ~USED_ANY;
+    }
+
+    clear_commits(s);
+
+    /* mark every mapped file/directory as deleted.
+     * (check_directory_consistency() will unmark those still present). */
+    if (s->qcow) {
+        for (c = 0; c < s->mapping.next; c++) {
+            mapping_t* mapping = array_get(&(s->mapping), c);
+            if (mapping->first_mapping_index < 0) {
+                mapping->mode |= MODE_DELETED;
+            }
+        }
+    }
+
+    used_clusters_count = check_directory_consistency(s, 0, s->path);
+    if (used_clusters_count <= 0) {
+        DLOG(fprintf(stderr, "problem in directory\n"));
+        return 0;
+    }
+
+    /* count the FAT entries in use and compare with what the walk found */
+    fat_count = s->last_cluster_of_root_directory;
+    for (c = fat_count; c < sector2cluster(s, s->sector_count); c++) {
+        if (modified_fat_get(s, c)) {
+            if (!s->used_clusters[c]) {
+                DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", c));
+                return 0;
+            }
+            fat_count++;
+        }
+
+        if (s->used_clusters[c] == USED_ALLOCATED) {
+            /* allocated, but not used... */
+            DLOG(fprintf(stderr, "unused, modified cluster: %d\n", c));
+            return 0;
+        }
+    }
+
+    if (fat_count != used_clusters_count) {
+        return 0;
+    }
+
+    return used_clusters_count;
+}
+
+/*
+ * Add adjust to every stored mapping index that is >= offset: the
+ * first_mapping_index of each mapping and, for directories, the
+ * parent_mapping_index.  Used after inserting into or removing from
+ * s->mapping.
+ */
+static inline void adjust_mapping_indices(BDRVVVFATState* s,
+	int offset, int adjust)
+{
+    int n = 0;
+
+#define ADJUST_MAPPING_INDEX(name) \
+	if (mapping->name >= offset) \
+	    mapping->name += adjust
+
+    while (n < s->mapping.next) {
+        mapping_t* mapping = array_get(&(s->mapping), n);
+
+        ADJUST_MAPPING_INDEX(first_mapping_index);
+        if (mapping->mode & MODE_DIRECTORY) {
+            ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index);
+        }
+        n++;
+    }
+}
+
+/* insert or update mapping
+ *
+ * Makes sure that s->mapping contains a mapping covering the clusters
+ * begin..end-1 and returns it.  A mapping that starts before begin is cut
+ * off at begin; a mapping that already starts exactly at begin is reused,
+ * otherwise a new one (with path == NULL) is inserted and all stored
+ * mapping indices are shifted.
+ */
+static mapping_t* insert_mapping(BDRVVVFATState* s,
+	uint32_t begin, uint32_t end)
+{
+    /*
+     * - find mapping where mapping->begin >= begin,
+     * - if mapping->begin > begin: insert
+     *   - adjust all references to mappings!
+     * - else: adjust
+     * - replace name
+     */
+    int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next);
+    mapping_t* mapping = NULL;
+    mapping_t* first_mapping = array_get(&(s->mapping), 0); /* array base before any insertion, see the re-basing below */
+
+    if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index))
+	    && mapping->begin < begin) {
+	mapping->end = begin; /* truncate the mapping that starts before begin */
+	index++;
+	mapping = array_get(&(s->mapping), index);
+    }
+    if (index >= s->mapping.next || mapping->begin > begin) {
+	mapping = array_insert(&(s->mapping), index, 1);
+	mapping->path = NULL;
+	adjust_mapping_indices(s, index, +1); /* indices >= index now refer to the next slot */
+    }
+
+    mapping->begin = begin;
+    mapping->end = end;
+
+DLOG(mapping_t* next_mapping;
+assert(index + 1 >= s->mapping.next ||
+((next_mapping = array_get(&(s->mapping), index + 1)) &&
+ next_mapping->begin >= end)));
+
+    /* the array storage moved: re-base the cached current_mapping pointer */
+    if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer)
+	s->current_mapping = array_get(&(s->mapping),
+		s->current_mapping - first_mapping);
+
+    return mapping;
+}
+
+/*
+ * Remove the mapping at mapping_index from s->mapping.
+ * Its path is freed only when first_mapping_index is negative.  All stored
+ * mapping indices are shifted, and s->current_mapping is re-based if the
+ * array storage moved.  Always returns 0.
+ */
+static int remove_mapping(BDRVVVFATState* s, int mapping_index)
+{
+    mapping_t* victim = array_get(&(s->mapping), mapping_index);
+    mapping_t* old_base = array_get(&(s->mapping), 0);
+
+    /* free mapping */
+    if (victim->first_mapping_index < 0) {
+        free(victim->path);
+    }
+
+    /* remove from s->mapping */
+    array_remove(&(s->mapping), mapping_index);
+
+    /* adjust all references to mappings */
+    adjust_mapping_indices(s, mapping_index, -1);
+
+    if (s->current_mapping && old_base != (mapping_t*)s->mapping.pointer) {
+        s->current_mapping = array_get(&(s->mapping),
+                s->current_mapping - old_base);
+    }
+
+    return 0;
+}
+
+/*
+ * Add adjust to every stored directory index that is >= offset: the
+ * dir_index of each mapping and, for directories, the first_dir_index.
+ */
+static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust)
+{
+    int n;
+
+    for (n = 0; n < s->mapping.next; n++) {
+        mapping_t* m = array_get(&(s->mapping), n);
+
+        if (m->dir_index >= offset) {
+            m->dir_index += adjust;
+        }
+        if (m->mode & MODE_DIRECTORY) {
+            if (m->info.dir.first_dir_index >= offset) {
+                m->info.dir.first_dir_index += adjust;
+            }
+        }
+    }
+}
+
+/*
+ * Make room for count entries at dir_index in s->directory and shift the
+ * directory indices stored in the mappings accordingly.
+ * Returns the first new entry, or NULL if array_insert() fails.
+ */
+static direntry_t* insert_direntries(BDRVVVFATState* s,
+	int dir_index, int count)
+{
+    direntry_t* slot = array_insert(&(s->directory), dir_index, count);
+
+    if (slot != NULL) {
+        adjust_dirindices(s, dir_index, count);
+    }
+    return slot;
+}
+
+/*
+ * Remove count entries starting at dir_index from s->directory and shift
+ * the directory indices stored in the mappings accordingly.
+ * Returns 0 on success, or the non-zero result of array_remove_slice().
+ */
+static int remove_direntries(BDRVVVFATState* s, int dir_index, int count)
+{
+    int err = array_remove_slice(&(s->directory), dir_index, count);
+
+    if (err == 0) {
+        adjust_dirindices(s, dir_index, -count);
+    }
+    return err;
+}
+
+/*
+ * Adapt the mappings of the cluster chain starting at first cluster
+ * (i.e. if a file starts at first_cluster, the chain is followed according
+ * to the modified fat, and the corresponding entries in s->mapping are
+ * adjusted)
+ *
+ * The chain is processed one run of consecutive clusters at a time: the
+ * current mapping is stretched (or shrunk) to cover the run, and a
+ * follow-up mapping is looked up or inserted for the cluster at which the
+ * chain continues.  A mapping that starts exactly at first_cluster must
+ * already exist (asserted).  Always returns 0.
+ */
+static int commit_mappings(BDRVVVFATState* s,
+	uint32_t first_cluster, int dir_index)
+{
+    mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t cluster = first_cluster;
+
+    vvfat_close_current_file(s);
+
+    assert(mapping);
+    assert(mapping->begin == first_cluster);
+    mapping->first_mapping_index = -1; /* this is the first mapping of the chain */
+    mapping->dir_index = dir_index;
+    mapping->mode = (dir_index <= 0 || is_directory(direntry)) ?
+	MODE_DIRECTORY : MODE_NORMAL;
+
+    while (!fat_eof(s, cluster)) {
+	uint32_t c, c1;
+
+	/* advance c to the last cluster of the consecutive run starting at
+	 * cluster; c1 is the FAT successor of c */
+	for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1;
+		c = c1, c1 = modified_fat_get(s, c1));
+
+	c++; /* c is now one past the run, i.e. the new exclusive end */
+	if (c > mapping->end) {
+	    int index = array_index(&(s->mapping), mapping);
+	    int i, max_i = s->mapping.next - index;
+	    /* count the following mappings that begin inside the grown range ... */
+	    for (i = 1; i < max_i && mapping[i].begin < c; i++);
+	    /* ... and remove them; the next one always slides to index + 1 */
+	    while (--i > 0)
+		remove_mapping(s, index + 1);
+	}
+	assert(mapping == array_get(&(s->mapping), s->mapping.next - 1)
+		|| mapping[1].begin >= c);
+	mapping->end = c;
+
+	if (!fat_eof(s, c1)) {
+	    int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next);
+	    mapping_t* next_mapping = i >= s->mapping.next ? NULL :
+		array_get(&(s->mapping), i);
+
+	    if (next_mapping == NULL || next_mapping->begin > c1) {
+		int i1 = array_index(&(s->mapping), mapping); /* remember the position: insert_mapping() may move the array */
+
+		next_mapping = insert_mapping(s, c1, c1+1);
+
+		if (c1 < c) /* presumably the new mapping was inserted before the current one, shifting it by one */
+		    i1++;
+		mapping = array_get(&(s->mapping), i1);
+	    }
+
+	    /* the follow-up mapping inherits the identity of the current one */
+	    next_mapping->dir_index = mapping->dir_index;
+	    next_mapping->first_mapping_index =
+		mapping->first_mapping_index < 0 ?
+		array_index(&(s->mapping), mapping) :
+		mapping->first_mapping_index;
+	    next_mapping->path = mapping->path;
+	    next_mapping->mode = mapping->mode;
+	    next_mapping->read_only = mapping->read_only;
+	    if (mapping->mode & MODE_DIRECTORY) {
+		next_mapping->info.dir.parent_mapping_index =
+			mapping->info.dir.parent_mapping_index;
+		next_mapping->info.dir.first_dir_index =
+			mapping->info.dir.first_dir_index +
+			0x10 * s->sectors_per_cluster *
+			(mapping->end - mapping->begin);
+	    } else
+		/* NOTE(review): advanced by a number of clusters here, whereas
+		 * read_cluster() adds info.file.offset to a byte offset --
+		 * confirm the intended unit */
+		next_mapping->info.file.offset = mapping->info.file.offset +
+			mapping->end - mapping->begin;
+
+	    mapping = next_mapping;
+	}
+
+	cluster = c1;
+    }
+
+    return 0;
+}
+
+/*
+ * Re-read one directory through vvfat_read() into s->directory, resize its
+ * slice of s->directory to the new cluster count, commit its mappings and
+ * then descend into every subdirectory it contains.
+ *
+ * dir_index is the index in s->directory of the entry describing the
+ * directory; 0 selects the directory that starts at cluster 0 (the root).
+ * parent_mapping_index is recorded in the directory's mapping.
+ *
+ * Returns 0 on success, -1 if s->directory could not be grown, or the
+ * non-zero result of vvfat_read(), commit_mappings() or a recursive call.
+ */
+static int commit_direntries(BDRVVVFATState* s,
+        int dir_index, int parent_mapping_index)
+{
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry);
+    mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+
+    /* number of s->directory slots that one cluster accounts for */
+    int factor = 0x10 * s->sectors_per_cluster;
+    int old_cluster_count, new_cluster_count;
+    int current_dir_index;
+    int first_dir_index;
+    int ret, i;
+    uint32_t c;
+
+    /* check the lookups before mapping is dereferenced for the first time */
+    assert(direntry);
+    assert(mapping);
+    assert(mapping->begin == first_cluster);
+    assert(mapping->info.dir.first_dir_index < s->directory.next);
+    assert(mapping->mode & MODE_DIRECTORY);
+    assert(dir_index == 0 || is_directory(direntry));
+
+DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index));
+
+    current_dir_index = mapping->info.dir.first_dir_index;
+    first_dir_index = current_dir_index;
+
+    mapping->info.dir.parent_mapping_index = parent_mapping_index;
+
+    if (first_cluster == 0) {
+        old_cluster_count = new_cluster_count =
+            s->last_cluster_of_root_directory;
+    } else {
+        /* chain length according to fat_get() ... */
+        for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+                c = fat_get(s, c))
+            old_cluster_count++;
+
+        /* ... and according to modified_fat_get() */
+        for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+                c = modified_fat_get(s, c))
+            new_cluster_count++;
+    }
+
+    /* grow or shrink this directory's slice of s->directory */
+    if (new_cluster_count > old_cluster_count) {
+        if (insert_direntries(s,
+                current_dir_index + factor * old_cluster_count,
+                factor * (new_cluster_count - old_cluster_count)) == NULL)
+            return -1;
+    } else if (new_cluster_count < old_cluster_count)
+        remove_direntries(s,
+                current_dir_index + factor * new_cluster_count,
+                factor * (old_cluster_count - new_cluster_count));
+
+    /* refill the slice cluster by cluster, following the modified chain */
+    for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) {
+        void* dst = array_get(&(s->directory), current_dir_index);
+
+        ret = vvfat_read(s->bs, cluster2sector(s, c), dst,
+                s->sectors_per_cluster);
+        if (ret)
+            return ret;
+        assert(!strncmp(s->directory.pointer, "QEMU", 4));
+        current_dir_index += factor;
+    }
+
+    ret = commit_mappings(s, first_cluster, dir_index);
+    if (ret)
+        return ret;
+
+    /* recurse */
+    for (i = 0; i < factor * new_cluster_count; i++) {
+        /* both pointers are looked up again on every pass */
+        direntry = array_get(&(s->directory), first_dir_index + i);
+        if (is_directory(direntry) && !is_dot(direntry)) {
+            mapping = find_mapping_for_cluster(s, first_cluster);
+            assert(mapping->mode & MODE_DIRECTORY);
+            ret = commit_direntries(s, first_dir_index + i,
+                array_index(&(s->mapping), mapping));
+            if (ret)
+                return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Commit one file: copy its contents, from byte `offset` up to the size
+ * recorded in the directory entry at dir_index, from the image (read with
+ * vvfat_read()) into the host file named by the file's mapping, truncate
+ * the host file to that size and then update the mappings.
+ *
+ * offset must be a multiple of s->cluster_size and smaller than the size.
+ *
+ * Returns the result of commit_mappings() on success.  Errors: -1 if the
+ * first cluster has no mapping, the negative open() result, -3 if seeking
+ * fails, the negative vvfat_read() result, -2 if writing fails, -4 if the
+ * final truncation fails.
+ */
+static int commit_one_file(BDRVVVFATState* s,
+        int dir_index, uint32_t offset)
+{
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t c = begin_of_direntry(direntry);
+    uint32_t first_cluster = c;
+    mapping_t* mapping = find_mapping_for_cluster(s, c);
+    uint32_t size = filesize_of_direntry(direntry);
+    char* cluster = NULL;
+    uint32_t i;
+    int fd;
+    int ret;
+
+    assert(offset < size);
+    assert((offset % s->cluster_size) == 0);
+
+    if (mapping == NULL)
+        return -1;
+
+    /*
+     * Walk the chain to the cluster that holds byte `offset`: that takes
+     * offset / cluster_size hops, so the counter has to start at 0.
+     */
+    for (i = 0; i < offset; i += s->cluster_size)
+        c = modified_fat_get(s, c);
+
+    fd = open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666);
+    if (fd < 0) {
+        fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path,
+                strerror(errno), errno);
+        return fd;
+    }
+    if (offset > 0 && lseek(fd, offset, SEEK_SET) != offset) {
+        ret = -3;
+        goto fail;
+    }
+
+    cluster = qemu_malloc(s->cluster_size);
+
+    while (offset < size) {
+        uint32_t c1;
+        int rest_size = (size - offset > s->cluster_size ?
+                s->cluster_size : size - offset);
+
+        /* fetch the successor before this cluster is consumed */
+        c1 = modified_fat_get(s, c);
+
+        assert((size - offset == 0 && fat_eof(s, c)) ||
+                (size > offset && c >=2 && !fat_eof(s, c)));
+
+        /* read whole 0x200-byte sectors, write only the bytes that count */
+        ret = vvfat_read(s->bs, cluster2sector(s, c),
+            (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200);
+        if (ret < 0)
+            goto fail;
+
+        if (write(fd, cluster, rest_size) < 0) {
+            ret = -2;
+            goto fail;
+        }
+
+        offset += rest_size;
+        c = c1;
+    }
+
+    if (ftruncate(fd, size) < 0) {
+        ret = -4;
+        goto fail;
+    }
+
+    close(fd);
+    /* this file already releases qemu_malloc()ed memory with free() */
+    free(cluster);
+
+    return commit_mappings(s, first_cluster, dir_index);
+
+fail:
+    close(fd);
+    free(cluster);
+    return ret;
+}
+
+#ifdef DEBUG
+/*
+ * Debug check: every mapping that is not marked deleted must refer to an
+ * existing directory entry, and directory mappings must fit into, and be
+ * cluster-aligned within, s->directory.
+ */
+static void check1(BDRVVVFATState* s)
+{
+    int idx;
+
+    for (idx = 0; idx < s->mapping.next; idx++) {
+        mapping_t* m = array_get(&(s->mapping), idx);
+        direntry_t* entry;
+
+        if (m->mode & MODE_DELETED) {
+            fprintf(stderr, "deleted\n");
+            continue;
+        }
+
+        assert(m->dir_index >= 0);
+        assert(m->dir_index < s->directory.next);
+
+        entry = array_get(&(s->directory), m->dir_index);
+        assert(m->begin == begin_of_direntry(entry) || m->first_mapping_index >= 0);
+
+        if (!(m->mode & MODE_DIRECTORY))
+            continue;
+
+        assert(m->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (m->end - m->begin) <= s->directory.next);
+        assert((m->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0);
+    }
+}
+
+/*
+ * Debug check: every short-name directory entry with a non-zero first
+ * cluster must have a mapping, and each cluster-sized slice of s->directory
+ * must be claimed by at most one directory mapping whose parent link is
+ * plausible.
+ */
+static void check2(BDRVVVFATState* s)
+{
+    int i;
+    int first_mapping = -1;
+
+    for (i = 0; i < s->directory.next; i++) {
+        direntry_t* direntry = array_get(&(s->directory), i);
+
+        if (is_short_name(direntry) && begin_of_direntry(direntry)) {
+            mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry));
+            assert(mapping);
+            assert(mapping->dir_index == i || is_dot(direntry));
+            assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry));
+        }
+
+        if ((i % (0x10 * s->sectors_per_cluster)) == 0) {
+            /* cluster start */
+            int j, count = 0;
+
+            for (j = 0; j < s->mapping.next; j++) {
+                mapping_t* mapping = array_get(&(s->mapping), j);
+
+                if (mapping->mode & MODE_DELETED)
+                    continue;
+                if (!(mapping->mode & MODE_DIRECTORY))
+                    continue;
+                if (!(mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i))
+                    continue;
+
+                /*
+                 * Count outside of assert(): with NDEBUG the macro drops
+                 * its argument, and the increment would be lost with it.
+                 */
+                count++;
+                assert(count == 1);
+
+                if (mapping->first_mapping_index == -1) {
+                    first_mapping = array_index(&(s->mapping), mapping);
+                } else {
+                    assert(first_mapping == mapping->first_mapping_index);
+                }
+
+                if (mapping->info.dir.parent_mapping_index < 0) {
+                    /* only mapping 0 may lack a parent */
+                    assert(j == 0);
+                } else {
+                    mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index);
+                    assert(parent->mode & MODE_DIRECTORY);
+                    assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index);
+                }
+            }
+            if (count == 0)
+                first_mapping = -1;
+        }
+    }
+}
+#endif
+
+/*
+ * Process every ACTION_RENAME and ACTION_MKDIR entry of s->commits and
+ * remove it from the queue; entries with other actions are left in place.
+ *
+ * Renaming a directory schedules a rename for each file/subdirectory entry
+ * found in it, so the new entries are picked up by this same loop.
+ *
+ * Returns 0 on success, -2 if rename() fails, -5 if mkdir() fails and -6 if
+ * the mapping or the directory entries for a new directory cannot be
+ * allocated.
+ */
+static int handle_renames_and_mkdirs(BDRVVVFATState* s)
+{
+    int i;
+
+#ifdef DEBUG
+    fprintf(stderr, "handle_renames\n");
+    for (i = 0; i < s->commits.next; i++) {
+        commit_t* commit = array_get(&(s->commits), i);
+        fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action);
+    }
+#endif
+
+    /* i only advances when the current entry is kept in the queue */
+    for (i = 0; i < s->commits.next;) {
+        commit_t* commit = array_get(&(s->commits), i);
+        if (commit->action == ACTION_RENAME) {
+            mapping_t* mapping = find_mapping_for_cluster(s,
+                    commit->param.rename.cluster);
+            char* old_path;
+
+            assert(mapping);
+            assert(commit->path);
+            old_path = mapping->path;
+
+            /*
+             * Rename on the host first; the mapping adopts the new path
+             * only once that worked, so a failure leaves it pointing at
+             * the path that still exists.
+             */
+            if (rename(old_path, commit->path))
+                return -2;
+            mapping->path = commit->path;
+
+            if (mapping->mode & MODE_DIRECTORY) {
+                int l1 = strlen(mapping->path);
+                int l2 = strlen(old_path);
+                int diff = l1 - l2;
+                direntry_t* direntry = array_get(&(s->directory),
+                        mapping->info.dir.first_dir_index);
+                uint32_t c = mapping->begin;
+                int k = 0;
+
+                /*
+                 * recurse
+                 * NOTE(review): schedule_rename() presumably appends to
+                 * s->commits, so `commit` is not used from here on.
+                 */
+                while (!fat_eof(s, c)) {
+                    do {
+                        direntry_t* d = direntry + k;
+
+                        if (is_file(d) || (is_directory(d) && !is_dot(d))) {
+                            mapping_t* m = find_mapping_for_cluster(s,
+                                    begin_of_direntry(d));
+                            int l;
+                            char* new_path;
+
+                            assert(m);
+                            l = strlen(m->path);
+                            new_path = qemu_malloc(l + diff + 1);
+
+                            /* the child must still carry the old prefix */
+                            assert(!strncmp(m->path, old_path, l2));
+
+                            /* new parent path + child's part after it */
+                            pstrcpy(new_path, l + diff + 1, mapping->path);
+                            pstrcpy(new_path + l1, l + diff + 1 - l1,
+                                    m->path + l2);
+
+                            schedule_rename(s, m->begin, new_path);
+                        }
+                        k++;
+                    } while((k % (0x10 * s->sectors_per_cluster)) != 0);
+                    c = fat_get(s, c);
+                }
+            }
+
+            free(old_path);
+            array_remove(&(s->commits), i);
+            continue;
+        } else if (commit->action == ACTION_MKDIR) {
+            mapping_t* mapping;
+            int j, parent_path_len;
+            int first_new_index;
+
+#ifdef __MINGW32__
+            if (mkdir(commit->path))
+                return -5;
+#else
+            if (mkdir(commit->path, 0755))
+                return -5;
+#endif
+
+            mapping = insert_mapping(s, commit->param.mkdir.cluster,
+                    commit->param.mkdir.cluster + 1);
+            if (mapping == NULL)
+                return -6;
+
+            mapping->mode = MODE_DIRECTORY;
+            mapping->read_only = 0;
+            mapping->path = commit->path;
+
+            /* the new directory's entries go to the end of s->directory */
+            first_new_index = s->directory.next;
+            assert(first_new_index);
+            if (insert_direntries(s, s->directory.next,
+                        0x10 * s->sectors_per_cluster) == NULL)
+                return -6;
+            mapping->info.dir.first_dir_index = first_new_index;
+
+            /* the parent is the first mapping whose path is exactly the
+               new path minus "/<basename>" */
+            parent_path_len = strlen(commit->path)
+                - strlen(get_basename(commit->path)) - 1;
+            for (j = 0; j < s->mapping.next; j++) {
+                mapping_t* m = array_get(&(s->mapping), j);
+                if (m->first_mapping_index < 0 && m != mapping &&
+                        !strncmp(m->path, mapping->path, parent_path_len) &&
+                        strlen(m->path) == parent_path_len)
+                    break;
+            }
+            assert(j < s->mapping.next);
+            mapping->info.dir.parent_mapping_index = j;
+
+            array_remove(&(s->commits), i);
+            continue;
+        }
+
+        i++;
+    }
+    return 0;
+}
+
+/*
+ * Execute the queued ACTION_WRITEOUT and ACTION_NEW_FILE commits in order,
+ * stopping at the first failure.  Every commit that was visited, including
+ * a failing one, is then removed from s->commits.
+ *
+ * Returns 0 on success, -1 if the queue cannot be trimmed, otherwise the
+ * failure code: -2 unexpected rename/mkdir entry, -3 write-out failed,
+ * -6 no directory entry for a new file, -7 writing a new file failed,
+ * -8 no mapping could be allocated for a new file.
+ *
+ * TODO: make sure that the short name is not matching *another* file
+ */
+static int handle_commits(BDRVVVFATState* s)
+{
+    int i, fail = 0;
+
+    vvfat_close_current_file(s);
+
+    for (i = 0; !fail && i < s->commits.next; i++) {
+        commit_t* commit = array_get(&(s->commits), i);
+        switch(commit->action) {
+        case ACTION_RENAME: case ACTION_MKDIR:
+            /* these are expected to be gone before this function runs */
+            assert(0);
+            fail = -2;
+            break;
+        case ACTION_WRITEOUT: {
+            direntry_t* entry = array_get(&(s->directory),
+                    commit->param.writeout.dir_index);
+            uint32_t begin = begin_of_direntry(entry);
+            mapping_t* mapping = find_mapping_for_cluster(s, begin);
+
+            assert(mapping);
+            assert(mapping->begin == begin);
+            assert(commit->path == NULL);
+
+            if (commit_one_file(s, commit->param.writeout.dir_index,
+                        commit->param.writeout.modified_offset))
+                fail = -3;
+
+            break;
+        }
+        case ACTION_NEW_FILE: {
+            int begin = commit->param.new_file.first_cluster;
+            mapping_t* mapping = find_mapping_for_cluster(s, begin);
+            direntry_t* entry;
+            int dir_idx;
+
+            /* find direntry */
+            for (dir_idx = 0; dir_idx < s->directory.next; dir_idx++) {
+                entry = array_get(&(s->directory), dir_idx);
+                if (is_file(entry) && begin_of_direntry(entry) == begin)
+                    break;
+            }
+
+            if (dir_idx >= s->directory.next) {
+                fail = -6;
+                break;
+            }
+
+            /* make sure there exists an initial mapping */
+            if (mapping && mapping->begin != begin) {
+                /* cut the covering mapping short at the new file */
+                mapping->end = begin;
+                mapping = NULL;
+            }
+            if (mapping == NULL) {
+                mapping = insert_mapping(s, begin, begin+1);
+                if (mapping == NULL) {
+                    fail = -8;
+                    break;
+                }
+            }
+            /* most members will be fixed in commit_mappings() */
+            assert(commit->path);
+            mapping->path = commit->path;
+            mapping->read_only = 0;
+            mapping->mode = MODE_NORMAL;
+            mapping->info.file.offset = 0;
+
+            if (commit_one_file(s, dir_idx, 0))
+                fail = -7;
+
+            break;
+        }
+        default:
+            assert(0);
+        }
+    }
+    if (i > 0 && array_remove_slice(&(s->commits), 0, i))
+        return -1;
+    return fail;
+}
+
+/*
+ * Act on every mapping flagged MODE_DELETED (mapping 0 is never looked at)
+ * and drop it from the mapping table.
+ *
+ * A directory that rmdir() reports as not empty is left for a later pass;
+ * passes repeat for as long as the previous one both postponed and deleted
+ * something.
+ *
+ * Returns 0 on success, -4 if unlink() fails, -5 if rmdir() fails for any
+ * reason other than ENOTEMPTY.
+ */
+static int handle_deletes(BDRVVVFATState* s)
+{
+    int deferred = 1;
+    int deleted = 1;
+    int idx;
+
+    while (deferred && deleted) {
+        deferred = 0;
+        deleted = 0;
+
+        for (idx = 1; idx < s->mapping.next; idx++) {
+            mapping_t* victim = array_get(&(s->mapping), idx);
+            direntry_t* entry;
+
+            if (!(victim->mode & MODE_DELETED))
+                continue;
+
+            entry = array_get(&(s->directory), victim->dir_index);
+
+            if (!is_free(entry)) {
+                if (unlink(victim->path))
+                    return -4;
+                deleted++;
+            } else if (victim->mode & MODE_DIRECTORY) {
+                int first_dir_index = victim->info.dir.first_dir_index;
+                int next_dir_index = s->directory.next;
+                int j;
+
+                if (rmdir(victim->path) < 0) {
+                    if (errno != ENOTEMPTY)
+                        return -5;
+                    /* not empty yet: try again on a later pass */
+                    deferred++;
+                    continue;
+                }
+
+                /* the slice to drop ends where the closest following
+                   directory mapping begins */
+                for (j = 1; j < s->mapping.next; j++) {
+                    mapping_t* other = array_get(&(s->mapping), j);
+
+                    if (!(other->mode & MODE_DIRECTORY))
+                        continue;
+                    if (other->info.dir.first_dir_index > first_dir_index &&
+                            other->info.dir.first_dir_index < next_dir_index)
+                        next_dir_index = other->info.dir.first_dir_index;
+                }
+                remove_direntries(s, first_dir_index,
+                        next_dir_index - first_dir_index);
+
+                deleted++;
+            }
+
+            DLOG(fprintf(stderr, "DELETE (%d)\n", idx); print_mapping(victim); print_direntry(entry));
+            remove_mapping(s, idx);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Carry out all pending commits:
+ *
+ * - handle renames and mkdirs
+ * - copy s->fat2 over the FAT array
+ * - walk the directory tree from the root (commit_direntries)
+ * - write out files (handle_commits)
+ * - remove what is marked as deleted (handle_deletes)
+ * - empty the qcow image and clear s->used_clusters
+ *
+ * Returns 0 on success or when nothing is queued; otherwise the error code
+ * of the step that failed, after printing a message and asserting.
+ */
+static int do_commit(BDRVVVFATState* s)
+{
+    int rc;
+
+    /* no queued commits means there is nothing to do */
+    if (s->commits.next == 0)
+        return 0;
+
+    vvfat_close_current_file(s);
+
+    rc = handle_renames_and_mkdirs(s);
+    if (rc != 0) {
+        fprintf(stderr, "Error handling renames (%d)\n", rc);
+        assert(0);
+        return rc;
+    }
+
+    /* replace the FAT array contents with s->fat2 */
+    memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat);
+
+    /* start at directory index 0; it has no parent mapping (-1) */
+    rc = commit_direntries(s, 0, -1);
+    if (rc != 0) {
+        fprintf(stderr, "Fatal: error while committing (%d)\n", rc);
+        assert(0);
+        return rc;
+    }
+
+    rc = handle_commits(s);
+    if (rc != 0) {
+        fprintf(stderr, "Error handling commits (%d)\n", rc);
+        assert(0);
+        return rc;
+    }
+
+    rc = handle_deletes(s);
+    if (rc != 0) {
+        fprintf(stderr, "Error deleting\n");
+        assert(0);
+        return rc;
+    }
+
+    s->qcow->drv->bdrv_make_empty(s->qcow);
+
+    memset(s->used_clusters, 0, sector2cluster(s, s->sector_count));
+
+DLOG(checkpoint());
+    return 0;
+}
+
+/*
+ * Commit the pending changes if is_consistent() accepts the current state.
+ * Returns -1 when it does not, otherwise the result of do_commit().
+ */
+static int try_commit(BDRVVVFATState* s)
+{
+    vvfat_close_current_file(s);
+DLOG(checkpoint());
+    return is_consistent(s) ? do_commit(s) : -1;
+}
+
+/*
+ * Write callback of the vvfat driver.
+ *
+ * The request is refused with -1 if it starts before
+ * s->first_sectors_number, touches a mapping flagged read_only, contains a
+ * directory entry that parse_long_name() rejects, or changes a directory
+ * entry that has attribute bit 0 set.  Otherwise the data goes to the qcow
+ * image, the touched clusters are flagged USED_ALLOCATED and a commit is
+ * attempted.
+ *
+ * Returns 0 on success, -1 on a refused request, or the negative result of
+ * the qcow write.  The result of the commit attempt is not reported.
+ */
+static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
+                    const uint8_t *buf, int nb_sectors)
+{
+    BDRVVVFATState *s = bs->opaque;
+    int first_cluster, last_cluster;
+    int i, ret;
+
+DLOG(checkpoint());
+
+    vvfat_close_current_file(s);
+
+    /*
+     * Some sanity checks:
+     * - do not allow writing to the boot sector
+     * - do not allow to write non-ASCII filenames
+     */
+
+    if (sector_num < s->first_sectors_number)
+        return -1;
+
+    first_cluster = sector2cluster(s, sector_num);
+    last_cluster = sector2cluster(s, sector_num + nb_sectors - 1);
+
+    for (i = first_cluster; i <= last_cluster;) {
+        mapping_t* mapping = NULL;
+
+        /*
+         * The cluster index can be negative (the loop further down guards
+         * against that too); such an index cannot have a mapping, so the
+         * lookup is skipped for it.
+         */
+        if (i >= 0)
+            mapping = find_mapping_for_cluster(s, i);
+
+        if (mapping == NULL) {
+            i++;
+            continue;
+        }
+
+        if (mapping->read_only) {
+            fprintf(stderr, "Tried to write to write-protected file %s\n",
+                    mapping->path);
+            return -1;
+        }
+
+        if (mapping->mode & MODE_DIRECTORY) {
+            int begin = cluster2sector(s, i);
+            int end = begin + s->sectors_per_cluster;
+            int k;
+            int dir_index;
+            const direntry_t* direntries;
+            long_file_name lfn;
+
+            lfn_init(&lfn);
+
+            /* clip the cluster's sector range to the request */
+            if (begin < sector_num)
+                begin = sector_num;
+            if (end > sector_num + nb_sectors)
+                end = sector_num + nb_sectors;
+            dir_index  = mapping->dir_index +
+                0x10 * (begin - mapping->begin * s->sectors_per_cluster);
+            direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num));
+
+            for (k = 0; k < (end - begin) * 0x10; k++) {
+                /* do not allow non-ASCII filenames */
+                if (parse_long_name(&lfn, direntries + k) < 0) {
+                    fprintf(stderr, "Warning: non-ASCII filename\n");
+                    return -1;
+                }
+                /* no access to the direntry of a read-only file */
+                else if (is_short_name(direntries+k) &&
+                        (direntries[k].attributes & 1)) {
+                    if (memcmp(direntries + k,
+                                array_get(&(s->directory), dir_index + k),
+                                sizeof(direntry_t))) {
+                        fprintf(stderr, "Warning: tried to write to write-protected file\n");
+                        return -1;
+                    }
+                }
+            }
+        }
+        /* continue behind this mapping */
+        i = mapping->end;
+    }
+
+    /*
+     * Use qcow backend. Commit later.
+     */
+DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors));
+    ret = s->qcow->drv->bdrv_write(s->qcow, sector_num, buf, nb_sectors);
+    if (ret < 0) {
+        fprintf(stderr, "Error writing to qcow backend\n");
+        return ret;
+    }
+
+    for (i = first_cluster; i <= last_cluster; i++)
+        if (i >= 0)
+            s->used_clusters[i] |= USED_ALLOCATED;
+
+DLOG(checkpoint());
+    /* TODO: add timeout */
+    try_commit(s);
+
+DLOG(checkpoint());
+    return 0;
+}
+
+/*
+ * Store in *n the number of sectors between sector_num and
+ * s->sector_count, capped at nb_sectors.  Returns 0 if that number is
+ * negative (*n is then left negative), 1 otherwise.
+ */
+static int vvfat_is_allocated(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, int* n)
+{
+    BDRVVVFATState* s = bs->opaque;
+    int remaining = s->sector_count - sector_num;
+
+    *n = remaining;
+    if (remaining > nb_sectors) {
+        *n = nb_sectors;
+        return 1;
+    }
+    return remaining < 0 ? 0 : 1;
+}
+
+/*
+ * bdrv_write hook of vvfat_write_target: sector_num, buffer and nb_sectors
+ * are not used; the call only triggers try_commit() on the vvfat state
+ * stored in bs->opaque and returns its result.
+ */
+static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
+        const uint8_t* buffer, int nb_sectors)
+{
+    BDRVVVFATState* state = bs->opaque;
+
+    return try_commit(state);
+}
+
+/*
+ * bdrv_close hook of vvfat_write_target: deletes the qcow block device and
+ * releases the buffer holding its file name.
+ */
+static void write_target_close(BlockDriverState *bs)
+{
+    BDRVVVFATState* state = bs->opaque;
+
+    bdrv_delete(state->qcow);
+    free(state->qcow_filename);
+}
+
+/* Driver installed on the backing_hd that enable_write_target() creates. */
+static BlockDriver vvfat_write_target = {
+    .format_name = "vvfat_write_target",
+    .bdrv_close  = write_target_close,
+    .bdrv_write  = write_target_commit,
+};
+
+/*
+ * Prepare the state for writing: allocate the per-cluster flag array and
+ * the commit queue, create and open a temporary qcow image whose backing
+ * file is "fat:", and install a backing_hd driven by vvfat_write_target.
+ *
+ * Returns 0 on success and -1 if an allocation fails, the "qcow" format is
+ * not available, or the image cannot be created or opened.
+ */
+static int enable_write_target(BDRVVVFATState *s)
+{
+    BlockDriver *bdrv_qcow;
+    QEMUOptionParameter *options;
+    int size = sector2cluster(s, s->sector_count);
+
+    s->used_clusters = calloc(size, 1);
+
+    array_init(&(s->commits), sizeof(commit_t));
+
+    /* calloc() may legitimately return NULL for a zero-sized request */
+    if (s->used_clusters == NULL && size > 0)
+        return -1;
+
+    s->qcow_filename = qemu_malloc(1024);
+    get_tmp_filename(s->qcow_filename, 1024);
+
+    bdrv_qcow = bdrv_find_format("qcow");
+    if (bdrv_qcow == NULL)
+        return -1;
+
+    options = parse_option_parameters("", bdrv_qcow->create_options, NULL);
+    set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512);
+    set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:");
+
+    if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
+        return -1;
+    s->qcow = bdrv_new("");
+    if (s->qcow == NULL || bdrv_open(s->qcow, s->qcow_filename, 0) < 0)
+        return -1;
+
+#ifndef _WIN32
+    /* the image is open; drop its directory entry right away */
+    unlink(s->qcow_filename);
+#endif
+
+    s->bs->backing_hd = calloc(1, sizeof(BlockDriverState));
+    if (s->bs->backing_hd == NULL)
+        return -1;
+    s->bs->backing_hd->drv = &vvfat_write_target;
+    s->bs->backing_hd->opaque = s;
+
+    return 0;
+}
+
+/*
+ * bdrv_close hook of the vvfat driver: closes the currently open file and
+ * releases the FAT, directory and mapping arrays and the cluster buffer.
+ */
+static void vvfat_close(BlockDriverState *bs)
+{
+    BDRVVVFATState *state = bs->opaque;
+
+    vvfat_close_current_file(state);
+
+    array_free(&(state->fat));
+    array_free(&(state->directory));
+    array_free(&(state->mapping));
+
+    /* free() accepts NULL, so no guard is needed */
+    free(state->cluster_buffer);
+}
+
+/* Block driver description for the "vvfat" format / "fat" protocol. */
+static BlockDriver bdrv_vvfat = {
+    .format_name       = "vvfat",
+    .protocol_name     = "fat",
+    .instance_size     = sizeof(BDRVVVFATState),
+    .bdrv_open         = vvfat_open,
+    .bdrv_close        = vvfat_close,
+    .bdrv_read         = vvfat_read,
+    .bdrv_write        = vvfat_write,
+    .bdrv_is_allocated = vvfat_is_allocated,
+};
+
+/* Registers the vvfat driver; handed to block_init() below. */
+static void bdrv_vvfat_init(void) {
+    bdrv_register(&bdrv_vvfat);
+}
+
+block_init(bdrv_vvfat_init);
+
+#ifdef DEBUG
+/*
+ * Debug-only consistency check on the global state `vvv`: the first
+ * mapping must end at cluster 2, check1() and check2() must pass, and a
+ * current mapping needs an open fd unless it is a directory.
+ */
+static void checkpoint(void)
+{
+    assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2);
+
+    check1(vvv);
+    check2(vvv);
+
+    assert(!vvv->current_mapping
+            || vvv->current_fd
+            || (vvv->current_mapping->mode & MODE_DIRECTORY));
+#if 0
+    if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf)
+	fprintf(stderr, "Nonono!\n");
+    mapping_t* mapping;
+    direntry_t* direntry;
+    assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next);
+    assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next);
+    if (vvv->mapping.next<47)
+	return;
+    assert((mapping = array_get(&(vvv->mapping), 47)));
+    assert(mapping->dir_index < vvv->directory.next);
+    direntry = array_get(&(vvv->directory), mapping->dir_index);
+    assert(!memcmp(direntry->name, "USB     H  ", 11) || direntry->name[0]==0);
+#endif
+    return;
+
+    /*
+     * Never reached: these calls exist only to avoid compiler warnings
+     * about the debug helpers.
+     */
+    hexdump(NULL, 100);
+    remove_mapping(vvv, NULL);
+    print_mapping(NULL);
+    print_direntry(NULL);
+}
+#endif
author	David 'Digit' Turner <digit@google.com>	2009-09-14 14:32:27 -0700
committer	David 'Digit' Turner <digit@google.com>	2009-09-14 14:32:27 -0700
commit	5d8f37ad78fc66901af50c762029a501561f3b23 (patch)
tree	206790f8f21000850a98c4f9590a79e779106278 /block
parent	cd059b15f2c7df69f4a087bd66900eb172e41d1c (diff)
download	external_qemu-5d8f37ad78fc66901af50c762029a501561f3b23.zip external_qemu-5d8f37ad78fc66901af50c762029a501561f3b23.tar.gz external_qemu-5d8f37ad78fc66901af50c762029a501561f3b23.tar.bz2