From cb42a1b1461e02efb034582ac5d8f71534723b92 Mon Sep 17 00:00:00 2001 From: David 'Digit' Turner Date: Thu, 23 Dec 2010 02:54:08 +0100 Subject: upstream: integrate block changes This large patch upgrades the block support code to the upstream version available in ba5e7f82169f32ab8163c707d97c799ca09f8924 dated 2010-08-08 Change-Id: I8b24df0c287e72f6620650a4d6a62e1bb315453e --- block/bochs.c | 77 ++--- block/cloop.c | 46 +-- block/cow.c | 147 ++++++---- block/dmg.c | 121 ++++---- block/nbd.c | 5 +- block/parallels.c | 52 +--- block/qcow.c | 112 +++++--- block/qcow2-cluster.c | 343 ++++++++++++++-------- block/qcow2-refcount.c | 750 ++++++++++++++++++++++++++++++++++++------------- block/qcow2-snapshot.c | 55 ++-- block/qcow2.c | 670 +++++++++++++++++++++++++++++-------------- block/qcow2.h | 36 ++- block/raw-posix.c | 646 +++++++++++++++++------------------------- block/raw-win32.c | 36 +-- block/raw.c | 280 ++++++++++++++++++ block/vpc.c | 97 ++++--- block/vvfat.c | 11 +- 17 files changed, 2198 insertions(+), 1286 deletions(-) create mode 100644 block/raw.c (limited to 'block') diff --git a/block/bochs.c b/block/bochs.c index bac81c4..5fe2fa3 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -80,8 +80,6 @@ struct bochs_header { }; typedef struct BDRVBochsState { - int fd; - uint32_t *catalog_bitmap; int catalog_size; @@ -109,25 +107,16 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int bochs_open(BlockDriverState *bs, const char *filename, int flags) +static int bochs_open(BlockDriverState *bs, int flags) { BDRVBochsState *s = bs->opaque; - int fd, i; + int i; struct bochs_header bochs; struct bochs_header_v1 header_v1; - fd = open(filename, O_RDWR | O_BINARY); - if (fd < 0) { - fd = open(filename, O_RDONLY | O_BINARY); - if (fd < 0) - return -1; - } - bs->read_only = 1; // no write support yet - s->fd = fd; - - if (read(fd, &bochs, sizeof(bochs)) != sizeof(bochs)) { + if (bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)) != sizeof(bochs)) { goto fail; } @@ -146,12 +135,10 @@ static int bochs_open(BlockDriverState *bs, const char *filename, int flags) bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; } - lseek(s->fd, le32_to_cpu(bochs.header), SEEK_SET); - s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); s->catalog_bitmap = qemu_malloc(s->catalog_size * 4); - if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) != - s->catalog_size * 4) + if (bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, + s->catalog_size * 4) != s->catalog_size * 4) goto fail; for (i = 0; i < s->catalog_size; i++) le32_to_cpus(&s->catalog_bitmap[i]); @@ -165,68 +152,53 @@ static int bochs_open(BlockDriverState *bs, const char *filename, int flags) return 0; fail: - close(fd); return -1; } -static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num) +static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) { BDRVBochsState *s = bs->opaque; int64_t offset = sector_num * 512; - int64_t extent_index, extent_offset, bitmap_offset, block_offset; + int64_t extent_index, extent_offset, bitmap_offset; char bitmap_entry; // seek to sector extent_index = offset / s->extent_size; extent_offset = (offset % s->extent_size) / 512; - if (s->catalog_bitmap[extent_index] == 0xffffffff) - { -// fprintf(stderr, "page not allocated [%x - %x:%x]\n", -// sector_num, extent_index, extent_offset); - return -1; // not allocated + if (s->catalog_bitmap[extent_index] == 0xffffffff) { + return -1; /* not allocated */ } bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] * (s->extent_blocks + s->bitmap_blocks)); - block_offset = bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); - -// fprintf(stderr, "sect: %x [ext i: %x o: %x] -> %x bitmap: %x block: %x\n", -// sector_num, extent_index, extent_offset, -// le32_to_cpu(s->catalog_bitmap[extent_index]), -// bitmap_offset, block_offset); - // read in bitmap for current extent - lseek(s->fd, bitmap_offset + (extent_offset / 8), SEEK_SET); - - read(s->fd, &bitmap_entry, 1); - - if (!((bitmap_entry >> (extent_offset % 8)) & 1)) - { -// fprintf(stderr, "sector (%x) in bitmap not allocated\n", -// sector_num); - return -1; // not allocated + /* read in bitmap for current extent */ + if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), + &bitmap_entry, 1) != 1) { + return -1; } - lseek(s->fd, block_offset, SEEK_SET); + if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { + return -1; /* not allocated */ + } - return 0; + return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); } static int bochs_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { - BDRVBochsState *s = bs->opaque; int ret; while (nb_sectors > 0) { - if (!seek_to_sector(bs, sector_num)) - { - ret = read(s->fd, buf, 512); - if (ret != 512) - return -1; - } - else + int64_t block_offset = seek_to_sector(bs, sector_num); + if (block_offset >= 0) { + ret = bdrv_pread(bs->file, block_offset, buf, 512); + if (ret != 512) { + return -1; + } + } else memset(buf, 0, 512); nb_sectors--; sector_num++; @@ -239,7 +211,6 @@ static void bochs_close(BlockDriverState *bs) { BDRVBochsState *s = bs->opaque; qemu_free(s->catalog_bitmap); - close(s->fd); } static BlockDriver bdrv_bochs = { diff --git a/block/cloop.c b/block/cloop.c index 06c687e..fe015c4 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -27,7 +27,6 @@ #include typedef struct BDRVCloopState { - int fd; uint32_t block_size; uint32_t n_blocks; uint64_t* offsets; @@ -51,34 +50,31 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cloop_open(BlockDriverState *bs, const char *filename, int flags) +static int cloop_open(BlockDriverState *bs, int flags) { BDRVCloopState *s = bs->opaque; uint32_t offsets_size,max_compressed_block_size=1,i; - s->fd = open(filename, O_RDONLY | O_BINARY); - if (s->fd < 0) - return -errno; bs->read_only = 1; /* read header */ - if(lseek(s->fd,128,SEEK_SET)<0) { -cloop_close: - close(s->fd); - return -1; + if (bdrv_pread(bs->file, 128, &s->block_size, 4) < 4) { + goto cloop_close; } - if(read(s->fd,&s->block_size,4)<4) - goto cloop_close; - s->block_size=be32_to_cpu(s->block_size); - if(read(s->fd,&s->n_blocks,4)<4) - goto cloop_close; - s->n_blocks=be32_to_cpu(s->n_blocks); + s->block_size = be32_to_cpu(s->block_size); + + if (bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4) < 4) { + goto cloop_close; + } + s->n_blocks = be32_to_cpu(s->n_blocks); /* read offsets */ - offsets_size=s->n_blocks*sizeof(uint64_t); - s->offsets=(uint64_t*)qemu_malloc(offsets_size); - if(read(s->fd,s->offsets,offsets_size)n_blocks * sizeof(uint64_t); + s->offsets = qemu_malloc(offsets_size); + if (bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size) < + offsets_size) { goto cloop_close; + } for(i=0;in_blocks;i++) { s->offsets[i]=be64_to_cpu(s->offsets[i]); if(i>0) { @@ -98,16 +94,21 @@ cloop_close: s->sectors_per_block = s->block_size/512; bs->total_sectors = s->n_blocks*s->sectors_per_block; return 0; + +cloop_close: + return -1; } -static inline int cloop_read_block(BDRVCloopState *s,int block_num) +static inline int cloop_read_block(BlockDriverState *bs, int block_num) { + BDRVCloopState *s = bs->opaque; + if(s->current_block != block_num) { int ret; uint32_t bytes = s->offsets[block_num+1]-s->offsets[block_num]; - lseek(s->fd, s->offsets[block_num], SEEK_SET); - ret = read(s->fd, s->compressed_block, bytes); + ret = bdrv_pread(bs->file, s->offsets[block_num], s->compressed_block, + bytes); if (ret != bytes) return -1; @@ -136,7 +137,7 @@ static int cloop_read(BlockDriverState *bs, int64_t sector_num, for(i=0;isectors_per_block), block_num=(sector_num+i)/s->sectors_per_block; - if(cloop_read_block(s, block_num) != 0) + if(cloop_read_block(bs, block_num) != 0) return -1; memcpy(buf+i*512,s->uncompressed_block+sector_offset_in_block*512,512); } @@ -146,7 +147,6 @@ static int cloop_read(BlockDriverState *bs, int64_t sector_num, static void cloop_close(BlockDriverState *bs) { BDRVCloopState *s = bs->opaque; - close(s->fd); if(s->n_blocks>0) free(s->offsets); free(s->compressed_block); diff --git a/block/cow.c b/block/cow.c index 84818f1..eedcc48 100644 --- a/block/cow.c +++ b/block/cow.c @@ -21,11 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#ifndef _WIN32 #include "qemu-common.h" #include "block_int.h" #include "module.h" -#include /**************************************************************/ /* COW block driver using file system holes */ @@ -44,10 +42,6 @@ struct cow_header_v2 { }; typedef struct BDRVCowState { - int fd; - uint8_t *cow_bitmap; /* if non NULL, COW mappings are used first */ - uint8_t *cow_bitmap_addr; /* mmap address of cow_bitmap */ - int cow_bitmap_size; int64_t cow_sectors_offset; } BDRVCowState; @@ -63,22 +57,16 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cow_open(BlockDriverState *bs, const char *filename, int flags) +static int cow_open(BlockDriverState *bs, int flags) { BDRVCowState *s = bs->opaque; - int fd; struct cow_header_v2 cow_header; + int bitmap_size; int64_t size; - fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE); - if (fd < 0) { - fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE); - if (fd < 0) - return -1; - } - s->fd = fd; /* see if it is a cow image */ - if (read(fd, &cow_header, sizeof(cow_header)) != sizeof(cow_header)) { + if (bdrv_pread(bs->file, 0, &cow_header, sizeof(cow_header)) != + sizeof(cow_header)) { goto fail; } @@ -94,61 +82,91 @@ static int cow_open(BlockDriverState *bs, const char *filename, int flags) pstrcpy(bs->backing_file, sizeof(bs->backing_file), cow_header.backing_file); - /* mmap the bitmap */ - s->cow_bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header); - s->cow_bitmap_addr = (void *)mmap(get_mmap_addr(s->cow_bitmap_size), - s->cow_bitmap_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, s->fd, 0); - if (s->cow_bitmap_addr == MAP_FAILED) - goto fail; - s->cow_bitmap = s->cow_bitmap_addr + sizeof(cow_header); - s->cow_sectors_offset = (s->cow_bitmap_size + 511) & ~511; + bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(cow_header); + s->cow_sectors_offset = (bitmap_size + 511) & ~511; return 0; fail: - close(fd); return -1; } -static inline void cow_set_bit(uint8_t *bitmap, int64_t bitnum) +/* + * XXX(hch): right now these functions are extremly ineffcient. + * We should just read the whole bitmap we'll need in one go instead. + */ +static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum) { - bitmap[bitnum / 8] |= (1 << (bitnum%8)); + uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; + uint8_t bitmap; + int ret; + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + bitmap |= (1 << (bitnum % 8)); + + ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + return 0; } -static inline int is_bit_set(const uint8_t *bitmap, int64_t bitnum) +static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) { - return !!(bitmap[bitnum / 8] & (1 << (bitnum%8))); -} + uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; + uint8_t bitmap; + int ret; + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + return !!(bitmap & (1 << (bitnum % 8))); +} /* Return true if first block has been changed (ie. current version is * in COW file). Set the number of continuous blocks for which that * is true. */ -static inline int is_changed(uint8_t *bitmap, - int64_t sector_num, int nb_sectors, - int *num_same) +static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *num_same) { int changed; - if (!bitmap || nb_sectors == 0) { + if (nb_sectors == 0) { *num_same = nb_sectors; return 0; } - changed = is_bit_set(bitmap, sector_num); + changed = is_bit_set(bs, sector_num); + if (changed < 0) { + return 0; /* XXX: how to return I/O errors? */ + } + for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) { - if (is_bit_set(bitmap, sector_num + *num_same) != changed) + if (is_bit_set(bs, sector_num + *num_same) != changed) break; } return changed; } -static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) { - BDRVCowState *s = bs->opaque; - return is_changed(s->cow_bitmap, sector_num, nb_sectors, pnum); + int error = 0; + int i; + + for (i = 0; i < nb_sectors; i++) { + error = cow_set_bit(bs, sector_num + i); + if (error) { + break; + } + } + + return error; } static int cow_read(BlockDriverState *bs, int64_t sector_num, @@ -158,9 +176,10 @@ static int cow_read(BlockDriverState *bs, int64_t sector_num, int ret, n; while (nb_sectors > 0) { - if (is_changed(s->cow_bitmap, sector_num, nb_sectors, &n)) { - lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET); - ret = read(s->fd, buf, n * 512); + if (cow_is_allocated(bs, sector_num, nb_sectors, &n)) { + ret = bdrv_pread(bs->file, + s->cow_sectors_offset + sector_num * 512, + buf, n * 512); if (ret != n * 512) return -1; } else { @@ -184,22 +203,18 @@ static int cow_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { BDRVCowState *s = bs->opaque; - int ret, i; + int ret; - lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET); - ret = write(s->fd, buf, nb_sectors * 512); + ret = bdrv_pwrite(bs->file, s->cow_sectors_offset + sector_num * 512, + buf, nb_sectors * 512); if (ret != nb_sectors * 512) return -1; - for (i = 0; i < nb_sectors; i++) - cow_set_bit(s->cow_bitmap, sector_num + i); - return 0; + + return cow_update_bitmap(bs, sector_num, nb_sectors); } static void cow_close(BlockDriverState *bs) { - BDRVCowState *s = bs->opaque; - munmap((void *)s->cow_bitmap_addr, s->cow_bitmap_size); - close(s->fd); } static int cow_create(const char *filename, QEMUOptionParameter *options) @@ -209,6 +224,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) struct stat st; int64_t image_sectors = 0; const char *image_filename = NULL; + int ret; /* Read out options */ while (options && options->name) { @@ -223,7 +239,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); if (cow_fd < 0) - return -1; + return -errno; memset(&cow_header, 0, sizeof(cow_header)); cow_header.magic = cpu_to_be32(COW_MAGIC); cow_header.version = cpu_to_be32(COW_VERSION); @@ -248,17 +264,27 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) } cow_header.sectorsize = cpu_to_be32(512); cow_header.size = cpu_to_be64(image_sectors * 512); - write(cow_fd, &cow_header, sizeof(cow_header)); + ret = qemu_write_full(cow_fd, &cow_header, sizeof(cow_header)); + if (ret != sizeof(cow_header)) { + ret = -errno; + goto exit; + } + /* resize to include at least all the bitmap */ - ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)); + ret = ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)); + if (ret) { + ret = -errno; + goto exit; + } + +exit: close(cow_fd); - return 0; + return ret; } static void cow_flush(BlockDriverState *bs) { - BDRVCowState *s = bs->opaque; - fsync(s->fd); + bdrv_flush(bs->file); } static QEMUOptionParameter cow_create_options[] = { @@ -296,4 +322,3 @@ static void bdrv_cow_init(void) } block_init(bdrv_cow_init); -#endif diff --git a/block/dmg.c b/block/dmg.c index 262560f..a3c815b 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -28,8 +28,6 @@ #include typedef struct BDRVDMGState { - int fd; - /* each chunk contains a certain number of sectors, * offsets[i] is the offset in the .dmg file, * lengths[i] is the length of the compressed chunk, @@ -58,72 +56,75 @@ static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static off_t read_off(int fd) +static off_t read_off(BlockDriverState *bs, int64_t offset) { uint64_t buffer; - if(read(fd,&buffer,8)<8) + if (bdrv_pread(bs->file, offset, &buffer, 8) < 8) return 0; return be64_to_cpu(buffer); } -static off_t read_uint32(int fd) +static off_t read_uint32(BlockDriverState *bs, int64_t offset) { uint32_t buffer; - if(read(fd,&buffer,4)<4) + if (bdrv_pread(bs->file, offset, &buffer, 4) < 4) return 0; return be32_to_cpu(buffer); } -static int dmg_open(BlockDriverState *bs, const char *filename, int flags) +static int dmg_open(BlockDriverState *bs, int flags) { BDRVDMGState *s = bs->opaque; off_t info_begin,info_end,last_in_offset,last_out_offset; uint32_t count; uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; + int64_t offset; - s->fd = open(filename, O_RDONLY | O_BINARY); - if (s->fd < 0) - return -errno; bs->read_only = 1; s->n_chunks = 0; s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; /* read offset of info blocks */ - if(lseek(s->fd,-0x1d8,SEEK_END)<0) { -dmg_close: - close(s->fd); - /* open raw instead */ - bs->drv=bdrv_find_format("raw"); - return bs->drv->bdrv_open(bs, filename, flags); + offset = bdrv_getlength(bs->file); + if (offset < 0) { + goto fail; + } + offset -= 0x1d8; + + info_begin = read_off(bs, offset); + if (info_begin == 0) { + goto fail; + } + + if (read_uint32(bs, info_begin) != 0x100) { + goto fail; } - info_begin=read_off(s->fd); - if(info_begin==0) - goto dmg_close; - if(lseek(s->fd,info_begin,SEEK_SET)<0) - goto dmg_close; - if(read_uint32(s->fd)!=0x100) - goto dmg_close; - if((count = read_uint32(s->fd))==0) - goto dmg_close; - info_end = info_begin+count; - if(lseek(s->fd,0xf8,SEEK_CUR)<0) - goto dmg_close; + + count = read_uint32(bs, info_begin + 4); + if (count == 0) { + goto fail; + } + info_end = info_begin + count; + + offset = info_begin + 0x100; /* read offsets */ last_in_offset = last_out_offset = 0; - while(lseek(s->fd,0,SEEK_CUR)fd); + count = read_uint32(bs, offset); if(count==0) - goto dmg_close; - type = read_uint32(s->fd); - if(type!=0x6d697368 || count<244) - lseek(s->fd,count-4,SEEK_CUR); - else { + goto fail; + offset += 4; + + type = read_uint32(bs, offset); + if (type == 0x6d697368 && count >= 244) { int new_size, chunk_count; - if(lseek(s->fd,200,SEEK_CUR)<0) - goto dmg_close; + + offset += 4; + offset += 200; + chunk_count = (count-204)/40; new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); s->types = qemu_realloc(s->types, new_size/2); @@ -133,7 +134,8 @@ dmg_close: s->sectorcounts = qemu_realloc(s->sectorcounts, new_size); for(i=s->n_chunks;in_chunks+chunk_count;i++) { - s->types[i] = read_uint32(s->fd); + s->types[i] = read_uint32(bs, offset); + offset += 4; if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { if(s->types[i]==0xffffffff) { last_in_offset = s->offsets[i-1]+s->lengths[i-1]; @@ -141,15 +143,23 @@ dmg_close: } chunk_count--; i--; - if(lseek(s->fd,36,SEEK_CUR)<0) - goto dmg_close; + offset += 36; continue; } - read_uint32(s->fd); - s->sectors[i] = last_out_offset+read_off(s->fd); - s->sectorcounts[i] = read_off(s->fd); - s->offsets[i] = last_in_offset+read_off(s->fd); - s->lengths[i] = read_off(s->fd); + offset += 4; + + s->sectors[i] = last_out_offset+read_off(bs, offset); + offset += 8; + + s->sectorcounts[i] = read_off(bs, offset); + offset += 8; + + s->offsets[i] = last_in_offset+read_off(bs, offset); + offset += 8; + + s->lengths[i] = read_off(bs, offset); + offset += 8; + if(s->lengths[i]>max_compressed_size) max_compressed_size = s->lengths[i]; if(s->sectorcounts[i]>max_sectors_per_chunk) @@ -163,11 +173,13 @@ dmg_close: s->compressed_chunk = qemu_malloc(max_compressed_size+1); s->uncompressed_chunk = qemu_malloc(512*max_sectors_per_chunk); if(inflateInit(&s->zstream) != Z_OK) - goto dmg_close; + goto fail; s->current_chunk = s->n_chunks; return 0; +fail: + return -1; } static inline int is_sector_in_chunk(BDRVDMGState* s, @@ -196,8 +208,10 @@ static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num) return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num) { + BDRVDMGState *s = bs->opaque; + if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) { int ret; uint32_t chunk = search_chunk(s,sector_num); @@ -210,15 +224,12 @@ static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num) case 0x80000005: { /* zlib compressed */ int i; - ret = lseek(s->fd, s->offsets[chunk], SEEK_SET); - if(ret<0) - return -1; - /* we need to buffer, because only the chunk as whole can be * inflated. */ i=0; do { - ret = read(s->fd, s->compressed_chunk+i, s->lengths[chunk]-i); + ret = bdrv_pread(bs->file, s->offsets[chunk] + i, + s->compressed_chunk+i, s->lengths[chunk]-i); if(ret<0 && errno==EINTR) ret=0; i+=ret; @@ -239,7 +250,8 @@ static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num) return -1; break; } case 1: /* copy */ - ret = read(s->fd, s->uncompressed_chunk, s->lengths[chunk]); + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) return -1; break; @@ -260,7 +272,7 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, for(i=0;isectors[s->current_chunk]; memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512); @@ -271,7 +283,6 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, static void dmg_close(BlockDriverState *bs) { BDRVDMGState *s = bs->opaque; - close(s->fd); if(s->n_chunks>0) { free(s->types); free(s->offsets); diff --git a/block/nbd.c b/block/nbd.c index 47d4778..a1ec123 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -49,9 +49,6 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags) size_t blocksize; int ret; - if ((flags & BDRV_O_CREAT)) - return -EINVAL; - if (!strstart(filename, "nbd:", &host)) return -EINVAL; @@ -180,7 +177,7 @@ static int64_t nbd_getlength(BlockDriverState *bs) static BlockDriver bdrv_nbd = { .format_name = "nbd", .instance_size = sizeof(BDRVNBDState), - .bdrv_open = nbd_open, + .bdrv_file_open = nbd_open, .bdrv_read = nbd_read, .bdrv_write = nbd_write, .bdrv_close = nbd_close, diff --git a/block/parallels.c b/block/parallels.c index 0b64a5c..35a14aa 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -46,7 +46,6 @@ struct parallels_header { } __attribute__((packed)); typedef struct BDRVParallelsState { - int fd; uint32_t *catalog_bitmap; int catalog_size; @@ -68,24 +67,15 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam return 0; } -static int parallels_open(BlockDriverState *bs, const char *filename, int flags) +static int parallels_open(BlockDriverState *bs, int flags) { BDRVParallelsState *s = bs->opaque; - int fd, i; + int i; struct parallels_header ph; - fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE); - if (fd < 0) { - fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE); - if (fd < 0) - return -1; - } - bs->read_only = 1; // no write support yet - s->fd = fd; - - if (read(fd, &ph, sizeof(ph)) != sizeof(ph)) + if (bdrv_pread(bs->file, 0, &ph, sizeof(ph)) != sizeof(ph)) goto fail; if (memcmp(ph.magic, HEADER_MAGIC, 16) || @@ -95,14 +85,11 @@ static int parallels_open(BlockDriverState *bs, const char *filename, int flags) bs->total_sectors = le32_to_cpu(ph.nb_sectors); - if (lseek(s->fd, 64, SEEK_SET) != 64) - goto fail; - s->tracks = le32_to_cpu(ph.tracks); s->catalog_size = le32_to_cpu(ph.catalog_entries); s->catalog_bitmap = qemu_malloc(s->catalog_size * 4); - if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) != + if (bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4) != s->catalog_size * 4) goto fail; for (i = 0; i < s->catalog_size; i++) @@ -112,44 +99,34 @@ static int parallels_open(BlockDriverState *bs, const char *filename, int flags) fail: if (s->catalog_bitmap) qemu_free(s->catalog_bitmap); - close(fd); return -1; } -static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num) +static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) { BDRVParallelsState *s = bs->opaque; - uint32_t index, offset, position; + uint32_t index, offset; index = sector_num / s->tracks; offset = sector_num % s->tracks; - // not allocated + /* not allocated */ if ((index > s->catalog_size) || (s->catalog_bitmap[index] == 0)) return -1; - - position = (s->catalog_bitmap[index] + offset) * 512; - -// fprintf(stderr, "sector: %llx index=%x offset=%x pointer=%x position=%x\n", -// sector_num, index, offset, s->catalog_bitmap[index], position); - - if (lseek(s->fd, position, SEEK_SET) != position) - return -1; - - return 0; + return (uint64_t)(s->catalog_bitmap[index] + offset) * 512; } static int parallels_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { - BDRVParallelsState *s = bs->opaque; - while (nb_sectors > 0) { - if (!seek_to_sector(bs, sector_num)) { - if (read(s->fd, buf, 512) != 512) - return -1; - } else + int64_t position = seek_to_sector(bs, sector_num); + if (position >= 0) { + if (bdrv_pread(bs->file, position, buf, 512) != 512) + return -1; + } else { memset(buf, 0, 512); + } nb_sectors--; sector_num++; buf += 512; @@ -161,7 +138,6 @@ static void parallels_close(BlockDriverState *bs) { BDRVParallelsState *s = bs->opaque; qemu_free(s->catalog_bitmap); - close(s->fd); } static BlockDriver bdrv_parallels = { diff --git a/block/qcow.c b/block/qcow.c index 55a68a6..816103d 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -76,7 +76,7 @@ typedef struct BDRVQcowState { AES_KEY aes_decrypt_key; } BDRVQcowState; -static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset); +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) { @@ -90,16 +90,13 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int qcow_open(BlockDriverState *bs, const char *filename, int flags) +static int qcow_open(BlockDriverState *bs, int flags) { BDRVQcowState *s = bs->opaque; - int len, i, shift, ret; + int len, i, shift; QCowHeader header; - ret = bdrv_file_open(&s->hd, filename, flags); - if (ret < 0) - return ret; - if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header)) + if (bdrv_pread(bs->file, 0, &header, sizeof(header)) != sizeof(header)) goto fail; be32_to_cpus(&header.magic); be32_to_cpus(&header.version); @@ -135,7 +132,7 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); if (!s->l1_table) goto fail; - if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != + if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != s->l1_size * sizeof(uint64_t)) goto fail; for(i = 0;i < s->l1_size; i++) { @@ -158,7 +155,7 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) len = header.backing_file_size; if (len > 1023) len = 1023; - if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len) + if (bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len) != len) goto fail; bs->backing_file[len] = '\0'; } @@ -169,7 +166,6 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) qemu_free(s->l2_cache); qemu_free(s->cluster_cache); qemu_free(s->cluster_data); - bdrv_delete(s->hd); return -1; } @@ -271,14 +267,15 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, if (!allocate) return 0; /* allocate a new l2 entry */ - l2_offset = bdrv_getlength(s->hd); + l2_offset = bdrv_getlength(bs->file); /* round to cluster size */ l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); /* update the L1 entry */ s->l1_table[l1_index] = l2_offset; tmp = cpu_to_be64(l2_offset); - if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp), - &tmp, sizeof(tmp)) != sizeof(tmp)) + if (bdrv_pwrite_sync(bs->file, + s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) return 0; new_l2_table = 1; } @@ -306,11 +303,11 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, l2_table = s->l2_cache + (min_index << s->l2_bits); if (new_l2_table) { memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != - s->l2_size * sizeof(uint64_t)) + if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) < 0) return 0; } else { - if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != s->l2_size * sizeof(uint64_t)) return 0; } @@ -329,22 +326,22 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* if the cluster is already compressed, we must decompress it in the case it is not completely overwritten */ - if (decompress_cluster(s, cluster_offset) < 0) + if (decompress_cluster(bs, cluster_offset) < 0) return 0; - cluster_offset = bdrv_getlength(s->hd); + cluster_offset = bdrv_getlength(bs->file); cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); /* write the cluster content */ - if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) != + if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != s->cluster_size) return -1; } else { - cluster_offset = bdrv_getlength(s->hd); + cluster_offset = bdrv_getlength(bs->file); if (allocate == 1) { /* round to cluster size */ cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); - bdrv_truncate(s->hd, cluster_offset + s->cluster_size); + bdrv_truncate(bs->file, cluster_offset + s->cluster_size); /* if encrypted, we must initialize the cluster content which won't be written */ if (s->crypt_method && @@ -358,7 +355,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, s->cluster_data, s->cluster_data + 512, 1, 1, &s->aes_encrypt_key); - if (bdrv_pwrite(s->hd, cluster_offset + i * 512, + if (bdrv_pwrite(bs->file, cluster_offset + i * 512, s->cluster_data, 512) != 512) return -1; } @@ -372,8 +369,8 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* update L2 table */ tmp = cpu_to_be64(cluster_offset); l2_table[l2_index] = tmp; - if (bdrv_pwrite(s->hd, - l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp)) + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) return 0; } return cluster_offset; @@ -422,8 +419,9 @@ static int decompress_buffer(uint8_t *out_buf, int out_buf_size, return 0; } -static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset) +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) { + BDRVQcowState *s = bs->opaque; int ret, csize; uint64_t coffset; @@ -431,7 +429,7 @@ static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset) if (s->cluster_cache_offset != coffset) { csize = cluster_offset >> (63 - s->cluster_bits); csize &= (s->cluster_size - 1); - ret = bdrv_pread(s->hd, coffset, s->cluster_data, csize); + ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); if (ret != csize) return -1; if (decompress_buffer(s->cluster_cache, s->cluster_size, @@ -468,11 +466,11 @@ static int qcow_read(BlockDriverState *bs, int64_t sector_num, memset(buf, 0, 512 * n); } } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - if (decompress_cluster(s, cluster_offset) < 0) + if (decompress_cluster(bs, cluster_offset) < 0) return -1; memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); } else { - ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); + ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512); if (ret != n * 512) return -1; if (s->crypt_method) { @@ -505,7 +503,7 @@ typedef struct QCowAIOCB { static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) { - QCowAIOCB *acb = (QCowAIOCB *)blockacb; + QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); if (acb->hd_aiocb) bdrv_aio_cancel(acb->hd_aiocb); qemu_aio_release(acb); @@ -601,7 +599,7 @@ static void qcow_aio_read_cb(void *opaque, int ret) } } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ - if (decompress_cluster(s, acb->cluster_offset) < 0) + if (decompress_cluster(bs, acb->cluster_offset) < 0) goto done; memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, 512 * acb->n); @@ -614,7 +612,7 @@ static void qcow_aio_read_cb(void *opaque, int ret) acb->hd_iov.iov_base = (void *)acb->buf; acb->hd_iov.iov_len = acb->n * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_readv(s->hd, + acb->hd_aiocb = bdrv_aio_readv(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) @@ -699,7 +697,7 @@ static void qcow_aio_write_cb(void *opaque, int ret) acb->hd_iov.iov_base = (void *)src_buf; acb->hd_iov.iov_len = acb->n * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_writev(s->hd, + acb->hd_aiocb = bdrv_aio_writev(bs->file, (cluster_offset >> 9) + index_in_cluster, &acb->hd_qiov, acb->n, qcow_aio_write_cb, acb); @@ -723,7 +721,7 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, s->cluster_cache_offset = -1; /* disable compressed cache */ - acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); if (!acb) return NULL; @@ -739,7 +737,6 @@ static void qcow_close(BlockDriverState *bs) qemu_free(s->l2_cache); qemu_free(s->cluster_cache); qemu_free(s->cluster_data); - bdrv_delete(s->hd); } static int qcow_create(const char *filename, QEMUOptionParameter *options) @@ -750,6 +747,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) int64_t total_size = 0; const char *backing_file = NULL; int flags = 0; + int ret; /* Read out options */ while (options && options->name) { @@ -765,7 +763,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); if (fd < 0) - return -1; + return -errno; memset(&header, 0, sizeof(header)); header.magic = cpu_to_be32(QCOW_MAGIC); header.version = cpu_to_be32(QCOW_VERSION); @@ -801,17 +799,34 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) } /* write all the data */ - write(fd, &header, sizeof(header)); + ret = qemu_write_full(fd, &header, sizeof(header)); + if (ret != sizeof(header)) { + ret = -errno; + goto exit; + } + if (backing_file) { - write(fd, backing_file, backing_filename_len); + ret = qemu_write_full(fd, backing_file, backing_filename_len); + if (ret != backing_filename_len) { + ret = -errno; + goto exit; + } + } lseek(fd, header_size, SEEK_SET); tmp = 0; for(i = 0;i < l1_size; i++) { - write(fd, &tmp, sizeof(tmp)); + ret = qemu_write_full(fd, &tmp, sizeof(tmp)); + if (ret != sizeof(tmp)) { + ret = -errno; + goto exit; + } } + + ret = 0; +exit: close(fd); - return 0; + return ret; } static int qcow_make_empty(BlockDriverState *bs) @@ -821,9 +836,10 @@ static int qcow_make_empty(BlockDriverState *bs) int ret; memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0) - return -1; - ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length); + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); if (ret < 0) return ret; @@ -884,7 +900,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, out_len, 0, 0); cluster_offset &= s->cluster_offset_mask; - if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) { + if (bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len) != out_len) { qemu_free(out_buf); return -1; } @@ -896,8 +912,13 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, static void qcow_flush(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; - bdrv_flush(s->hd); + bdrv_flush(bs->file); +} + +static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_flush(bs->file, cb, opaque); } static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -940,6 +961,7 @@ static BlockDriver bdrv_qcow = { .bdrv_make_empty = qcow_make_empty, .bdrv_aio_readv = qcow_aio_readv, .bdrv_aio_writev = qcow_aio_writev, + .bdrv_aio_flush = qcow_aio_flush, .bdrv_write_compressed = qcow_write_compressed, .bdrv_get_info = qcow_get_info, diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index fdedf17..166922f 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -33,12 +33,15 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size) BDRVQcowState *s = bs->opaque; int new_l1_size, new_l1_size2, ret, i; uint64_t *new_l1_table; - uint64_t new_l1_table_offset; + int64_t new_l1_table_offset; uint8_t data[12]; new_l1_size = s->l1_size; if (min_size <= new_l1_size) return 0; + if (new_l1_size == 0) { + new_l1_size = 1; + } while (min_size > new_l1_size) { new_l1_size = (new_l1_size * 3 + 1) / 2; } @@ -47,26 +50,34 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size) #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; - new_l1_table = qemu_mallocz(new_l1_size2); + new_l1_table = qemu_mallocz(align_offset(new_l1_size2, 512)); memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); /* write new table (align to cluster) */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); + if (new_l1_table_offset < 0) { + qemu_free(new_l1_table); + return new_l1_table_offset; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); - ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2); - if (ret != new_l1_size2) + ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); + if (ret < 0) goto fail; for(i = 0; i < s->l1_size; i++) new_l1_table[i] = be64_to_cpu(new_l1_table[i]); /* set new table */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); cpu_to_be32w((uint32_t*)data, new_l1_size); cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset); - if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data, - sizeof(data)) != sizeof(data)) + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); + if (ret < 0) { goto fail; + } qemu_free(s->l1_table); qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t)); s->l1_table_offset = new_l1_table_offset; @@ -74,8 +85,9 @@ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size) s->l1_size = new_l1_size; return 0; fail: - qemu_free(s->l1_table); - return -EIO; + qemu_free(new_l1_table); + qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2); + return ret; } void qcow2_l2_cache_reset(BlockDriverState *bs) @@ -145,29 +157,36 @@ static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset) * the image file failed. */ -static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset) +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, + uint64_t **l2_table) { BDRVQcowState *s = bs->opaque; int min_index; - uint64_t *l2_table; + int ret; /* seek if the table for the given offset is in the cache */ - l2_table = seek_l2_table(s, l2_offset); - if (l2_table != NULL) - return l2_table; + *l2_table = seek_l2_table(s, l2_offset); + if (*l2_table != NULL) { + return 0; + } /* not found: load a new entry in the least used one */ min_index = l2_cache_new_entry(bs); - l2_table = s->l2_cache + (min_index << s->l2_bits); - if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != - s->l2_size * sizeof(uint64_t)) - return NULL; + *l2_table = s->l2_cache + (min_index << s->l2_bits); + + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); + ret = bdrv_pread(bs->file, l2_offset, *l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + s->l2_cache_offsets[min_index] = l2_offset; s->l2_cache_counts[min_index] = 1; - return l2_table; + return 0; } /* @@ -175,21 +194,23 @@ static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset) * and we really don't want bdrv_pread to perform a read-modify-write) */ #define L1_ENTRIES_PER_SECTOR (512 / 8) -static int write_l1_entry(BDRVQcowState *s, int l1_index) +static int write_l1_entry(BlockDriverState *bs, int l1_index) { + BDRVQcowState *s = bs->opaque; uint64_t buf[L1_ENTRIES_PER_SECTOR]; int l1_start_index; - int i; + int i, ret; l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); for (i = 0; i < L1_ENTRIES_PER_SECTOR; i++) { buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); } - if (bdrv_pwrite(s->hd, s->l1_table_offset + 8 * l1_start_index, - buf, sizeof(buf)) != sizeof(buf)) - { - return -1; + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, + buf, sizeof(buf)); + if (ret < 0) { + return ret; } return 0; @@ -205,24 +226,22 @@ static int write_l1_entry(BDRVQcowState *s, int l1_index) * */ -static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index) +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) { BDRVQcowState *s = bs->opaque; int min_index; uint64_t old_l2_offset; - uint64_t *l2_table, l2_offset; + uint64_t *l2_table; + int64_t l2_offset; + int ret; old_l2_offset = s->l1_table[l1_index]; /* allocate a new l2 entry */ l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); - - /* update the L1 entry */ - - s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; - if (write_l1_entry(s, l1_index) < 0) { - return NULL; + if (l2_offset < 0) { + return l2_offset; } /* allocate a new entry in the l2 cache */ @@ -235,23 +254,40 @@ static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index) memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); } else { /* if there was an old l2 table, read it from the disk */ - if (bdrv_pread(s->hd, old_l2_offset, - l2_table, s->l2_size * sizeof(uint64_t)) != - s->l2_size * sizeof(uint64_t)) - return NULL; + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); + ret = bdrv_pread(bs->file, old_l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } } /* write the l2 table to the file */ - if (bdrv_pwrite(s->hd, l2_offset, - l2_table, s->l2_size * sizeof(uint64_t)) != - s->l2_size * sizeof(uint64_t)) - return NULL; + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + ret = bdrv_pwrite_sync(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; + ret = write_l1_entry(bs, l1_index); + if (ret < 0) { + goto fail; + } /* update the l2 cache entry */ s->l2_cache_offsets[min_index] = l2_offset; s->l2_cache_counts[min_index] = 1; - return l2_table; + *table = l2_table; + return 0; + +fail: + s->l1_table[l1_index] = old_l2_offset; + qcow2_l2_cache_reset(bs); + return ret; } static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, @@ -264,7 +300,7 @@ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, return 0; for (i = start; i < start + nb_clusters; i++) - if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask)) + if (offset + (uint64_t) i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask)) break; return (i - start); @@ -306,8 +342,8 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, } -int qcow2_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors) +static int qcow_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) { BDRVQcowState *s = bs->opaque; int ret, index_in_cluster, n, n1; @@ -315,13 +351,20 @@ int qcow2_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, while (nb_sectors > 0) { n = nb_sectors; - cluster_offset = qcow2_get_cluster_offset(bs, sector_num << 9, &n); + + ret = qcow2_get_cluster_offset(bs, sector_num << 9, &n, + &cluster_offset); + if (ret < 0) { + return ret; + } + index_in_cluster = sector_num & (s->cluster_sectors - 1); if (!cluster_offset) { if (bs->backing_hd) { /* read from the base image */ n1 = qcow2_backing_read1(bs->backing_hd, sector_num, buf, n); if (n1 > 0) { + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING); ret = bdrv_read(bs->backing_hd, sector_num, buf, n1); if (ret < 0) return -1; @@ -330,11 +373,12 @@ int qcow2_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, memset(buf, 0, 512 * n); } } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - if (qcow2_decompress_cluster(s, cluster_offset) < 0) + if (qcow2_decompress_cluster(bs, cluster_offset) < 0) return -1; memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n); } else { - ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); + BLKDBG_EVENT(bs->file, BLKDBG_READ); + ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512); if (ret != n * 512) return -1; if (s->crypt_method) { @@ -358,7 +402,8 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, n = n_end - n_start; if (n <= 0) return 0; - ret = qcow2_read(bs, start_sect + n_start, s->cluster_data, n); + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + ret = qcow_read(bs, start_sect + n_start, s->cluster_data, n); if (ret < 0) return ret; if (s->crypt_method) { @@ -367,8 +412,9 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, s->cluster_data, n, 1, &s->aes_encrypt_key); } - ret = bdrv_write(s->hd, (cluster_offset >> 9) + n_start, - s->cluster_data, n); + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); + ret = bdrv_write_sync(bs->file, (cluster_offset >> 9) + n_start, + s->cluster_data, n); if (ret < 0) return ret; return 0; @@ -378,27 +424,29 @@ static int copy_sectors(BlockDriverState *bs, uint64_t start_sect, /* * get_cluster_offset * - * For a given offset of the disk image, return cluster offset in - * qcow2 file. + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. * * on entry, *num is the number of contiguous clusters we'd like to * access following offset. * * on exit, *num is the number of contiguous clusters we can read. * - * Return 1, if the offset is found - * Return 0, otherwise. + * Return 0, if the offset is found + * Return -errno, otherwise. * */ -uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num) +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset) { BDRVQcowState *s = bs->opaque; - int l1_index, l2_index; - uint64_t l2_offset, *l2_table, cluster_offset; + unsigned int l1_index, l2_index; + uint64_t l2_offset, *l2_table; int l1_bits, c; - int index_in_cluster, nb_available, nb_needed, nb_clusters; + unsigned int index_in_cluster, nb_clusters; + uint64_t nb_available, nb_needed; + int ret; index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); nb_needed = *num + index_in_cluster; @@ -409,7 +457,7 @@ uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, * the end of the l1 entry */ - nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1)); + nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); /* compute the number of available sectors */ @@ -419,7 +467,7 @@ uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, nb_needed = nb_available; } - cluster_offset = 0; + *cluster_offset = 0; /* seek the the l2 offset in the l1 table */ @@ -437,17 +485,18 @@ uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, /* load the l2 table in memory */ l2_offset &= ~QCOW_OFLAG_COPIED; - l2_table = l2_load(bs, l2_offset); - if (l2_table == NULL) - return 0; + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } /* find the cluster offset for the given disk offset */ l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); - cluster_offset = be64_to_cpu(l2_table[l2_index]); + *cluster_offset = be64_to_cpu(l2_table[l2_index]); nb_clusters = size_to_clusters(s, nb_needed << 9); - if (!cluster_offset) { + if (!*cluster_offset) { /* how many empty clusters ? */ c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); } else { @@ -463,7 +512,8 @@ out: *num = nb_available - index_in_cluster; - return cluster_offset & ~QCOW_OFLAG_COPIED; + *cluster_offset &=~QCOW_OFLAG_COPIED; + return 0; } /* @@ -475,24 +525,27 @@ out: * the l2 table offset in the qcow2 file and the cluster index * in the l2 table are given to the caller. * + * Returns 0 on success, -errno in failure case */ - static int get_cluster_table(BlockDriverState *bs, uint64_t offset, uint64_t **new_l2_table, uint64_t *new_l2_offset, int *new_l2_index) { BDRVQcowState *s = bs->opaque; - int l1_index, l2_index, ret; - uint64_t l2_offset, *l2_table; + unsigned int l1_index, l2_index; + uint64_t l2_offset; + uint64_t *l2_table = NULL; + int ret; /* seek the the l2 offset in the l1 table */ l1_index = offset >> (s->l2_bits + s->cluster_bits); if (l1_index >= s->l1_size) { ret = qcow2_grow_l1_table(bs, l1_index + 1); - if (ret < 0) - return 0; + if (ret < 0) { + return ret; + } } l2_offset = s->l1_table[l1_index]; @@ -501,15 +554,17 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, if (l2_offset & QCOW_OFLAG_COPIED) { /* load the l2 table in memory */ l2_offset &= ~QCOW_OFLAG_COPIED; - l2_table = l2_load(bs, l2_offset); - if (l2_table == NULL) - return 0; + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } } else { if (l2_offset) qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t)); - l2_table = l2_allocate(bs, l1_index); - if (l2_table == NULL) - return 0; + ret = l2_allocate(bs, l1_index, &l2_table); + if (ret < 0) { + return ret; + } l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED; } @@ -521,7 +576,7 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, *new_l2_offset = l2_offset; *new_l2_index = l2_index; - return 1; + return 0; } /* @@ -543,12 +598,14 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; int l2_index, ret; - uint64_t l2_offset, *l2_table, cluster_offset; + uint64_t l2_offset, *l2_table; + int64_t cluster_offset; int nb_csectors; ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); - if (ret == 0) + if (ret < 0) { return 0; + } cluster_offset = be64_to_cpu(l2_table[l2_index]); if (cluster_offset & QCOW_OFLAG_COPIED) @@ -558,6 +615,10 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, qcow2_free_any_clusters(bs, cluster_offset, 1); cluster_offset = qcow2_alloc_bytes(bs, compressed_size); + if (cluster_offset < 0) { + return 0; + } + nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - (cluster_offset >> 9); @@ -568,11 +629,12 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, /* compressed clusters never have the copied flag */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); l2_table[l2_index] = cpu_to_be64(cluster_offset); - if (bdrv_pwrite(s->hd, + if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(uint64_t), l2_table + l2_index, - sizeof(uint64_t)) != sizeof(uint64_t)) + sizeof(uint64_t)) < 0) return 0; return cluster_offset; @@ -583,29 +645,31 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, * read-modify-write in bdrv_pwrite */ #define L2_ENTRIES_PER_SECTOR (512 / 8) -static int write_l2_entries(BDRVQcowState *s, uint64_t *l2_table, +static int write_l2_entries(BlockDriverState *bs, uint64_t *l2_table, uint64_t l2_offset, int l2_index, int num) { int l2_start_index = l2_index & ~(L1_ENTRIES_PER_SECTOR - 1); int start_offset = (8 * l2_index) & ~511; int end_offset = (8 * (l2_index + num) + 511) & ~511; size_t len = end_offset - start_offset; + int ret; - if (bdrv_pwrite(s->hd, l2_offset + start_offset, &l2_table[l2_start_index], - len) != len) - { - return -1; + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); + ret = bdrv_pwrite_sync(bs->file, l2_offset + start_offset, + &l2_table[l2_start_index], len); + if (ret < 0) { + return ret; } return 0; } -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, - QCowL2Meta *m) +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) { BDRVQcowState *s = bs->opaque; int i, j = 0, l2_index, ret; uint64_t *old_cluster, start_sect, l2_offset, *l2_table; + uint64_t cluster_offset = m->cluster_offset; if (m->nb_clusters == 0) return 0; @@ -628,10 +692,11 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, goto err; } - ret = -EIO; /* update L2 table */ - if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index)) + ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index); + if (ret < 0) { goto err; + } for (i = 0; i < m->nb_clusters; i++) { /* if two concurrent writes happen to the same unallocated cluster @@ -647,8 +712,9 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); } - if (write_l2_entries(s, l2_table, l2_offset, l2_index, m->nb_clusters) < 0) { - ret = -1; + ret = write_l2_entries(bs, l2_table, l2_offset, l2_index, m->nb_clusters); + if (ret < 0) { + qcow2_l2_cache_reset(bs); goto err; } @@ -665,29 +731,36 @@ err: /* * alloc_cluster_offset * - * For a given offset of the disk image, return cluster offset in - * qcow2 file. - * + * For a given offset of the disk image, return cluster offset in qcow2 file. * If the offset is not found, allocate a new cluster. * - * Return the cluster offset if successful, - * Return 0, otherwise. + * If the cluster was already allocated, m->nb_clusters is set to 0, + * m->depends_on is set to NULL and the other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. This may be 0 if the request + * conflict with another write request in flight; in this case, m->depends_on + * is set and the remaining fields of m are meaningless. + * + * If m->nb_clusters is non-zero, the other fields of m are valid and contain + * information about the first allocated cluster. * + * Return 0 on success and -errno in error cases */ - -uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, - uint64_t offset, - int n_start, int n_end, - int *num, QCowL2Meta *m) +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, QCowL2Meta *m) { BDRVQcowState *s = bs->opaque; int l2_index, ret; - uint64_t l2_offset, *l2_table, cluster_offset; - int nb_clusters, i = 0; + uint64_t l2_offset, *l2_table; + int64_t cluster_offset; + unsigned int nb_clusters, i = 0; + QCowL2Meta *old_alloc; ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index); - if (ret == 0) - return 0; + if (ret < 0) { + return ret; + } nb_clusters = size_to_clusters(s, n_end << 9); @@ -703,6 +776,7 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, cluster_offset &= ~QCOW_OFLAG_COPIED; m->nb_clusters = 0; + m->depends_on = NULL; goto out; } @@ -717,12 +791,15 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, while (i < nb_clusters) { i += count_contiguous_clusters(nb_clusters - i, s->cluster_size, &l2_table[l2_index], i, 0); - - if(be64_to_cpu(l2_table[l2_index + i])) + if ((i >= nb_clusters) || be64_to_cpu(l2_table[l2_index + i])) { break; + } i += count_contiguous_free_clusters(nb_clusters - i, &l2_table[l2_index + i]); + if (i >= nb_clusters) { + break; + } cluster_offset = be64_to_cpu(l2_table[l2_index + i]); @@ -730,8 +807,41 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, (cluster_offset & QCOW_OFLAG_COMPRESSED)) break; } + assert(i <= nb_clusters); nb_clusters = i; + /* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + */ + QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + + uint64_t end_offset = offset + nb_clusters * s->cluster_size; + uint64_t old_offset = old_alloc->offset; + uint64_t old_end_offset = old_alloc->offset + + old_alloc->nb_clusters * s->cluster_size; + + if (end_offset < old_offset || offset > old_end_offset) { + /* No intersection */ + } else { + if (offset < old_offset) { + /* Stop at the start of a running allocation */ + nb_clusters = (old_offset - offset) >> s->cluster_bits; + } else { + nb_clusters = 0; + } + + if (nb_clusters == 0) { + /* Set dependency and wait for a callback */ + m->depends_on = old_alloc; + m->nb_clusters = 0; + *num = 0; + return 0; + } + } + } + if (!nb_clusters) { abort(); } @@ -741,6 +851,10 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, /* allocate a new cluster */ cluster_offset = qcow2_alloc_clusters(bs, nb_clusters * s->cluster_size); + if (cluster_offset < 0) { + QLIST_REMOVE(m, next_in_flight); + return cluster_offset; + } /* save info needed for meta data update */ m->offset = offset; @@ -749,10 +863,11 @@ uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, out: m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end); + m->cluster_offset = cluster_offset; *num = m->nb_available - n_start; - return cluster_offset; + return 0; } static int decompress_buffer(uint8_t *out_buf, int out_buf_size, @@ -782,8 +897,9 @@ static int decompress_buffer(uint8_t *out_buf, int out_buf_size, return 0; } -int qcow2_decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset) +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) { + BDRVQcowState *s = bs->opaque; int ret, csize, nb_csectors, sector_offset; uint64_t coffset; @@ -792,7 +908,8 @@ int qcow2_decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset) nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; sector_offset = coffset & 511; csize = nb_csectors * 512 - sector_offset; - ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors); + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); if (ret < 0) { return -1; } diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index dd6e293..4c19e7e 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -27,10 +27,32 @@ #include "block/qcow2.h" static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); -static int update_refcount(BlockDriverState *bs, +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, int addend); + +static int cache_refcount_updates = 0; + +static int write_refcount_block(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + size_t size = s->cluster_size; + + if (s->refcount_block_cache_offset == 0) { + return 0; + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE); + if (bdrv_pwrite_sync(bs->file, s->refcount_block_cache_offset, + s->refcount_block_cache, size) < 0) + { + return -EIO; + } + + return 0; +} + /*********************************************************/ /* refcount handling */ @@ -43,7 +65,8 @@ int qcow2_refcount_init(BlockDriverState *bs) refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); s->refcount_table = qemu_malloc(refcount_table_size2); if (s->refcount_table_size > 0) { - ret = bdrv_pread(s->hd, s->refcount_table_offset, + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); + ret = bdrv_pread(bs->file, s->refcount_table_offset, s->refcount_table, refcount_table_size2); if (ret != refcount_table_size2) goto fail; @@ -68,19 +91,36 @@ static int load_refcount_block(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; int ret; - ret = bdrv_pread(s->hd, refcount_block_offset, s->refcount_block_cache, + + if (cache_refcount_updates) { + ret = write_refcount_block(bs); + if (ret < 0) { + return ret; + } + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); + ret = bdrv_pread(bs->file, refcount_block_offset, s->refcount_block_cache, s->cluster_size); - if (ret != s->cluster_size) - return -EIO; + if (ret < 0) { + return ret; + } + s->refcount_block_cache_offset = refcount_block_offset; return 0; } +/* + * Returns the refcount of the cluster given by its index. Any non-negative + * return value is the refcount of the cluster, negative values are -errno + * and indicate an error. + */ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; int refcount_table_index, block_index; int64_t refcount_block_offset; + int ret; refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); if (refcount_table_index >= s->refcount_table_size) @@ -90,163 +130,351 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) return 0; if (refcount_block_offset != s->refcount_block_cache_offset) { /* better than nothing: return allocated if read error */ - if (load_refcount_block(bs, refcount_block_offset) < 0) - return 1; + ret = load_refcount_block(bs, refcount_block_offset); + if (ret < 0) { + return ret; + } } block_index = cluster_index & ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); return be16_to_cpu(s->refcount_block_cache[block_index]); } -static int grow_refcount_table(BlockDriverState *bs, int min_size) +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. + */ +static unsigned int next_refcount_table_size(BDRVQcowState *s, + unsigned int min_size) { - BDRVQcowState *s = bs->opaque; - int new_table_size, new_table_size2, refcount_table_clusters, i, ret; - uint64_t *new_table; - int64_t table_offset; - uint8_t data[12]; - int old_table_size; - int64_t old_table_offset; + unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; + unsigned int refcount_table_clusters = + MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); - if (min_size <= s->refcount_table_size) - return 0; - /* compute new table size */ - refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); - for(;;) { - if (refcount_table_clusters == 0) { - refcount_table_clusters = 1; - } else { - refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; - } - new_table_size = refcount_table_clusters << (s->cluster_bits - 3); - if (min_size <= new_table_size) - break; + while (min_clusters > refcount_table_clusters) { + refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; } -#ifdef DEBUG_ALLOC2 - printf("grow_refcount_table from %d to %d\n", - s->refcount_table_size, - new_table_size); -#endif - new_table_size2 = new_table_size * sizeof(uint64_t); - new_table = qemu_mallocz(new_table_size2); - memcpy(new_table, s->refcount_table, - s->refcount_table_size * sizeof(uint64_t)); - for(i = 0; i < s->refcount_table_size; i++) - cpu_to_be64s(&new_table[i]); - /* Note: we cannot update the refcount now to avoid recursion */ - table_offset = alloc_clusters_noref(bs, new_table_size2); - ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2); - if (ret != new_table_size2) - goto fail; - for(i = 0; i < s->refcount_table_size; i++) - be64_to_cpus(&new_table[i]); - - cpu_to_be64w((uint64_t*)data, table_offset); - cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters); - if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset), - data, sizeof(data)) != sizeof(data)) - goto fail; - qemu_free(s->refcount_table); - old_table_offset = s->refcount_table_offset; - old_table_size = s->refcount_table_size; - s->refcount_table = new_table; - s->refcount_table_size = new_table_size; - s->refcount_table_offset = table_offset; - update_refcount(bs, table_offset, new_table_size2, 1); - qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t)); - return 0; - fail: - qcow2_free_clusters(bs, table_offset, new_table_size2); - qemu_free(new_table); - return -EIO; + return refcount_table_clusters << (s->cluster_bits - 3); } +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, + uint64_t offset_b) +{ + uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT); + + return (block_a == block_b); +} + +/* + * Loads a refcount block. If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). + * + * Returns the offset of the refcount block on success or -errno in error case + */ static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; - int64_t offset, refcount_block_offset; - int ret, refcount_table_index; - uint64_t data64; + unsigned int refcount_table_index; + int ret; - /* Find L1 index and grow refcount table if needed */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Find the refcount block for the given cluster */ refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); - if (refcount_table_index >= s->refcount_table_size) { - ret = grow_refcount_table(bs, refcount_table_index + 1); - if (ret < 0) + + if (refcount_table_index < s->refcount_table_size) { + + uint64_t refcount_block_offset = + s->refcount_table[refcount_table_index]; + + /* If it's already there, we're done */ + if (refcount_block_offset) { + if (refcount_block_offset != s->refcount_block_cache_offset) { + ret = load_refcount_block(bs, refcount_block_offset); + if (ret < 0) { + return ret; + } + } + return refcount_block_offset; + } + } + + /* + * If we came here, we need to allocate something. Something is at least + * a cluster for the new refcount block. It may also include a new refcount + * table if the old refcount table is too small. + * + * Note that allocating clusters here needs some special care: + * + * - We can't use the normal qcow2_alloc_clusters(), it would try to + * increase the refcount and very likely we would end up with an endless + * recursion. Instead we must place the refcount blocks in a way that + * they can describe them themselves. + * + * - We need to consider that at this point we are inside update_refcounts + * and doing the initial refcount increase. This means that some clusters + * have already been allocated by the caller, but their refcount isn't + * accurate yet. free_cluster_index tells us where this allocation ends + * as long as we don't overwrite it by freeing clusters. + * + * - alloc_clusters_noref and qcow2_free_clusters may load a different + * refcount block into the cache + */ + + if (cache_refcount_updates) { + ret = write_refcount_block(bs); + if (ret < 0) { return ret; + } } - /* Load or allocate the refcount block */ - refcount_block_offset = s->refcount_table[refcount_table_index]; - if (!refcount_block_offset) { - /* create a new refcount block */ - /* Note: we cannot update the refcount now to avoid recursion */ - offset = alloc_clusters_noref(bs, s->cluster_size); + /* Allocate the refcount block itself and mark it as used */ + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + if (new_block < 0) { + return new_block; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 + " at %" PRIx64 "\n", + refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + + if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { + /* Zero the new refcount block before updating it */ memset(s->refcount_block_cache, 0, s->cluster_size); - ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size); - if (ret != s->cluster_size) - return -EINVAL; - s->refcount_table[refcount_table_index] = offset; - data64 = cpu_to_be64(offset); - ret = bdrv_pwrite(s->hd, s->refcount_table_offset + - refcount_table_index * sizeof(uint64_t), - &data64, sizeof(data64)); - if (ret != sizeof(data64)) - return -EINVAL; - - refcount_block_offset = offset; - s->refcount_block_cache_offset = offset; - update_refcount(bs, offset, s->cluster_size, 1); + s->refcount_block_cache_offset = new_block; + + /* The block describes itself, need to update the cache */ + int block_index = (new_block >> s->cluster_bits) & + ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1); + s->refcount_block_cache[block_index] = cpu_to_be16(1); } else { - if (refcount_block_offset != s->refcount_block_cache_offset) { - if (load_refcount_block(bs, refcount_block_offset) < 0) - return -EIO; + /* Described somewhere else. This can recurse at most twice before we + * arrive at a block that describes itself. */ + ret = update_refcount(bs, new_block, s->cluster_size, 1); + if (ret < 0) { + goto fail_block; + } + + /* Initialize the new refcount block only after updating its refcount, + * update_refcount uses the refcount cache itself */ + memset(s->refcount_block_cache, 0, s->cluster_size); + s->refcount_block_cache_offset = new_block; + } + + /* Now the new refcount block needs to be written to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); + ret = bdrv_pwrite_sync(bs->file, new_block, s->refcount_block_cache, + s->cluster_size); + if (ret < 0) { + goto fail_block; + } + + /* If the refcount table is big enough, just hook the block up there */ + if (refcount_table_index < s->refcount_table_size) { + uint64_t data64 = cpu_to_be64(new_block); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); + ret = bdrv_pwrite_sync(bs->file, + s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), + &data64, sizeof(data64)); + if (ret < 0) { + goto fail_block; } + + s->refcount_table[refcount_table_index] = new_block; + return new_block; + } + + /* + * If we come here, we need to grow the refcount table. Again, a new + * refcount table needs some space and we can't simply allocate to avoid + * endless recursion. + * + * Therefore let's grab new refcount blocks at the end of the image, which + * will describe themselves and the new refcount table. This way we can + * reference them only in the new table and do the switch to the new + * refcount table at once without producing an inconsistent state in + * between. + */ + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + + /* Calculate the number of refcount blocks needed so far */ + uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); + uint64_t blocks_used = (s->free_cluster_index + + refcount_block_clusters - 1) / refcount_block_clusters; + + /* And now we need at least one block more for the new metadata */ + uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); + uint64_t last_table_size; + uint64_t blocks_clusters; + do { + uint64_t table_clusters = size_to_clusters(s, table_size); + blocks_clusters = 1 + + ((table_clusters + refcount_block_clusters - 1) + / refcount_block_clusters); + uint64_t meta_clusters = table_clusters + blocks_clusters; + + last_table_size = table_size; + table_size = next_refcount_table_size(s, blocks_used + + ((meta_clusters + refcount_block_clusters - 1) + / refcount_block_clusters)); + + } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", + s->refcount_table_size, table_size); +#endif + + /* Create the new refcount table and blocks */ + uint64_t meta_offset = (blocks_used * refcount_block_clusters) * + s->cluster_size; + uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; + uint16_t *new_blocks = qemu_mallocz(blocks_clusters * s->cluster_size); + uint64_t *new_table = qemu_mallocz(table_size * sizeof(uint64_t)); + + assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); + + /* Fill the new refcount table */ + memcpy(new_table, s->refcount_table, + s->refcount_table_size * sizeof(uint64_t)); + new_table[refcount_table_index] = new_block; + + int i; + for (i = 0; i < blocks_clusters; i++) { + new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); + } + + /* Fill the refcount blocks */ + uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); + int block = 0; + for (i = 0; i < table_clusters + blocks_clusters; i++) { + new_blocks[block++] = cpu_to_be16(1); + } + + /* Write refcount blocks to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); + ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, + blocks_clusters * s->cluster_size); + qemu_free(new_blocks); + if (ret < 0) { + goto fail_table; + } + + /* Write refcount table to disk */ + for(i = 0; i < table_size; i++) { + cpu_to_be64s(&new_table[i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); + ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, + table_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail_table; + } + + for(i = 0; i < table_size; i++) { + cpu_to_be64s(&new_table[i]); + } + + /* Hook up the new refcount table in the qcow2 header */ + uint8_t data[12]; + cpu_to_be64w((uint64_t*)data, table_offset); + cpu_to_be32w((uint32_t*)(data + 8), table_clusters); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), + data, sizeof(data)); + if (ret < 0) { + goto fail_table; } - return refcount_block_offset; + /* And switch it in memory */ + uint64_t old_table_offset = s->refcount_table_offset; + uint64_t old_table_size = s->refcount_table_size; + + qemu_free(s->refcount_table); + s->refcount_table = new_table; + s->refcount_table_size = table_size; + s->refcount_table_offset = table_offset; + + /* Free old table. Remember, we must not change free_cluster_index */ + uint64_t old_free_cluster_index = s->free_cluster_index; + qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t)); + s->free_cluster_index = old_free_cluster_index; + + ret = load_refcount_block(bs, new_block); + if (ret < 0) { + goto fail_block; + } + + return new_block; + +fail_table: + qemu_free(new_table); +fail_block: + s->refcount_block_cache_offset = 0; + return ret; } #define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT) -static int write_refcount_block_entries(BDRVQcowState *s, +static int write_refcount_block_entries(BlockDriverState *bs, int64_t refcount_block_offset, int first_index, int last_index) { + BDRVQcowState *s = bs->opaque; size_t size; + int ret; + + if (cache_refcount_updates) { + return 0; + } + + if (first_index < 0) { + return 0; + } first_index &= ~(REFCOUNTS_PER_SECTOR - 1); last_index = (last_index + REFCOUNTS_PER_SECTOR) & ~(REFCOUNTS_PER_SECTOR - 1); size = (last_index - first_index) << REFCOUNT_SHIFT; - if (bdrv_pwrite(s->hd, + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); + ret = bdrv_pwrite_sync(bs->file, refcount_block_offset + (first_index << REFCOUNT_SHIFT), - &s->refcount_block_cache[first_index], size) != size) - { - return -EIO; + &s->refcount_block_cache[first_index], size); + if (ret < 0) { + return ret; } return 0; } /* XXX: cache several refcount block clusters ? */ -static int update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, - int addend) +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, int addend) { BDRVQcowState *s = bs->opaque; int64_t start, last, cluster_offset; int64_t refcount_block_offset = 0; int64_t table_index = -1, old_table_index; int first_index = -1, last_index = -1; + int ret; #ifdef DEBUG_ALLOC2 - printf("update_refcount: offset=%lld size=%lld addend=%d\n", + printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n", offset, length, addend); #endif - if (length <= 0) + if (length < 0) { return -EINVAL; + } else if (length == 0) { + return 0; + } + start = offset & ~(s->cluster_size - 1); last = (offset + length - 1) & ~(s->cluster_size - 1); for(cluster_offset = start; cluster_offset <= last; @@ -254,16 +482,17 @@ static int update_refcount(BlockDriverState *bs, { int block_index, refcount; int64_t cluster_index = cluster_offset >> s->cluster_bits; + int64_t new_block; /* Only write refcount block to disk when we are done with it */ old_table_index = table_index; table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); if ((old_table_index >= 0) && (table_index != old_table_index)) { - if (write_refcount_block_entries(s, refcount_block_offset, - first_index, last_index) < 0) - { - return -EIO; + ret = write_refcount_block_entries(bs, refcount_block_offset, + first_index, last_index); + if (ret < 0) { + return ret; } first_index = -1; @@ -271,10 +500,12 @@ static int update_refcount(BlockDriverState *bs, } /* Load the refcount block and allocate it if needed */ - refcount_block_offset = alloc_refcount_block(bs, cluster_index); - if (refcount_block_offset < 0) { - return refcount_block_offset; + new_block = alloc_refcount_block(bs, cluster_index); + if (new_block < 0) { + ret = new_block; + goto fail; } + refcount_block_offset = new_block; /* we can update the count and save it */ block_index = cluster_index & @@ -288,27 +519,48 @@ static int update_refcount(BlockDriverState *bs, refcount = be16_to_cpu(s->refcount_block_cache[block_index]); refcount += addend; - if (refcount < 0 || refcount > 0xffff) - return -EINVAL; + if (refcount < 0 || refcount > 0xffff) { + ret = -EINVAL; + goto fail; + } if (refcount == 0 && cluster_index < s->free_cluster_index) { s->free_cluster_index = cluster_index; } s->refcount_block_cache[block_index] = cpu_to_be16(refcount); } + ret = 0; +fail: + /* Write last changed block to disk */ if (refcount_block_offset != 0) { - if (write_refcount_block_entries(s, refcount_block_offset, - first_index, last_index) < 0) - { - return -EIO; + int wret; + wret = write_refcount_block_entries(bs, refcount_block_offset, + first_index, last_index); + if (wret < 0) { + return ret < 0 ? ret : wret; } } - return 0; + /* + * Try do undo any updates if an error is returned (This may succeed in + * some cases like ENOSPC for allocating a new refcount block) + */ + if (ret < 0) { + int dummy; + dummy = update_refcount(bs, offset, cluster_offset - offset, -addend); + } + + return ret; } -/* addend must be 1 or -1 */ +/* + * Increases or decreases the refcount of a given cluster by one. + * addend must be 1 or -1. + * + * If the return value is non-negative, it is the new refcount of the cluster. + * If it is negative, it is -errno and indicates an error. + */ static int update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, int addend) @@ -335,17 +587,22 @@ static int update_cluster_refcount(BlockDriverState *bs, static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) { BDRVQcowState *s = bs->opaque; - int i, nb_clusters; + int i, nb_clusters, refcount; nb_clusters = size_to_clusters(s, size); retry: for(i = 0; i < nb_clusters; i++) { - int64_t i = s->free_cluster_index++; - if (get_refcount(bs, i) != 0) + int64_t next_cluster_index = s->free_cluster_index++; + refcount = get_refcount(bs, next_cluster_index); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { goto retry; + } } #ifdef DEBUG_ALLOC2 - printf("alloc_clusters: size=%lld -> %lld\n", + printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", size, (s->free_cluster_index - nb_clusters) << s->cluster_bits); #endif @@ -355,9 +612,18 @@ retry: int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) { int64_t offset; + int ret; + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); offset = alloc_clusters_noref(bs, size); - update_refcount(bs, offset, size, 1); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1); + if (ret < 0) { + return ret; + } return offset; } @@ -369,9 +635,13 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) int64_t offset, cluster_offset; int free_in_cluster; + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); assert(size > 0 && size <= s->cluster_size); if (s->free_byte_offset == 0) { s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (s->free_byte_offset < 0) { + return s->free_byte_offset; + } } redo: free_in_cluster = s->cluster_size - @@ -387,6 +657,9 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) update_cluster_refcount(bs, offset >> s->cluster_bits, 1); } else { offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + return offset; + } cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); if ((cluster_offset + s->cluster_size) == offset) { /* we are lucky: contiguous data */ @@ -404,7 +677,14 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) void qcow2_free_clusters(BlockDriverState *bs, int64_t offset, int64_t size) { - update_refcount(bs, offset, size, -1); + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); + ret = update_refcount(bs, offset, size, -1); + if (ret < 0) { + fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); + /* TODO Remember the clusters to free them later and avoid leaking */ + } } /* @@ -471,15 +751,19 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount; qcow2_l2_cache_reset(bs); + cache_refcount_updates = 1; l2_table = NULL; l1_table = NULL; l1_size2 = l1_size * sizeof(uint64_t); - l1_allocated = 0; if (l1_table_offset != s->l1_table_offset) { - l1_table = qemu_malloc(l1_size2); + if (l1_size2 != 0) { + l1_table = qemu_mallocz(align_offset(l1_size2, 512)); + } else { + l1_table = NULL; + } l1_allocated = 1; - if (bdrv_pread(s->hd, l1_table_offset, + if (bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2) != l1_size2) goto fail; for(i = 0;i < l1_size; i++) @@ -499,7 +783,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, old_l2_offset = l2_offset; l2_offset &= ~QCOW_OFLAG_COPIED; l2_modified = 0; - if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size) + if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) goto fail; for(j = 0; j < s->l2_size; j++) { offset = be64_to_cpu(l2_table[j]); @@ -509,9 +793,15 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (offset & QCOW_OFLAG_COMPRESSED) { nb_csectors = ((offset >> s->csize_shift) & s->csize_mask) + 1; - if (addend != 0) - update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, - nb_csectors * 512, addend); + if (addend != 0) { + int ret; + ret = update_refcount(bs, + (offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512, addend); + if (ret < 0) { + goto fail; + } + } /* compressed clusters are never modified */ refcount = 2; } else { @@ -520,6 +810,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } else { refcount = get_refcount(bs, offset >> s->cluster_bits); } + + if (refcount < 0) { + goto fail; + } } if (refcount == 1) { @@ -532,8 +826,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } } if (l2_modified) { - if (bdrv_pwrite(s->hd, - l2_offset, l2_table, l2_size) != l2_size) + if (bdrv_pwrite_sync(bs->file, + l2_offset, l2_table, l2_size) < 0) goto fail; } @@ -542,7 +836,9 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } else { refcount = get_refcount(bs, l2_offset >> s->cluster_bits); } - if (refcount == 1) { + if (refcount < 0) { + goto fail; + } else if (refcount == 1) { l2_offset |= QCOW_OFLAG_COPIED; } if (l2_offset != old_l2_offset) { @@ -554,8 +850,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (l1_modified) { for(i = 0; i < l1_size; i++) cpu_to_be64s(&l1_table[i]); - if (bdrv_pwrite(s->hd, l1_table_offset, l1_table, - l1_size2) != l1_size2) + if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, + l1_size2) < 0) goto fail; for(i = 0; i < l1_size; i++) be64_to_cpus(&l1_table[i]); @@ -563,11 +859,15 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (l1_allocated) qemu_free(l1_table); qemu_free(l2_table); + cache_refcount_updates = 0; + write_refcount_block(bs); return 0; fail: if (l1_allocated) qemu_free(l1_table); qemu_free(l2_table); + cache_refcount_updates = 0; + write_refcount_block(bs); return -EIO; } @@ -584,9 +884,10 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, * This is used to construct a temporary refcount table out of L1 and L2 tables * which can be compared the the refcount table saved in the image. * - * Returns the number of errors in the image that were found + * Modifies the number of errors in res. */ -static int inc_refcounts(BlockDriverState *bs, +static void inc_refcounts(BlockDriverState *bs, + BdrvCheckResult *res, uint16_t *refcount_table, int refcount_table_size, int64_t offset, int64_t size) @@ -594,30 +895,32 @@ static int inc_refcounts(BlockDriverState *bs, BDRVQcowState *s = bs->opaque; int64_t start, last, cluster_offset; int k; - int errors = 0; if (size <= 0) - return 0; + return; start = offset & ~(s->cluster_size - 1); last = (offset + size - 1) & ~(s->cluster_size - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { k = cluster_offset >> s->cluster_bits; - if (k < 0 || k >= refcount_table_size) { + if (k < 0) { fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", cluster_offset); - errors++; + res->corruptions++; + } else if (k >= refcount_table_size) { + fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " + "the end of the image file, can't properly check refcounts.\n", + cluster_offset); + res->check_errors++; } else { if (++refcount_table[k] == 0) { fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 "\n", cluster_offset); - errors++; + res->corruptions++; } } } - - return errors; } /* @@ -628,20 +931,19 @@ static int inc_refcounts(BlockDriverState *bs, * Returns the number of errors found by the checks or -errno if an internal * error occurred. */ -static int check_refcounts_l2(BlockDriverState *bs, +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset, int check_copied) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table, offset; int i, l2_size, nb_csectors, refcount; - int errors = 0; /* Read L2 table from disk */ l2_size = s->l2_size * sizeof(uint64_t); l2_table = qemu_malloc(l2_size); - if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size) + if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size) goto fail; /* Do the actual checks */ @@ -655,50 +957,53 @@ static int check_refcounts_l2(BlockDriverState *bs, "copied flag must never be set for compressed " "clusters\n", offset >> s->cluster_bits); offset &= ~QCOW_OFLAG_COPIED; - errors++; + res->corruptions++; } /* Mark cluster as used */ nb_csectors = ((offset >> s->csize_shift) & s->csize_mask) + 1; offset &= s->cluster_offset_mask; - errors += inc_refcounts(bs, refcount_table, - refcount_table_size, - offset & ~511, nb_csectors * 512); + inc_refcounts(bs, res, refcount_table, refcount_table_size, + offset & ~511, nb_csectors * 512); } else { /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ if (check_copied) { uint64_t entry = offset; offset &= ~QCOW_OFLAG_COPIED; refcount = get_refcount(bs, offset >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for offset %" + PRIx64 ": %s\n", entry, strerror(-refcount)); + goto fail; + } if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) { fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" PRIx64 " refcount=%d\n", entry, refcount); - errors++; + res->corruptions++; } } /* Mark cluster as used */ offset &= ~QCOW_OFLAG_COPIED; - errors += inc_refcounts(bs, refcount_table, - refcount_table_size, - offset, s->cluster_size); + inc_refcounts(bs, res, refcount_table,refcount_table_size, + offset, s->cluster_size); /* Correct offsets are cluster aligned */ if (offset & (s->cluster_size - 1)) { fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " "properly aligned; L2 entry corrupted.\n", offset); - errors++; + res->corruptions++; } } } } qemu_free(l2_table); - return errors; + return 0; fail: - fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); qemu_free(l2_table); return -EIO; } @@ -712,6 +1017,7 @@ fail: * error occurred. */ static int check_refcounts_l1(BlockDriverState *bs, + BdrvCheckResult *res, uint16_t *refcount_table, int refcount_table_size, int64_t l1_table_offset, int l1_size, @@ -720,21 +1026,24 @@ static int check_refcounts_l1(BlockDriverState *bs, BDRVQcowState *s = bs->opaque; uint64_t *l1_table, l2_offset, l1_size2; int i, refcount, ret; - int errors = 0; l1_size2 = l1_size * sizeof(uint64_t); /* Mark L1 table as used */ - errors += inc_refcounts(bs, refcount_table, refcount_table_size, - l1_table_offset, l1_size2); + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); /* Read L1 table entries from disk */ - l1_table = qemu_malloc(l1_size2); - if (bdrv_pread(s->hd, l1_table_offset, - l1_table, l1_size2) != l1_size2) - goto fail; - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); + if (l1_size2 == 0) { + l1_table = NULL; + } else { + l1_table = qemu_malloc(l1_size2); + if (bdrv_pread(bs->file, l1_table_offset, + l1_table, l1_size2) != l1_size2) + goto fail; + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } /* Do the actual checks */ for(i = 0; i < l1_size; i++) { @@ -744,41 +1053,44 @@ static int check_refcounts_l1(BlockDriverState *bs, if (check_copied) { refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) >> s->cluster_bits); + if (refcount < 0) { + fprintf(stderr, "Can't get refcount for l2_offset %" + PRIx64 ": %s\n", l2_offset, strerror(-refcount)); + goto fail; + } if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 " refcount=%d\n", l2_offset, refcount); - errors++; + res->corruptions++; } } /* Mark L2 table as used */ l2_offset &= ~QCOW_OFLAG_COPIED; - errors += inc_refcounts(bs, refcount_table, - refcount_table_size, - l2_offset, - s->cluster_size); + inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_offset, s->cluster_size); /* L2 tables are cluster aligned */ if (l2_offset & (s->cluster_size - 1)) { fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " "cluster aligned; L1 entry corrupted\n", l2_offset); - errors++; + res->corruptions++; } /* Process and check L2 entries */ - ret = check_refcounts_l2(bs, refcount_table, refcount_table_size, - l2_offset, check_copied); + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, check_copied); if (ret < 0) { goto fail; } - errors += ret; } } qemu_free(l1_table); - return errors; + return 0; fail: fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; qemu_free(l1_table); return -EIO; } @@ -789,66 +1101,102 @@ fail: * Returns 0 if no errors are found, the number of errors in case the image is * detected as corrupted, and -errno when an internal error occured. */ -int qcow2_check_refcounts(BlockDriverState *bs) +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res) { BDRVQcowState *s = bs->opaque; int64_t size; int nb_clusters, refcount1, refcount2, i; QCowSnapshot *sn; uint16_t *refcount_table; - int ret, errors = 0; + int ret; - size = bdrv_getlength(s->hd); + size = bdrv_getlength(bs->file); nb_clusters = size_to_clusters(s, size); refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t)); /* header */ - errors += inc_refcounts(bs, refcount_table, nb_clusters, - 0, s->cluster_size); + inc_refcounts(bs, res, refcount_table, nb_clusters, + 0, s->cluster_size); /* current L1 table */ - ret = check_refcounts_l1(bs, refcount_table, nb_clusters, + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, s->l1_table_offset, s->l1_size, 1); if (ret < 0) { return ret; } - errors += ret; /* snapshots */ for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; - check_refcounts_l1(bs, refcount_table, nb_clusters, - sn->l1_table_offset, sn->l1_size, 0); + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + sn->l1_table_offset, sn->l1_size, 0); + if (ret < 0) { + return ret; + } } - errors += inc_refcounts(bs, refcount_table, nb_clusters, - s->snapshots_offset, s->snapshots_size); + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); /* refcount data */ - errors += inc_refcounts(bs, refcount_table, nb_clusters, - s->refcount_table_offset, - s->refcount_table_size * sizeof(uint64_t)); + inc_refcounts(bs, res, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + for(i = 0; i < s->refcount_table_size; i++) { - int64_t offset; + uint64_t offset, cluster; offset = s->refcount_table[i]; + cluster = offset >> s->cluster_bits; + + /* Refcount blocks are cluster aligned */ + if (offset & (s->cluster_size - 1)) { + fprintf(stderr, "ERROR refcount block %d is not " + "cluster aligned; refcount table entry corrupted\n", i); + res->corruptions++; + continue; + } + + if (cluster >= nb_clusters) { + fprintf(stderr, "ERROR refcount block %d is outside image\n", i); + res->corruptions++; + continue; + } + if (offset != 0) { - errors += inc_refcounts(bs, refcount_table, nb_clusters, - offset, s->cluster_size); + inc_refcounts(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); + if (refcount_table[cluster] != 1) { + fprintf(stderr, "ERROR refcount block %d refcount=%d\n", + i, refcount_table[cluster]); + res->corruptions++; + } } } /* compare ref counts */ for(i = 0; i < nb_clusters; i++) { refcount1 = get_refcount(bs, i); + if (refcount1 < 0) { + fprintf(stderr, "Can't get refcount for cluster %d: %s\n", + i, strerror(-refcount1)); + res->check_errors++; + continue; + } + refcount2 = refcount_table[i]; if (refcount1 != refcount2) { - fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n", + fprintf(stderr, "%s cluster %d refcount=%d reference=%d\n", + refcount1 < refcount2 ? "ERROR" : "Leaked", i, refcount1, refcount2); - errors++; + if (refcount1 < refcount2) { + res->corruptions++; + } else { + res->leaks++; + } } } qemu_free(refcount_table); - return errors; + return 0; } diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index e1e4d89..6228612 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -79,7 +79,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot)); for(i = 0; i < s->nb_snapshots; i++) { offset = align_offset(offset, 8); - if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h)) + if (bdrv_pread(bs->file, offset, &h, sizeof(h)) != sizeof(h)) goto fail; offset += sizeof(h); sn = s->snapshots + i; @@ -97,13 +97,13 @@ int qcow2_read_snapshots(BlockDriverState *bs) offset += extra_data_size; sn->id_str = qemu_malloc(id_str_size + 1); - if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size) + if (bdrv_pread(bs->file, offset, sn->id_str, id_str_size) != id_str_size) goto fail; offset += id_str_size; sn->id_str[id_str_size] = '\0'; sn->name = qemu_malloc(name_size + 1); - if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size) + if (bdrv_pread(bs->file, offset, sn->name, name_size) != name_size) goto fail; offset += name_size; sn->name[name_size] = '\0'; @@ -139,6 +139,9 @@ static int qcow_write_snapshots(BlockDriverState *bs) snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); offset = snapshots_offset; + if (offset < 0) { + return offset; + } for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; @@ -155,25 +158,25 @@ static int qcow_write_snapshots(BlockDriverState *bs) h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); - if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h)) + if (bdrv_pwrite_sync(bs->file, offset, &h, sizeof(h)) < 0) goto fail; offset += sizeof(h); - if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size) + if (bdrv_pwrite_sync(bs->file, offset, sn->id_str, id_str_size) < 0) goto fail; offset += id_str_size; - if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size) + if (bdrv_pwrite_sync(bs->file, offset, sn->name, name_size) < 0) goto fail; offset += name_size; } /* update the various header fields */ data64 = cpu_to_be64(snapshots_offset); - if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset), - &data64, sizeof(data64)) != sizeof(data64)) + if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, snapshots_offset), + &data64, sizeof(data64)) < 0) goto fail; data32 = cpu_to_be32(s->nb_snapshots); - if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots), - &data32, sizeof(data32)) != sizeof(data32)) + if (bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + &data32, sizeof(data32)) < 0) goto fail; /* free the old snapshot table */ @@ -235,6 +238,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) QCowSnapshot *snapshots1, sn1, *sn = &sn1; int i, ret; uint64_t *l1_table = NULL; + int64_t l1_table_offset; memset(sn, 0, sizeof(*sn)); @@ -263,16 +267,25 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto fail; /* create the L1 table of the snapshot */ - sn->l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + if (l1_table_offset < 0) { + goto fail; + } + + sn->l1_table_offset = l1_table_offset; sn->l1_size = s->l1_size; - l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); + if (s->l1_size != 0) { + l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); + } else { + l1_table = NULL; + } + for(i = 0; i < s->l1_size; i++) { l1_table[i] = cpu_to_be64(s->l1_table[i]); } - if (bdrv_pwrite(s->hd, sn->l1_table_offset, - l1_table, s->l1_size * sizeof(uint64_t)) != - (s->l1_size * sizeof(uint64_t))) + if (bdrv_pwrite_sync(bs->file, sn->l1_table_offset, + l1_table, s->l1_size * sizeof(uint64_t)) < 0) goto fail; qemu_free(l1_table); l1_table = NULL; @@ -288,7 +301,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (qcow_write_snapshots(bs) < 0) goto fail; #ifdef DEBUG_ALLOC - check_refcounts(bs); + qcow2_check_refcounts(bs); #endif return 0; fail: @@ -318,11 +331,11 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) s->l1_size = sn->l1_size; l1_size2 = s->l1_size * sizeof(uint64_t); /* copy the snapshot l1 table to the current l1 table */ - if (bdrv_pread(s->hd, sn->l1_table_offset, + if (bdrv_pread(bs->file, sn->l1_table_offset, s->l1_table, l1_size2) != l1_size2) goto fail; - if (bdrv_pwrite(s->hd, s->l1_table_offset, - s->l1_table, l1_size2) != l1_size2) + if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, + s->l1_table, l1_size2) < 0) goto fail; for(i = 0;i < s->l1_size; i++) { be64_to_cpus(&s->l1_table[i]); @@ -332,7 +345,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; #ifdef DEBUG_ALLOC - check_refcounts(bs); + qcow2_check_refcounts(bs); #endif return 0; fail: @@ -369,7 +382,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) return ret; } #ifdef DEBUG_ALLOC - check_refcounts(bs); + qcow2_check_refcounts(bs); #endif return 0; } diff --git a/block/qcow2.c b/block/qcow2.c index 5ca20b2..a53014d 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -44,10 +44,6 @@ - L2 tables have always a size of one cluster. */ -//#define DEBUG_ALLOC -//#define DEBUG_ALLOC2 -//#define DEBUG_EXT - typedef struct { uint32_t magic; @@ -56,8 +52,6 @@ typedef struct { #define QCOW_EXT_MAGIC_END 0 #define QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA - - static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) { const QCowHeader *cow_header = (const void *)buf; @@ -71,7 +65,7 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) } -/* +/* * read qcow2 extension and fill bs * start reading from start_offset * finish reading upon magic of value 0 or when end_offset reached @@ -81,7 +75,6 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset, uint64_t end_offset) { - BDRVQcowState *s = bs->opaque; QCowExtension ext; uint64_t offset; @@ -99,9 +92,10 @@ static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attemting to read extended header in offset %lu\n", offset); #endif - if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) { - fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n", - (unsigned long long)offset); + if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { + fprintf(stderr, "qcow_handle_extension: ERROR: " + "pread fail from offset %" PRIu64 "\n", + offset); return 1; } be32_to_cpus(&ext.magic); @@ -121,19 +115,19 @@ static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset, ext.len, sizeof(bs->backing_format)); return 2; } - if (bdrv_pread(s->hd, offset , bs->backing_format, + if (bdrv_pread(bs->file, offset , bs->backing_format, ext.len) != ext.len) return 3; bs->backing_format[ext.len] = '\0'; #ifdef DEBUG_EXT printf("Qcow2: Got format extension %s\n", bs->backing_format); #endif - offset += ((ext.len + 7) & ~7); + offset = ((offset + ext.len + 7) & ~7); break; default: /* unknown magic -- just skip it */ - offset += ((ext.len + 7) & ~7); + offset = ((offset + ext.len + 7) & ~7); break; } } @@ -142,25 +136,14 @@ static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset, } -static int qcow_open(BlockDriverState *bs, const char *filename, int flags) +static int qcow_open(BlockDriverState *bs, int flags) { BDRVQcowState *s = bs->opaque; - int len, i, shift, ret; + int len, i; QCowHeader header; uint64_t ext_end; - /* Performance is terrible right now with cache=writethrough due mainly - * to reference count updates. If the user does not explicitly specify - * a caching type, force to writeback caching. - */ - if ((flags & BDRV_O_CACHE_DEF)) { - flags |= BDRV_O_CACHE_WB; - flags &= ~BDRV_O_CACHE_DEF; - } - ret = bdrv_file_open(&s->hd, filename, flags); - if (ret < 0) - return ret; - if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header)) + if (bdrv_pread(bs->file, 0, &header, sizeof(header)) != sizeof(header)) goto fail; be32_to_cpus(&header.magic); be32_to_cpus(&header.version); @@ -178,8 +161,7 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION) goto fail; - if (header.size <= 1 || - header.cluster_bits < MIN_CLUSTER_BITS || + if (header.cluster_bits < MIN_CLUSTER_BITS || header.cluster_bits > MAX_CLUSTER_BITS) goto fail; if (header.crypt_method > QCOW_CRYPT_AES) @@ -205,19 +187,21 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) /* read the level 1 table */ s->l1_size = header.l1_size; - shift = s->cluster_bits + s->l2_bits; - s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift; + s->l1_vm_state_index = size_to_l1(s, header.size); /* the L1 table must contain at least enough entries to put header.size bytes */ if (s->l1_size < s->l1_vm_state_index) goto fail; s->l1_table_offset = header.l1_table_offset; - s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); - if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != - s->l1_size * sizeof(uint64_t)) - goto fail; - for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); + if (s->l1_size > 0) { + s->l1_table = qemu_mallocz( + align_offset(s->l1_size * sizeof(uint64_t), 512)); + if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != + s->l1_size * sizeof(uint64_t)) + goto fail; + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } } /* alloc L2 cache */ s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); @@ -245,7 +229,7 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) len = header.backing_file_size; if (len > 1023) len = 1023; - if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len) + if (bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len) != len) goto fail; bs->backing_file[len] = '\0'; } @@ -253,7 +237,7 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) goto fail; #ifdef DEBUG_ALLOC - check_refcounts(bs); + qcow2_check_refcounts(bs); #endif return 0; @@ -264,7 +248,6 @@ static int qcow_open(BlockDriverState *bs, const char *filename, int flags) qemu_free(s->l2_cache); qemu_free(s->cluster_cache); qemu_free(s->cluster_data); - bdrv_delete(s->hd); return -1; } @@ -314,9 +297,15 @@ static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { uint64_t cluster_offset; + int ret; *pnum = nb_sectors; - cluster_offset = qcow2_get_cluster_offset(bs, sector_num << 9, pnum); + /* FIXME We can get errors here, but the bdrv_is_allocated interface can't + * pass them on today */ + ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + if (ret < 0) { + *pnum = 0; + } return (cluster_offset != 0); } @@ -342,8 +331,8 @@ typedef struct QCowAIOCB { QEMUIOVector *qiov; uint8_t *buf; void *orig_buf; - int nb_sectors; - int n; + int remaining_sectors; + int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset; uint8_t *cluster_data; BlockDriverAIOCB *hd_aiocb; @@ -351,11 +340,12 @@ typedef struct QCowAIOCB { QEMUIOVector hd_qiov; QEMUBH *bh; QCowL2Meta l2meta; + QLIST_ENTRY(QCowAIOCB) next_depend; } QCowAIOCB; static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) { - QCowAIOCB *acb = (QCowAIOCB *)blockacb; + QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common); if (acb->hd_aiocb) bdrv_aio_cancel(acb->hd_aiocb); qemu_aio_release(acb); @@ -408,38 +398,43 @@ static void qcow_aio_read_cb(void *opaque, int ret) } else { if (s->crypt_method) { qcow2_encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf, - acb->n, 0, + acb->cur_nr_sectors, 0, &s->aes_decrypt_key); } } - acb->nb_sectors -= acb->n; - acb->sector_num += acb->n; - acb->buf += acb->n * 512; + acb->remaining_sectors -= acb->cur_nr_sectors; + acb->sector_num += acb->cur_nr_sectors; + acb->buf += acb->cur_nr_sectors * 512; - if (acb->nb_sectors == 0) { + if (acb->remaining_sectors == 0) { /* request completed */ ret = 0; goto done; } /* prepare next AIO request */ - acb->n = acb->nb_sectors; - acb->cluster_offset = - qcow2_get_cluster_offset(bs, acb->sector_num << 9, &acb->n); + acb->cur_nr_sectors = acb->remaining_sectors; + ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9, + &acb->cur_nr_sectors, &acb->cluster_offset); + if (ret < 0) { + goto done; + } + index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); if (!acb->cluster_offset) { if (bs->backing_hd) { /* read from the base image */ n1 = qcow2_backing_read1(bs->backing_hd, acb->sector_num, - acb->buf, acb->n); + acb->buf, acb->cur_nr_sectors); if (n1 > 0) { acb->hd_iov.iov_base = (void *)acb->buf; - acb->hd_iov.iov_len = acb->n * 512; + acb->hd_iov.iov_len = acb->cur_nr_sectors * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, - &acb->hd_qiov, acb->n, + &acb->hd_qiov, acb->cur_nr_sectors, qcow_aio_read_cb, acb); if (acb->hd_aiocb == NULL) goto done; @@ -450,17 +445,17 @@ static void qcow_aio_read_cb(void *opaque, int ret) } } else { /* Note: in this case, no need to wait */ - memset(acb->buf, 0, 512 * acb->n); + memset(acb->buf, 0, 512 * acb->cur_nr_sectors); ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) goto done; } } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { /* add AIO support for compressed blocks ? */ - if (qcow2_decompress_cluster(s, acb->cluster_offset) < 0) + if (qcow2_decompress_cluster(bs, acb->cluster_offset) < 0) goto done; - memcpy(acb->buf, - s->cluster_cache + index_in_cluster * 512, 512 * acb->n); + memcpy(acb->buf, s->cluster_cache + index_in_cluster * 512, + 512 * acb->cur_nr_sectors); ret = qcow_schedule_bh(qcow_aio_read_bh, acb); if (ret < 0) goto done; @@ -471,13 +466,17 @@ static void qcow_aio_read_cb(void *opaque, int ret) } acb->hd_iov.iov_base = (void *)acb->buf; - acb->hd_iov.iov_len = acb->n * 512; + acb->hd_iov.iov_len = acb->cur_nr_sectors * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_readv(s->hd, + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + acb->hd_aiocb = bdrv_aio_readv(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, - &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); - if (acb->hd_aiocb == NULL) + &acb->hd_qiov, acb->cur_nr_sectors, + qcow_aio_read_cb, acb); + if (acb->hd_aiocb == NULL) { + ret = -EIO; goto done; + } } return; @@ -509,10 +508,11 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, } else { acb->buf = (uint8_t *)qiov->iov->iov_base; } - acb->nb_sectors = nb_sectors; - acb->n = 0; + acb->remaining_sectors = nb_sectors; + acb->cur_nr_sectors = 0; acb->cluster_offset = 0; acb->l2meta.nb_clusters = 0; + QLIST_INIT(&acb->l2meta.dependent_requests); return acb; } @@ -530,6 +530,27 @@ static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, return &acb->common; } +static void qcow_aio_write_cb(void *opaque, int ret); + +static void run_dependent_requests(QCowL2Meta *m) +{ + QCowAIOCB *req; + QCowAIOCB *next; + + /* Take the request off the list of running requests */ + if (m->nb_clusters != 0) { + QLIST_REMOVE(m, next_in_flight); + } + + /* Restart all dependent requests */ + QLIST_FOREACH_SAFE(req, &m->dependent_requests, next_depend, next) { + qcow_aio_write_cb(req, 0); + } + + /* Empty the list for the next part of the request */ + QLIST_INIT(&m->dependent_requests); +} + static void qcow_aio_write_cb(void *opaque, int ret) { QCowAIOCB *acb = opaque; @@ -541,60 +562,78 @@ static void qcow_aio_write_cb(void *opaque, int ret) acb->hd_aiocb = NULL; - if (ret < 0) - goto done; + if (ret >= 0) { + ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta); + } - if (qcow2_alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) { - qcow2_free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters); + run_dependent_requests(&acb->l2meta); + + if (ret < 0) goto done; - } - acb->nb_sectors -= acb->n; - acb->sector_num += acb->n; - acb->buf += acb->n * 512; + acb->remaining_sectors -= acb->cur_nr_sectors; + acb->sector_num += acb->cur_nr_sectors; + acb->buf += acb->cur_nr_sectors * 512; - if (acb->nb_sectors == 0) { + if (acb->remaining_sectors == 0) { /* request completed */ ret = 0; goto done; } index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + acb->nb_sectors; + n_end = index_in_cluster + acb->remaining_sectors; if (s->crypt_method && n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; - acb->cluster_offset = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9, - index_in_cluster, - n_end, &acb->n, &acb->l2meta); - if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) { - ret = -EIO; + ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9, + index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta); + if (ret < 0) { goto done; } + + acb->cluster_offset = acb->l2meta.cluster_offset; + + /* Need to wait for another request? If so, we are done for now. */ + if (acb->l2meta.nb_clusters == 0 && acb->l2meta.depends_on != NULL) { + QLIST_INSERT_HEAD(&acb->l2meta.depends_on->dependent_requests, + acb, next_depend); + return; + } + + assert((acb->cluster_offset & 511) == 0); + if (s->crypt_method) { if (!acb->cluster_data) { acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); } qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, - acb->n, 1, &s->aes_encrypt_key); + acb->cur_nr_sectors, 1, &s->aes_encrypt_key); src_buf = acb->cluster_data; } else { src_buf = acb->buf; } acb->hd_iov.iov_base = (void *)src_buf; - acb->hd_iov.iov_len = acb->n * 512; + acb->hd_iov.iov_len = acb->cur_nr_sectors * 512; qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); - acb->hd_aiocb = bdrv_aio_writev(s->hd, + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + acb->hd_aiocb = bdrv_aio_writev(bs->file, (acb->cluster_offset >> 9) + index_in_cluster, - &acb->hd_qiov, acb->n, + &acb->hd_qiov, acb->cur_nr_sectors, qcow_aio_write_cb, acb); - if (acb->hd_aiocb == NULL) - goto done; + if (acb->hd_aiocb == NULL) { + ret = -EIO; + goto fail; + } return; +fail: + if (acb->l2meta.nb_clusters != 0) { + QLIST_REMOVE(&acb->l2meta, next_in_flight); + } done: if (acb->qiov->niov > 1) qemu_vfree(acb->orig_buf); @@ -627,7 +666,105 @@ static void qcow_close(BlockDriverState *bs) qemu_free(s->cluster_cache); qemu_free(s->cluster_data); qcow2_refcount_close(bs); - bdrv_delete(s->hd); +} + +/* + * Updates the variable length parts of the qcow2 header, i.e. the backing file + * name and all extensions. qcow2 was not designed to allow such changes, so if + * we run out of space (we can only use the first cluster) this function may + * fail. + * + * Returns 0 on success, -errno in error cases. + */ +static int qcow2_update_ext_header(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + size_t backing_file_len = 0; + size_t backing_fmt_len = 0; + BDRVQcowState *s = bs->opaque; + QCowExtension ext_backing_fmt = {0, 0}; + int ret; + + /* Backing file format doesn't make sense without a backing file */ + if (backing_fmt && !backing_file) { + return -EINVAL; + } + + /* Prepare the backing file format extension if needed */ + if (backing_fmt) { + ext_backing_fmt.len = cpu_to_be32(strlen(backing_fmt)); + ext_backing_fmt.magic = cpu_to_be32(QCOW_EXT_MAGIC_BACKING_FORMAT); + backing_fmt_len = ((sizeof(ext_backing_fmt) + + strlen(backing_fmt) + 7) & ~7); + } + + /* Check if we can fit the new header into the first cluster */ + if (backing_file) { + backing_file_len = strlen(backing_file); + } + + size_t header_size = sizeof(QCowHeader) + backing_file_len + + backing_fmt_len; + + if (header_size > s->cluster_size) { + return -ENOSPC; + } + + /* Rewrite backing file name and qcow2 extensions */ + size_t ext_size = header_size - sizeof(QCowHeader); + uint8_t buf[ext_size]; + size_t offset = 0; + size_t backing_file_offset = 0; + + if (backing_file) { + if (backing_fmt) { + int padding = backing_fmt_len - + (sizeof(ext_backing_fmt) + strlen(backing_fmt)); + + memcpy(buf + offset, &ext_backing_fmt, sizeof(ext_backing_fmt)); + offset += sizeof(ext_backing_fmt); + + memcpy(buf + offset, backing_fmt, strlen(backing_fmt)); + offset += strlen(backing_fmt); + + memset(buf + offset, 0, padding); + offset += padding; + } + + memcpy(buf + offset, backing_file, backing_file_len); + backing_file_offset = sizeof(QCowHeader) + offset; + } + + ret = bdrv_pwrite_sync(bs->file, sizeof(QCowHeader), buf, ext_size); + if (ret < 0) { + goto fail; + } + + /* Update header fields */ + uint64_t be_backing_file_offset = cpu_to_be64(backing_file_offset); + uint32_t be_backing_file_size = cpu_to_be32(backing_file_len); + + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, backing_file_offset), + &be_backing_file_offset, sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, backing_file_size), + &be_backing_file_size, sizeof(uint32_t)); + if (ret < 0) { + goto fail; + } + + ret = 0; +fail: + return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + return qcow2_update_ext_header(bs, backing_file, backing_fmt); } static int get_bits_from_size(size_t size) @@ -651,52 +788,80 @@ static int get_bits_from_size(size_t size) return res; } -static int write_all(int fd, const void *buff, size_t bufsize) + +static int preallocate(BlockDriverState *bs) { - int ret = 0; - const char *ptr = buff; - while (bufsize > 0) { - ret = write(fd, ptr, bufsize); + uint64_t nb_sectors; + uint64_t offset; + int num; + int ret; + QCowL2Meta meta; + + nb_sectors = bdrv_getlength(bs) >> 9; + offset = 0; + QLIST_INIT(&meta.dependent_requests); + meta.cluster_offset = 0; + + while (nb_sectors) { + num = MIN(nb_sectors, INT_MAX >> 9); + ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta); if (ret < 0) { - if (errno != EINTR) - return -1; - } else { - bufsize -= ret; + return ret; } - } - return 0; -} -static int lseek_to(int fd, off_t offset) -{ - off_t ret; - do { - ret = lseek(fd, offset, SEEK_SET); - } while (ret == (off_t)-1 && errno == EINTR); + ret = qcow2_alloc_cluster_link_l2(bs, &meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta.cluster_offset, meta.nb_clusters); + return ret; + } - if (ret == (off_t)-1) - return -1; + /* There are no dependent requests, but we need to remove our request + * from the list of in-flight requests */ + run_dependent_requests(&meta); + + /* TODO Preallocate data if requested */ + + nb_sectors -= num; + offset += num << 9; + } + + /* + * It is expected that the image file is large enough to actually contain + * all of the allocated clusters (otherwise we get failing reads after + * EOF). Extend the image to the last allocated sector. + */ + if (meta.cluster_offset != 0) { + uint8_t buf[512]; + memset(buf, 0, 512); + ret = bdrv_write(bs->file, (meta.cluster_offset >> 9) + num - 1, buf, 1); + if (ret < 0) { + return ret; + } + } return 0; } static int qcow_create2(const char *filename, int64_t total_size, const char *backing_file, const char *backing_format, - int flags, size_t cluster_size) + int flags, size_t cluster_size, int prealloc) { int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits; - int ref_clusters, backing_format_len = 0; + int ref_clusters, reftable_clusters, backing_format_len = 0; + int rounded_ext_bf_len = 0; QCowHeader header; uint64_t tmp, offset; + uint64_t old_ref_clusters; QCowCreateState s1, *s = &s1; QCowExtension ext_bf = {0, 0}; + int ret; memset(s, 0, sizeof(*s)); fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); if (fd < 0) - return -1; + return -errno; memset(&header, 0, sizeof(header)); header.magic = cpu_to_be32(QCOW_MAGIC); header.version = cpu_to_be32(QCOW_VERSION); @@ -707,8 +872,9 @@ static int qcow_create2(const char *filename, int64_t total_size, if (backing_format) { ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT; backing_format_len = strlen(backing_format); - ext_bf.len = (backing_format_len + 7) & ~7; - header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7); + ext_bf.len = backing_format_len; + rounded_ext_bf_len = (sizeof(ext_bf) + ext_bf.len + 7) & ~7; + header_size += rounded_ext_bf_len; } header.backing_file_offset = cpu_to_be64(header_size); backing_filename_len = strlen(backing_file); @@ -745,17 +911,37 @@ static int qcow_create2(const char *filename, int64_t total_size, header.l1_size = cpu_to_be32(l1_size); offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size); - s->refcount_table = qemu_mallocz(s->cluster_size); + /* count how many refcount blocks needed */ + +#define NUM_CLUSTERS(bytes) \ + (((bytes) + (s->cluster_size) - 1) / (s->cluster_size)) + + ref_clusters = NUM_CLUSTERS(NUM_CLUSTERS(offset) * sizeof(uint16_t)); + + do { + uint64_t image_clusters; + old_ref_clusters = ref_clusters; + + /* Number of clusters used for the refcount table */ + reftable_clusters = NUM_CLUSTERS(ref_clusters * sizeof(uint64_t)); + + /* Number of clusters that the whole image will have */ + image_clusters = NUM_CLUSTERS(offset) + ref_clusters + + reftable_clusters; + + /* Number of refcount blocks needed for the image */ + ref_clusters = NUM_CLUSTERS(image_clusters * sizeof(uint16_t)); + + } while (ref_clusters != old_ref_clusters); + + s->refcount_table = qemu_mallocz(reftable_clusters * s->cluster_size); s->refcount_table_offset = offset; header.refcount_table_offset = cpu_to_be64(offset); - header.refcount_table_clusters = cpu_to_be32(1); - offset += s->cluster_size; + header.refcount_table_clusters = cpu_to_be32(reftable_clusters); + offset += (reftable_clusters * s->cluster_size); s->refcount_block_offset = offset; - /* count how many refcount blocks needed */ - tmp = offset >> s->cluster_bits; - ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1; for (i=0; i < ref_clusters; i++) { s->refcount_table[i] = cpu_to_be64(offset); offset += s->cluster_size; @@ -767,57 +953,91 @@ static int qcow_create2(const char *filename, int64_t total_size, qcow2_create_refcount_update(s, 0, header_size); qcow2_create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t)); - qcow2_create_refcount_update(s, s->refcount_table_offset, s->cluster_size); + qcow2_create_refcount_update(s, s->refcount_table_offset, + reftable_clusters * s->cluster_size); qcow2_create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size); /* write all the data */ - if (write_all(fd, &header, sizeof(header)) < 0) - goto FAIL; + ret = qemu_write_full(fd, &header, sizeof(header)); + if (ret != sizeof(header)) { + ret = -errno; + goto exit; + } if (backing_file) { if (backing_format_len) { char zero[16]; - int d = ext_bf.len - backing_format_len; + int padding = rounded_ext_bf_len - (ext_bf.len + sizeof(ext_bf)); memset(zero, 0, sizeof(zero)); cpu_to_be32s(&ext_bf.magic); cpu_to_be32s(&ext_bf.len); - if (write_all(fd, &ext_bf, sizeof(ext_bf)) < 0 || - write_all(fd, backing_format, backing_format_len) < 0) - goto FAIL; - if (d>0) { - if (write_all(fd, zero, d) < 0) - goto FAIL; + ret = qemu_write_full(fd, &ext_bf, sizeof(ext_bf)); + if (ret != sizeof(ext_bf)) { + ret = -errno; + goto exit; } + ret = qemu_write_full(fd, backing_format, backing_format_len); + if (ret != backing_format_len) { + ret = -errno; + goto exit; + } + if (padding > 0) { + ret = qemu_write_full(fd, zero, padding); + if (ret != padding) { + ret = -errno; + goto exit; + } + } + } + ret = qemu_write_full(fd, backing_file, backing_filename_len); + if (ret != backing_filename_len) { + ret = -errno; + goto exit; } - if (write_all(fd, backing_file, backing_filename_len) < 0) - goto FAIL; } - if (lseek_to(fd, s->l1_table_offset) < 0) - goto FAIL; - + lseek(fd, s->l1_table_offset, SEEK_SET); tmp = 0; for(i = 0;i < l1_size; i++) { - if (write_all(fd, &tmp, sizeof(tmp)) < 0) - goto FAIL; + ret = qemu_write_full(fd, &tmp, sizeof(tmp)); + if (ret != sizeof(tmp)) { + ret = -errno; + goto exit; + } + } + lseek(fd, s->refcount_table_offset, SEEK_SET); + ret = qemu_write_full(fd, s->refcount_table, + reftable_clusters * s->cluster_size); + if (ret != reftable_clusters * s->cluster_size) { + ret = -errno; + goto exit; } - if (lseek_to(fd, s->refcount_table_offset) < 0 || - write_all(fd, s->refcount_table, s->cluster_size) < 0) - goto FAIL; - if (lseek_to(fd, s->refcount_block_offset) < 0 || - write_all(fd, s->refcount_block, ref_clusters * s->cluster_size) < 0) - goto FAIL; + lseek(fd, s->refcount_block_offset, SEEK_SET); + ret = qemu_write_full(fd, s->refcount_block, + ref_clusters * s->cluster_size); + if (ret != ref_clusters * s->cluster_size) { + ret = -errno; + goto exit; + } + ret = 0; +exit: qemu_free(s->refcount_table); qemu_free(s->refcount_block); close(fd); - return 0; -FAIL: - qemu_free(s->refcount_table); - qemu_free(s->refcount_block); - close(fd); - return -errno; + + /* Preallocate metadata */ + if (ret == 0 && prealloc) { + BlockDriverState *bs; + BlockDriver *drv = bdrv_find_format("qcow2"); + bs = bdrv_new(""); + bdrv_open(bs, filename, BDRV_O_CACHE_WB | BDRV_O_RDWR, drv); + ret = preallocate(bs); + bdrv_close(bs); + } + + return ret; } static int qcow_create(const char *filename, QEMUOptionParameter *options) @@ -827,6 +1047,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) uint64_t sectors = 0; int flags = 0; size_t cluster_size = 65536; + int prealloc = 0; /* Read out options */ while (options && options->name) { @@ -842,12 +1063,28 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) if (options->value.n) { cluster_size = options->value.n; } + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "metadata")) { + prealloc = 1; + } else { + fprintf(stderr, "Invalid preallocation mode: '%s'\n", + options->value.s); + return -EINVAL; + } } options++; } + if (backing_file && prealloc) { + fprintf(stderr, "Backing file and preallocation cannot be used at " + "the same time\n"); + return -EINVAL; + } + return qcow_create2(filename, sectors, backing_file, backing_fmt, flags, - cluster_size); + cluster_size, prealloc); } static int qcow_make_empty(BlockDriverState *bs) @@ -859,9 +1096,9 @@ static int qcow_make_empty(BlockDriverState *bs) int ret; memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0) + if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) return -1; - ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length); + ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); if (ret < 0) return ret; @@ -870,51 +1107,40 @@ static int qcow_make_empty(BlockDriverState *bs) return 0; } -/** - * Write data synchronously - */ -static int qcow2_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) { BDRVQcowState *s = bs->opaque; - int ret, index_in_cluster, n; - uint64_t cluster_offset; - int n_end; - QCowL2Meta l2meta; + int ret, new_l1_size; - while (nb_sectors > 0) { - memset(&l2meta, 0, sizeof(l2meta)); - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + nb_sectors; - if (s->crypt_method && - n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) - n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; - cluster_offset = qcow2_alloc_cluster_offset(bs, sector_num << 9, - index_in_cluster, - n_end, &n, &l2meta); - if (!cluster_offset) - return -1; - if (s->crypt_method) { - qcow2_encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1, - &s->aes_encrypt_key); - ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, - s->cluster_data, n * 512); - } else { - ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512); - } - if (ret != n * 512 || qcow2_alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) { - qcow2_free_any_clusters(bs, cluster_offset, l2meta.nb_clusters); - return -1; - } - nb_sectors -= n; - sector_num += n; - buf += n * 512; - if (l2meta.nb_clusters != 0) { - QLIST_REMOVE(&l2meta, next_in_flight); - } + if (offset & 511) { + return -EINVAL; } - s->cluster_cache_offset = -1; /* disable compressed cache */ + + /* cannot proceed if image has snapshots */ + if (s->nb_snapshots) { + return -ENOTSUP; + } + + /* shrinking is currently not supported */ + if (offset < bs->total_sectors * 512) { + return -ENOTSUP; + } + + new_l1_size = size_to_l1(s, offset); + ret = qcow2_grow_l1_table(bs, new_l1_size); + if (ret < 0) { + return ret; + } + + /* write updated header.size */ + offset = cpu_to_be64(offset); + ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), + &offset, sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + s->l1_vm_state_index = new_l1_size; return 0; } @@ -932,9 +1158,9 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, if (nb_sectors == 0) { /* align end of file to a sector boundary to ease reading with sector based I/Os */ - cluster_offset = bdrv_getlength(s->hd); + cluster_offset = bdrv_getlength(bs->file); cluster_offset = (cluster_offset + 511) & ~511; - bdrv_truncate(s->hd, cluster_offset); + bdrv_truncate(bs->file, cluster_offset); return 0; } @@ -977,7 +1203,8 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, if (!cluster_offset) return -1; cluster_offset &= s->cluster_offset_mask; - if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) { + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); + if (bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len) != out_len) { qemu_free(out_buf); return -1; } @@ -989,13 +1216,18 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, static void qcow_flush(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; - bdrv_flush(s->hd); + bdrv_flush(bs->file); +} + +static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_flush(bs->file, cb, opaque); } static int64_t qcow_vm_state_offset(BDRVQcowState *s) { - return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); } static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) @@ -1007,9 +1239,9 @@ static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) } -static int qcow_check(BlockDriverState *bs) +static int qcow_check(BlockDriverState *bs, BdrvCheckResult *result) { - return qcow2_check_refcounts(bs); + return qcow2_check_refcounts(bs, result); } #if 0 @@ -1019,7 +1251,7 @@ static void dump_refcounts(BlockDriverState *bs) int64_t nb_clusters, k, k1, size; int refcount; - size = bdrv_getlength(s->hd); + size = bdrv_getlength(bs->file); nb_clusters = size_to_clusters(s, size); for(k = 0; k < nb_clusters;) { k1 = k; @@ -1027,31 +1259,35 @@ static void dump_refcounts(BlockDriverState *bs) k++; while (k < nb_clusters && get_refcount(bs, k) == refcount) k++; - printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1); + printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, + k - k1); } } #endif -static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf, +static int qcow_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int64_t pos, int size) { BDRVQcowState *s = bs->opaque; int growable = bs->growable; + int ret; + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); bs->growable = 1; - bdrv_pwrite(bs, qcow_vm_state_offset(s) + pos, buf, size); + ret = bdrv_pwrite(bs, qcow_vm_state_offset(s) + pos, buf, size); bs->growable = growable; - return size; + return ret; } -static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf, +static int qcow_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size) { BDRVQcowState *s = bs->opaque; int growable = bs->growable; int ret; + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); bs->growable = 1; ret = bdrv_pread(bs, qcow_vm_state_offset(s) + pos, buf, size); bs->growable = growable; @@ -1085,6 +1321,11 @@ static QEMUOptionParameter qcow_create_options[] = { .type = OPT_SIZE, .help = "qcow2 cluster size" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, metadata)" + }, { NULL } }; @@ -1100,11 +1341,12 @@ static BlockDriver bdrv_qcow2 = { .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, - .bdrv_read = qcow2_read, - .bdrv_write = qcow2_write, - .bdrv_aio_readv = qcow_aio_readv, - .bdrv_aio_writev = qcow_aio_writev, - .bdrv_write_compressed = qcow_write_compressed, + .bdrv_aio_readv = qcow_aio_readv, + .bdrv_aio_writev = qcow_aio_writev, + .bdrv_aio_flush = qcow_aio_flush, + + .bdrv_truncate = qcow2_truncate, + .bdrv_write_compressed = qcow_write_compressed, .bdrv_snapshot_create = qcow2_snapshot_create, .bdrv_snapshot_goto = qcow2_snapshot_goto, @@ -1112,8 +1354,10 @@ static BlockDriver bdrv_qcow2 = { .bdrv_snapshot_list = qcow2_snapshot_list, .bdrv_get_info = qcow_get_info, - .bdrv_put_buffer = qcow_put_buffer, - .bdrv_get_buffer = qcow_get_buffer, + .bdrv_save_vmstate = qcow_save_vmstate, + .bdrv_load_vmstate = qcow_load_vmstate, + + .bdrv_change_backing_file = qcow2_change_backing_file, .create_options = qcow_create_options, .bdrv_check = qcow_check, diff --git a/block/qcow2.h b/block/qcow2.h index 542292d..3ff162e 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -27,6 +27,10 @@ #include "aes.h" +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) #define QCOW_VERSION 2 @@ -43,7 +47,7 @@ #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ #define MIN_CLUSTER_BITS 9 -#define MAX_CLUSTER_BITS 16 +#define MAX_CLUSTER_BITS 21 #define L2_CACHE_SIZE 16 @@ -125,14 +129,18 @@ typedef struct QCowCreateState { int64_t refcount_block_offset; } QCowCreateState; +struct QCowAIOCB; + /* XXX This could be private for qcow2-cluster.c */ typedef struct QCowL2Meta { uint64_t offset; + uint64_t cluster_offset; int n_start; int nb_available; int nb_clusters; struct QCowL2Meta *depends_on; + QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests; QLIST_ENTRY(QCowL2Meta) next_in_flight; } QCowL2Meta; @@ -142,6 +150,12 @@ static inline int size_to_clusters(BDRVQcowState *s, int64_t size) return (size + (s->cluster_size - 1)) >> s->cluster_bits; } +static inline int size_to_l1(BDRVQcowState *s, int64_t size) +{ + int shift = s->cluster_bits + s->l2_bits; + return (size + (1ULL << shift) - 1) >> shift; +} + static inline int64_t align_offset(int64_t offset, int n) { offset = (offset + n - 1) & ~(n - 1); @@ -171,32 +185,26 @@ void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset, int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend); -int qcow2_check_refcounts(BlockDriverState *bs); +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res); /* qcow2-cluster.c functions */ int qcow2_grow_l1_table(BlockDriverState *bs, int min_size); void qcow2_l2_cache_reset(BlockDriverState *bs); -int qcow2_decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, uint8_t *out_buf, const uint8_t *in_buf, int nb_sectors, int enc, const AES_KEY *key); -uint64_t qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num); -uint64_t qcow2_alloc_cluster_offset(BlockDriverState *bs, - uint64_t offset, - int n_start, int n_end, - int *num, QCowL2Meta *m); +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset); +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int n_start, int n_end, int *num, QCowL2Meta *m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset, - QCowL2Meta *m); - -int qcow2_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors); +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); diff --git a/block/raw-posix.c b/block/raw-posix.c index 0e9e343..72fb8ce 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -24,11 +24,10 @@ #include "qemu-common.h" #include "qemu-timer.h" #include "qemu-char.h" +#include "qemu-log.h" #include "block_int.h" #include "module.h" -#ifdef CONFIG_AIO -#include "posix-aio-compat.h" -#endif +#include "block/raw-posix-aio.h" #ifdef CONFIG_COCOA #include @@ -52,7 +51,7 @@ #include #include #endif -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) #include #include #include @@ -81,7 +80,11 @@ /* OS X does not have O_DSYNC */ #ifndef O_DSYNC +#ifdef O_SYNC #define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif #endif /* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ @@ -102,7 +105,6 @@ typedef struct BDRVRawState { int fd; int type; - unsigned int lseek_err_cnt; int open_flags; #if defined(__linux__) /* linux floppy specific */ @@ -111,14 +113,17 @@ typedef struct BDRVRawState { int fd_got_error; int fd_media_changed; #endif +#ifdef CONFIG_LINUX_AIO + int use_aio; + void *aio_ctx; +#endif uint8_t* aligned_buf; } BDRVRawState; -static int posix_aio_init(void); - static int fd_open(BlockDriverState *bs); +static int64_t raw_getlength(BlockDriverState *bs); -#if defined(__FreeBSD__) +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static int cdrom_reopen(BlockDriverState *bs); #endif @@ -128,17 +133,12 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, BDRVRawState *s = bs->opaque; int fd, ret; - posix_aio_init(); - - s->lseek_err_cnt = 0; - s->open_flags = open_flags | O_BINARY; s->open_flags &= ~O_ACCMODE; - if ((bdrv_flags & BDRV_O_ACCESS) == BDRV_O_RDWR) { + if (bdrv_flags & BDRV_O_RDWR) { s->open_flags |= O_RDWR; } else { s->open_flags |= O_RDONLY; - bs->read_only = 1; } /* Use O_DSYNC for write-through caching, no flags for write-back caching, @@ -149,7 +149,7 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, s->open_flags |= O_DSYNC; s->fd = -1; - fd = open(filename, s->open_flags, 0644); + fd = qemu_open(filename, s->open_flags, 0644); if (fd < 0) { ret = -errno; if (ret == -EROFS) @@ -158,27 +158,52 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } s->fd = fd; s->aligned_buf = NULL; + if ((bdrv_flags & BDRV_O_NOCACHE)) { s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE); if (s->aligned_buf == NULL) { - ret = -errno; - close(fd); - return ret; + goto out_close; } } + +#ifdef CONFIG_LINUX_AIO + if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == + (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { + + /* We're falling back to POSIX AIO in some cases */ + paio_init(); + + s->aio_ctx = laio_init(); + if (!s->aio_ctx) { + goto out_free_buf; + } + s->use_aio = 1; + } else +#endif + { + if (paio_init() < 0) { + goto out_free_buf; + } +#ifdef CONFIG_LINUX_AIO + s->use_aio = 0; +#endif + } + return 0; + +out_free_buf: + qemu_vfree(s->aligned_buf); +out_close: + close(fd); + return -errno; } static int raw_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; - int open_flags = 0; s->type = FTYPE_FILE; - if (flags & BDRV_O_CREAT) - open_flags = O_CREAT | O_TRUNC; - - return raw_open_common(bs, filename, flags, open_flags); + return raw_open_common(bs, filename, flags, 0); } /* XXX: use host sector size if necessary with: @@ -191,7 +216,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) } #endif #ifdef CONFIG_COCOA - u_int32_t blockSize = 512; + uint32_t blockSize = 512; if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { bufsize = blockSize; } @@ -215,21 +240,18 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, if (ret < 0) return ret; - if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) { - ++(s->lseek_err_cnt); - if(s->lseek_err_cnt <= 10) { - DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 - "] lseek failed : %d = %s\n", - s->fd, bs->filename, offset, buf, count, - bs->total_sectors, errno, strerror(errno)); + ret = pread(s->fd, buf, count, offset); + if (ret == count) + return ret; + + /* Allow reads beyond the end (needed for pwrite) */ + if ((ret == 0) && bs->growable) { + int64_t size = raw_getlength(bs); + if (offset >= size) { + memset(buf, 0, count); + return count; } - return -1; } - s->lseek_err_cnt=0; - - ret = read(s->fd, buf, count); - if (ret == count) - goto label__raw_read__success; DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] read failed %d : %d = %s\n", @@ -237,15 +259,13 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, bs->total_sectors, ret, errno, strerror(errno)); /* Try harder for CDrom. */ - if (bs->type == BDRV_TYPE_CDROM) { - lseek(s->fd, offset, SEEK_SET); - ret = read(s->fd, buf, count); + if (s->type != FTYPE_FILE) { + ret = pread(s->fd, buf, count, offset); if (ret == count) - goto label__raw_read__success; - lseek(s->fd, offset, SEEK_SET); - ret = read(s->fd, buf, count); + return ret; + ret = pread(s->fd, buf, count, offset); if (ret == count) - goto label__raw_read__success; + return ret; DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] retry read failed %d : %d = %s\n", @@ -253,8 +273,6 @@ static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, bs->total_sectors, ret, errno, strerror(errno)); } -label__raw_read__success: - return (ret < 0) ? -errno : ret; } @@ -275,29 +293,15 @@ static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset, if (ret < 0) return -errno; - if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) { - ++(s->lseek_err_cnt); - if(s->lseek_err_cnt) { - DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" - PRId64 "] lseek failed : %d = %s\n", - s->fd, bs->filename, offset, buf, count, - bs->total_sectors, errno, strerror(errno)); - } - return -EIO; - } - s->lseek_err_cnt = 0; - - ret = write(s->fd, buf, count); + ret = pwrite(s->fd, buf, count, offset); if (ret == count) - goto label__raw_write__success; + return ret; DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" PRId64 "] write failed %d : %d = %s\n", s->fd, bs->filename, offset, buf, count, bs->total_sectors, ret, errno, strerror(errno)); -label__raw_write__success: - return (ret < 0) ? -errno : ret; } @@ -352,8 +356,12 @@ static int raw_pread(BlockDriverState *bs, int64_t offset, size = ALIGNED_BUFFER_SIZE; ret = raw_pread_aligned(bs, offset, s->aligned_buf, size); - if (ret < 0) + if (ret < 0) { return ret; + } else if (ret == 0) { + fprintf(stderr, "raw_pread: read beyond end of file\n"); + abort(); + } size = ret; if (size > count) @@ -379,8 +387,9 @@ static int raw_read(BlockDriverState *bs, int64_t sector_num, { int ret; - ret = raw_pread(bs, sector_num * 512, buf, nb_sectors * 512); - if (ret == (nb_sectors * 512)) + ret = raw_pread(bs, sector_num * BDRV_SECTOR_SIZE, buf, + nb_sectors * BDRV_SECTOR_SIZE); + if (ret == (nb_sectors * BDRV_SECTOR_SIZE)) ret = 0; return ret; } @@ -467,255 +476,84 @@ static int raw_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { int ret; - ret = raw_pwrite(bs, sector_num * 512, buf, nb_sectors * 512); - if (ret == (nb_sectors * 512)) + ret = raw_pwrite(bs, sector_num * BDRV_SECTOR_SIZE, buf, + nb_sectors * BDRV_SECTOR_SIZE); + if (ret == (nb_sectors * BDRV_SECTOR_SIZE)) ret = 0; return ret; } -#ifdef CONFIG_AIO -/***********************************************************/ -/* Unix AIO using POSIX AIO */ - -typedef struct RawAIOCB { - BlockDriverAIOCB common; - struct qemu_paiocb aiocb; - struct RawAIOCB *next; - int ret; -} RawAIOCB; - -typedef struct PosixAioState -{ - int rfd, wfd; - RawAIOCB *first_aio; -} PosixAioState; - -static void posix_aio_read(void *opaque) -{ - PosixAioState *s = opaque; - RawAIOCB *acb, **pacb; - int ret; - ssize_t len; - - /* read all bytes from signal pipe */ - for (;;) { - char bytes[16]; - - len = read(s->rfd, bytes, sizeof(bytes)); - if (len == -1 && errno == EINTR) - continue; /* try again */ - if (len == sizeof(bytes)) - continue; /* more to read */ - break; - } - - for(;;) { - pacb = &s->first_aio; - for(;;) { - acb = *pacb; - if (!acb) - goto the_end; - ret = qemu_paio_error(&acb->aiocb); - if (ret == ECANCELED) { - /* remove the request */ - *pacb = acb->next; - qemu_aio_release(acb); - } else if (ret != EINPROGRESS) { - /* end of aio */ - if (ret == 0) { - ret = qemu_paio_return(&acb->aiocb); - if (ret == acb->aiocb.aio_nbytes) - ret = 0; - else - ret = -EINVAL; - } else { - ret = -ret; - } - /* remove the request */ - *pacb = acb->next; - /* call the callback */ - acb->common.cb(acb->common.opaque, ret); - qemu_aio_release(acb); - break; - } else { - pacb = &acb->next; - } - } - } - the_end: ; -} - -static int posix_aio_flush(void *opaque) -{ - PosixAioState *s = opaque; - return !!s->first_aio; -} - -static PosixAioState *posix_aio_state; - -static void aio_signal_handler(int signum) -{ - if (posix_aio_state) { - char byte = 0; - - write(posix_aio_state->wfd, &byte, sizeof(byte)); - } - - qemu_service_io(); -} - -static int posix_aio_init(void) +/* + * Check if all memory in this vector is sector aligned. + */ +static int qiov_is_aligned(QEMUIOVector *qiov) { - struct sigaction act; - PosixAioState *s; - int fds[2]; - struct qemu_paioinit ai; + int i; - if (posix_aio_state) - return 0; - - s = qemu_malloc(sizeof(PosixAioState)); - - sigfillset(&act.sa_mask); - act.sa_flags = 0; /* do not restart syscalls to interrupt select() */ - act.sa_handler = aio_signal_handler; - sigaction(SIGUSR2, &act, NULL); - - s->first_aio = NULL; - if (pipe(fds) == -1) { - fprintf(stderr, "failed to create pipe\n"); - return -errno; - } - - s->rfd = fds[0]; - s->wfd = fds[1]; - - fcntl(s->rfd, F_SETFL, O_NONBLOCK); - fcntl(s->wfd, F_SETFL, O_NONBLOCK); - - qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s); - - memset(&ai, 0, sizeof(ai)); - ai.aio_threads = 64; - ai.aio_num = 64; - qemu_paio_init(&ai); - - posix_aio_state = s; - - return 0; -} - -static void raw_aio_remove(RawAIOCB *acb) -{ - RawAIOCB **pacb; - - /* remove the callback from the queue */ - pacb = &posix_aio_state->first_aio; - for(;;) { - if (*pacb == NULL) { - fprintf(stderr, "raw_aio_remove: aio request not found!\n"); - break; - } else if (*pacb == acb) { - *pacb = acb->next; - qemu_aio_release(acb); - break; + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % BDRV_SECTOR_SIZE) { + return 0; } - pacb = &(*pacb)->next; } -} - -static void raw_aio_cancel(BlockDriverAIOCB *blockacb) -{ - int ret; - RawAIOCB *acb = (RawAIOCB *)blockacb; - ret = qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb); - if (ret == QEMU_PAIO_NOTCANCELED) { - /* fail safe: if the aio could not be canceled, we wait for - it */ - while (qemu_paio_error(&acb->aiocb) == EINPROGRESS); - } - - raw_aio_remove(acb); + return 1; } -static AIOPool raw_aio_pool = { - .aiocb_size = sizeof(RawAIOCB), - .cancel = raw_aio_cancel, -}; - -static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque, int type) { BDRVRawState *s = bs->opaque; - RawAIOCB *acb; if (fd_open(bs) < 0) return NULL; - acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque); - if (!acb) - return NULL; - acb->aiocb.aio_fildes = s->fd; - acb->aiocb.ev_signo = SIGUSR2; - acb->aiocb.aio_iov = qiov->iov; - acb->aiocb.aio_niov = qiov->niov; - acb->aiocb.aio_nbytes = nb_sectors * 512; - acb->aiocb.aio_offset = sector_num * 512; - acb->aiocb.aio_flags = 0; - /* * If O_DIRECT is used the buffer needs to be aligned on a sector - * boundary. Tell the low level code to ensure that in case it's - * not done yet. + * boundary. Check if this is the case or telll the low-level + * driver that it needs to copy the buffer. */ - if (s->aligned_buf) - acb->aiocb.aio_flags |= QEMU_AIO_SECTOR_ALIGNED; + if (s->aligned_buf) { + if (!qiov_is_aligned(qiov)) { + type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_AIO + } else if (s->use_aio) { + return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, + nb_sectors, cb, opaque, type); +#endif + } + } - acb->next = posix_aio_state->first_aio; - posix_aio_state->first_aio = acb; - return acb; + return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, + cb, opaque, type); } static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - RawAIOCB *acb; - - acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); - if (!acb) - return NULL; - if (qemu_paio_read(&acb->aiocb) < 0) { - raw_aio_remove(acb); - return NULL; - } - return &acb->common; + return raw_aio_submit(bs, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_READ); } static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { - RawAIOCB *acb; - - acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque); - if (!acb) - return NULL; - if (qemu_paio_write(&acb->aiocb) < 0) { - raw_aio_remove(acb); - return NULL; - } - return &acb->common; + return raw_aio_submit(bs, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_WRITE); } -#else /* CONFIG_AIO */ -static int posix_aio_init(void) + +static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) { - return 0; -} -#endif /* CONFIG_AIO */ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); +} static void raw_close(BlockDriverState *bs) { @@ -724,7 +562,7 @@ static void raw_close(BlockDriverState *bs) close(s->fd); s->fd = -1; if (s->aligned_buf != NULL) - qemu_free(s->aligned_buf); + qemu_vfree(s->aligned_buf); } } @@ -757,30 +595,49 @@ static int64_t raw_getlength(BlockDriverState *bs) } else return st.st_size; } -#else /* !__OpenBSD__ */ -static int64_t raw_getlength(BlockDriverState *bs) +#elif defined(__sun__) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + struct dk_minfo minfo; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + /* + * Use the DKIOCGMEDIAINFO ioctl to read the size. + */ + ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); + if (ret != -1) { + return minfo.dki_lbsize * minfo.dki_capacity; + } + + /* + * There are reports that lseek on some devices fails, but + * irc discussion said that contingency on contingency was overkill. + */ + return lseek(s->fd, 0, SEEK_END); +} +#elif defined(CONFIG_BSD) +static int64_t raw_getlength(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; int fd = s->fd; int64_t size; -#ifdef CONFIG_BSD struct stat sb; -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) int reopened = 0; #endif -#endif -#ifdef __sun__ - struct dk_minfo minfo; - int rv; -#endif int ret; ret = fd_open(bs); if (ret < 0) return ret; -#ifdef CONFIG_BSD -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) again: #endif if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { @@ -801,7 +658,7 @@ again: #else size = lseek(fd, 0LL, SEEK_END); #endif -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) switch(s->type) { case FTYPE_CD: /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ @@ -814,56 +671,59 @@ again: } } #endif - } else -#endif -#ifdef __sun__ - /* - * use the DKIOCGMEDIAINFO ioctl to read the size. - */ - rv = ioctl ( fd, DKIOCGMEDIAINFO, &minfo ); - if ( rv != -1 ) { - size = minfo.dki_lbsize * minfo.dki_capacity; - } else /* there are reports that lseek on some devices - fails, but irc discussion said that contingency - on contingency was overkill */ -#endif - { + } else { size = lseek(fd, 0, SEEK_END); } return size; } +#else +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + return lseek(s->fd, 0, SEEK_END); +} #endif static int raw_create(const char *filename, QEMUOptionParameter *options) { - int fd, ret; + int fd; + int result = 0; int64_t total_size = 0; /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - total_size = options->value.n / 512; + total_size = options->value.n / BDRV_SECTOR_SIZE; } options++; } fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) - return -EIO; - do { - ret = ftruncate(fd, total_size * 512); - } while (ret < 0 && errno == EINTR); - close(fd); - if (ret != 0) - return -errno; - return 0; + if (fd < 0) { + result = -errno; + } else { + if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { + result = -errno; + } + if (close(fd) != 0) { + result = -errno; + } + } + return result; } static void raw_flush(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; - fsync(s->fd); + qemu_fdatasync(s->fd); } @@ -876,21 +736,21 @@ static QEMUOptionParameter raw_create_options[] = { { NULL } }; -static BlockDriver bdrv_raw = { - .format_name = "raw", +static BlockDriver bdrv_file = { + .format_name = "file", + .protocol_name = "file", .instance_size = sizeof(BDRVRawState), .bdrv_probe = NULL, /* no probe for protocols */ - .bdrv_open = raw_open, + .bdrv_file_open = raw_open, .bdrv_read = raw_read, .bdrv_write = raw_write, .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_flush = raw_flush, -#ifdef CONFIG_AIO .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, -#endif + .bdrv_aio_flush = raw_aio_flush, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1007,7 +867,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) #endif s->type = FTYPE_FILE; -#if defined(__linux__) && defined(CONFIG_AIO) +#if defined(__linux__) if (strstart(filename, "/dev/sg", NULL)) { bs->sg = 1; } @@ -1073,40 +933,18 @@ static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) return ioctl(s->fd, req, buf); } -#ifdef CONFIG_AIO static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; - RawAIOCB *acb; if (fd_open(bs) < 0) return NULL; - - acb = qemu_aio_get(&raw_aio_pool, bs, cb, opaque); - if (!acb) - return NULL; - acb->aiocb.aio_fildes = s->fd; - acb->aiocb.ev_signo = SIGUSR2; - acb->aiocb.aio_offset = 0; - acb->aiocb.aio_flags = 0; - - acb->next = posix_aio_state->first_aio; - posix_aio_state->first_aio = acb; - - acb->aiocb.aio_ioctl_buf = buf; - acb->aiocb.aio_ioctl_cmd = req; - if (qemu_paio_ioctl(&acb->aiocb) < 0) { - raw_aio_remove(acb); - return NULL; - } - - return &acb->common; + return paio_ioctl(bs, s->fd, req, buf, cb, opaque); } -#endif -#elif defined(__FreeBSD__) +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static int fd_open(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; @@ -1135,39 +973,46 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options) /* Read out options */ while (options && options->name) { if (!strcmp(options->name, "size")) { - total_size = options->value.n / 512; + total_size = options->value.n / BDRV_SECTOR_SIZE; } options++; } fd = open(filename, O_WRONLY | O_BINARY); if (fd < 0) - return -EIO; + return -errno; if (fstat(fd, &stat_buf) < 0) - ret = -EIO; + ret = -errno; else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) - ret = -EIO; - else if (lseek(fd, 0, SEEK_END) < total_size * 512) + ret = -ENODEV; + else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) ret = -ENOSPC; close(fd); return ret; } +static int hdev_has_zero_init(BlockDriverState *bs) +{ + return 0; +} + static BlockDriver bdrv_host_device = { - .format_name = "host_device", - .instance_size = sizeof(BDRVRawState), - .bdrv_probe_device = hdev_probe_device, - .bdrv_open = hdev_open, - .bdrv_close = raw_close, + .format_name = "host_device", + .protocol_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_probe_device = hdev_probe_device, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, .bdrv_create = hdev_create, - .bdrv_flush = raw_flush, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, + .bdrv_flush = raw_flush, -#ifdef CONFIG_AIO .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, -#endif + .bdrv_aio_flush = raw_aio_flush, .bdrv_read = raw_read, .bdrv_write = raw_write, @@ -1176,10 +1021,8 @@ static BlockDriver bdrv_host_device = { /* generic scsi device */ #ifdef __linux__ .bdrv_ioctl = hdev_ioctl, -#ifdef CONFIG_AIO .bdrv_aio_ioctl = hdev_aio_ioctl, #endif -#endif }; #ifdef __linux__ @@ -1188,8 +1031,6 @@ static int floppy_open(BlockDriverState *bs, const char *filename, int flags) BDRVRawState *s = bs->opaque; int ret; - posix_aio_init(); - s->type = FTYPE_FD; /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ @@ -1207,9 +1048,26 @@ static int floppy_open(BlockDriverState *bs, const char *filename, int flags) static int floppy_probe_device(const char *filename) { + int fd, ret; + int prio = 0; + struct floppy_struct fdparam; + if (strstart(filename, "/dev/fd", NULL)) - return 100; - return 0; + prio = 50; + + fd = open(filename, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + goto out; + } + + /* Attempt to detect via a floppy specific ioctl */ + ret = ioctl(fd, FDGETPRM, &fdparam); + if (ret >= 0) + prio = 100; + + close(fd); +out: + return prio; } @@ -1257,17 +1115,19 @@ static int floppy_eject(BlockDriverState *bs, int eject_flag) static BlockDriver bdrv_host_floppy = { .format_name = "host_floppy", + .protocol_name = "host_floppy", .instance_size = sizeof(BDRVRawState), .bdrv_probe_device = floppy_probe_device, - .bdrv_open = floppy_open, + .bdrv_file_open = floppy_open, .bdrv_close = raw_close, .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, -#ifdef CONFIG_AIO .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, -#endif + .bdrv_aio_flush = raw_aio_flush, .bdrv_read = raw_read, .bdrv_write = raw_write, @@ -1291,9 +1151,25 @@ static int cdrom_open(BlockDriverState *bs, const char *filename, int flags) static int cdrom_probe_device(const char *filename) { + int fd, ret; + int prio = 0; + if (strstart(filename, "/dev/cd", NULL)) - return 100; - return 0; + prio = 50; + + fd = open(filename, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + goto out; + } + + /* Attempt to detect via a CDROM specific ioctl */ + ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (ret >= 0) + prio = 100; + + close(fd); +out: + return prio; } static int cdrom_is_inserted(BlockDriverState *bs) @@ -1339,17 +1215,19 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked) static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", + .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), .bdrv_probe_device = cdrom_probe_device, - .bdrv_open = cdrom_open, + .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, -#ifdef CONFIG_AIO .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, -#endif + .bdrv_aio_flush = raw_aio_flush, .bdrv_read = raw_read, .bdrv_write = raw_write, @@ -1362,13 +1240,11 @@ static BlockDriver bdrv_host_cdrom = { /* generic scsi device */ .bdrv_ioctl = hdev_ioctl, -#ifdef CONFIG_AIO .bdrv_aio_ioctl = hdev_aio_ioctl, -#endif }; #endif /* __linux__ */ -#ifdef __FreeBSD__ +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) static int cdrom_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; @@ -1462,17 +1338,19 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked) static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", + .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), .bdrv_probe_device = cdrom_probe_device, - .bdrv_open = cdrom_open, + .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_create = hdev_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_flush = raw_flush, -#ifdef CONFIG_AIO .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, -#endif + .bdrv_aio_flush = raw_aio_flush, .bdrv_read = raw_read, .bdrv_write = raw_write, @@ -1485,21 +1363,21 @@ static BlockDriver bdrv_host_cdrom = { }; #endif /* __FreeBSD__ */ -static void bdrv_raw_init(void) +static void bdrv_file_init(void) { /* * Register all the drivers. Note that order is important, the driver * registered last will get probed first. */ - bdrv_register(&bdrv_raw); + bdrv_register(&bdrv_file); bdrv_register(&bdrv_host_device); #ifdef __linux__ bdrv_register(&bdrv_host_floppy); bdrv_register(&bdrv_host_cdrom); #endif -#ifdef __FreeBSD__ +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) bdrv_register(&bdrv_host_cdrom); #endif } -block_init(bdrv_raw_init); +block_init(bdrv_file_init); diff --git a/block/raw-win32.c b/block/raw-win32.c index 72acad5..503ed39 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -76,21 +76,17 @@ static int set_sparse(int fd) static int raw_open(BlockDriverState *bs, const char *filename, int flags) { BDRVRawState *s = bs->opaque; - int access_flags, create_flags; + int access_flags; DWORD overlapped; s->type = FTYPE_FILE; - if ((flags & BDRV_O_ACCESS) == O_RDWR) { + if (flags & BDRV_O_RDWR) { access_flags = GENERIC_READ | GENERIC_WRITE; } else { access_flags = GENERIC_READ; } - if (flags & BDRV_O_CREAT) { - create_flags = CREATE_ALWAYS; - } else { - create_flags = OPEN_EXISTING; - } + overlapped = FILE_ATTRIBUTE_NORMAL; if ((flags & BDRV_O_NOCACHE)) overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH; @@ -98,7 +94,7 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags) overlapped |= FILE_FLAG_WRITE_THROUGH; s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL, - create_flags, overlapped, NULL); + OPEN_EXISTING, overlapped, NULL); if (s->hfile == INVALID_HANDLE_VALUE) { int err = GetLastError(); @@ -242,10 +238,11 @@ static QEMUOptionParameter raw_create_options[] = { { NULL } }; -static BlockDriver bdrv_raw = { - .format_name = "raw", +static BlockDriver bdrv_file = { + .format_name = "file", + .protocol_name = "file", .instance_size = sizeof(BDRVRawState), - .bdrv_open = raw_open, + .bdrv_file_open = raw_open, .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_flush = raw_flush, @@ -337,7 +334,7 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags) } s->type = find_device_type(bs, filename); - if ((flags & BDRV_O_ACCESS) == O_RDWR) { + if (flags & BDRV_O_RDWR) { access_flags = GENERIC_READ | GENERIC_WRITE; } else { access_flags = GENERIC_READ; @@ -397,23 +394,30 @@ static int raw_set_locked(BlockDriverState *bs, int locked) } #endif +static int hdev_has_zero_init(BlockDriverState *bs) +{ + return 0; +} + static BlockDriver bdrv_host_device = { .format_name = "host_device", + .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), .bdrv_probe_device = hdev_probe_device, - .bdrv_open = hdev_open, + .bdrv_file_open = hdev_open, .bdrv_close = raw_close, .bdrv_flush = raw_flush, + .bdrv_has_zero_init = hdev_has_zero_init, .bdrv_read = raw_read, .bdrv_write = raw_write, .bdrv_getlength = raw_getlength, }; -static void bdrv_raw_init(void) +static void bdrv_file_init(void) { - bdrv_register(&bdrv_raw); + bdrv_register(&bdrv_file); bdrv_register(&bdrv_host_device); } -block_init(bdrv_raw_init); +block_init(bdrv_file_init); diff --git a/block/raw.c b/block/raw.c new file mode 100644 index 0000000..61e6748 --- /dev/null +++ b/block/raw.c @@ -0,0 +1,280 @@ + +#include "qemu-common.h" +#include "block_int.h" +#include "module.h" + +static int raw_open(BlockDriverState *bs, int flags) +{ + bs->sg = bs->file->sg; + return 0; +} + +/* check for the user attempting to write something that looks like a + block format header to the beginning of the image and fail out. +*/ +static int check_for_block_signature(BlockDriverState *bs, const uint8_t *buf) +{ + static const uint8_t signatures[][4] = { + { 'Q', 'F', 'I', 0xfb }, /* qcow/qcow2 */ + { 'C', 'O', 'W', 'D' }, /* VMDK3 */ + { 'V', 'M', 'D', 'K' }, /* VMDK4 */ + { 'O', 'O', 'O', 'M' }, /* UML COW */ + {} + }; + int i; + + for (i = 0; signatures[i][0] != 0; i++) { + if (memcmp(buf, signatures[i], 4) == 0) { + return 1; + } + } + + return 0; +} + +static int check_write_unsafe(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + /* assume that if the user specifies the format explicitly, then assume + that they will continue to do so and provide no safety net */ + if (!bs->probed) { + return 0; + } + + if (sector_num == 0 && nb_sectors > 0) { + return check_for_block_signature(bs, buf); + } + + return 0; +} + +static int raw_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_read(bs->file, sector_num, buf, nb_sectors); +} + +static int raw_write_scrubbed_bootsect(BlockDriverState *bs, + const uint8_t *buf) +{ + uint8_t bootsect[512]; + + /* scrub the dangerous signature */ + memcpy(bootsect, buf, 512); + memset(bootsect, 0, 4); + + return bdrv_write(bs->file, 0, bootsect, 1); +} + +static int raw_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + if (check_write_unsafe(bs, sector_num, buf, nb_sectors)) { + int ret; + + ret = raw_write_scrubbed_bootsect(bs, buf); + if (ret < 0) { + return ret; + } + + ret = bdrv_write(bs->file, 1, buf + 512, nb_sectors - 1); + if (ret < 0) { + return ret; + } + + return ret + 512; + } + + return bdrv_write(bs->file, sector_num, buf, nb_sectors); +} + +static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque); +} + +typedef struct RawScrubberBounce +{ + BlockDriverCompletionFunc *cb; + void *opaque; + QEMUIOVector qiov; +} RawScrubberBounce; + +static void raw_aio_writev_scrubbed(void *opaque, int ret) +{ + RawScrubberBounce *b = opaque; + + if (ret < 0) { + b->cb(b->opaque, ret); + } else { + b->cb(b->opaque, ret + 512); + } + + qemu_iovec_destroy(&b->qiov); + qemu_free(b); +} + +static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) +{ + const uint8_t *first_buf; + int first_buf_index = 0, i; + + /* This is probably being paranoid, but handle cases of zero size + vectors. */ + for (i = 0; i < qiov->niov; i++) { + if (qiov->iov[i].iov_len) { + assert(qiov->iov[i].iov_len >= 512); + first_buf_index = i; + break; + } + } + + first_buf = qiov->iov[first_buf_index].iov_base; + + if (check_write_unsafe(bs, sector_num, first_buf, nb_sectors)) { + RawScrubberBounce *b; + int ret; + + /* write the first sector using sync I/O */ + ret = raw_write_scrubbed_bootsect(bs, first_buf); + if (ret < 0) { + return NULL; + } + + /* adjust request to be everything but first sector */ + + b = qemu_malloc(sizeof(*b)); + b->cb = cb; + b->opaque = opaque; + + qemu_iovec_init(&b->qiov, qiov->nalloc); + qemu_iovec_concat(&b->qiov, qiov, qiov->size); + + b->qiov.size -= 512; + b->qiov.iov[first_buf_index].iov_base += 512; + b->qiov.iov[first_buf_index].iov_len -= 512; + + return bdrv_aio_writev(bs->file, sector_num + 1, &b->qiov, + nb_sectors - 1, raw_aio_writev_scrubbed, b); + } + + return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); +} + +static void raw_close(BlockDriverState *bs) +{ +} + +static void raw_flush(BlockDriverState *bs) +{ + bdrv_flush(bs->file); +} + +static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_flush(bs->file, cb, opaque); +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file); +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file, offset); +} + +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + return 1; /* everything can be opened as raw image */ +} + +static int raw_is_inserted(BlockDriverState *bs) +{ + return bdrv_is_inserted(bs->file); +} + +static int raw_eject(BlockDriverState *bs, int eject_flag) +{ + return bdrv_eject(bs->file, eject_flag); +} + +static int raw_set_locked(BlockDriverState *bs, int locked) +{ + bdrv_set_locked(bs->file, locked); + return 0; +} + +static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + return bdrv_ioctl(bs->file, req, buf); +} + +static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); +} + +static int raw_create(const char *filename, QEMUOptionParameter *options) +{ + return bdrv_create_file(filename, options); +} + +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { NULL } +}; + +static int raw_has_zero_init(BlockDriverState *bs) +{ + return bdrv_has_zero_init(bs->file); +} + +static BlockDriver bdrv_raw = { + .format_name = "raw", + + /* It's really 0, but we need to make qemu_malloc() happy */ + .instance_size = 1, + + .bdrv_open = raw_open, + .bdrv_close = raw_close, + .bdrv_read = raw_read, + .bdrv_write = raw_write, + .bdrv_flush = raw_flush, + .bdrv_probe = raw_probe, + .bdrv_getlength = raw_getlength, + .bdrv_truncate = raw_truncate, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_is_inserted = raw_is_inserted, + .bdrv_eject = raw_eject, + .bdrv_set_locked = raw_set_locked, + .bdrv_ioctl = raw_ioctl, + .bdrv_aio_ioctl = raw_aio_ioctl, + + .bdrv_create = raw_create, + .create_options = raw_create_options, + .bdrv_has_zero_init = raw_has_zero_init, +}; + +static void bdrv_raw_init(void) +{ + bdrv_register(&bdrv_raw); +} + +block_init(bdrv_raw_init); diff --git a/block/vpc.c b/block/vpc.c index ba482e9..e50509e 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -1,5 +1,5 @@ /* - * Block driver for Conectix/Microsoft Virtual PC images + * Block driver for Connectix / Microsoft Virtual PC images * * Copyright (c) 2005 Alex Beregszaszi * Copyright (c) 2009 Kevin Wolf @@ -150,20 +150,16 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int vpc_open(BlockDriverState *bs, const char *filename, int flags) +static int vpc_open(BlockDriverState *bs, int flags) { BDRVVPCState *s = bs->opaque; - int ret, i; + int i; struct vhd_footer* footer; struct vhd_dyndisk_header* dyndisk_header; uint8_t buf[HEADER_SIZE]; uint32_t checksum; - ret = bdrv_file_open(&s->hd, filename, flags); - if (ret < 0) - return ret; - - if (bdrv_pread(s->hd, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) + if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE) goto fail; footer = (struct vhd_footer*) s->footer_buf; @@ -174,7 +170,7 @@ static int vpc_open(BlockDriverState *bs, const char *filename, int flags) footer->checksum = 0; if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) fprintf(stderr, "block-vpc: The header checksum of '%s' is " - "incorrect.\n", filename); + "incorrect.\n", bs->filename); // The visible size of a image in Virtual PC depends on the geometry // rather than on the size stored in the footer (the size in the footer @@ -182,7 +178,7 @@ static int vpc_open(BlockDriverState *bs, const char *filename, int flags) bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - if (bdrv_pread(s->hd, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE) + if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE) != HEADER_SIZE) goto fail; @@ -199,7 +195,7 @@ static int vpc_open(BlockDriverState *bs, const char *filename, int flags) s->pagetable = qemu_malloc(s->max_table_entries * 4); s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - if (bdrv_pread(s->hd, s->bat_offset, s->pagetable, + if (bdrv_pread(bs->file, s->bat_offset, s->pagetable, s->max_table_entries * 4) != s->max_table_entries * 4) goto fail; @@ -228,7 +224,6 @@ static int vpc_open(BlockDriverState *bs, const char *filename, int flags) return 0; fail: - bdrv_delete(s->hd); return -1; } @@ -266,7 +261,7 @@ static inline int64_t get_sector_offset(BlockDriverState *bs, s->last_bitmap_offset = bitmap_offset; memset(bitmap, 0xff, s->bitmap_size); - bdrv_pwrite(s->hd, bitmap_offset, bitmap, s->bitmap_size); + bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); } // printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n", @@ -316,7 +311,7 @@ static int rewrite_footer(BlockDriverState* bs) BDRVVPCState *s = bs->opaque; int64_t offset = s->free_data_block_offset; - ret = bdrv_pwrite(s->hd, offset, s->footer_buf, HEADER_SIZE); + ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE); if (ret < 0) return ret; @@ -351,7 +346,8 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) // Initialize the block's bitmap memset(bitmap, 0xff, s->bitmap_size); - bdrv_pwrite(s->hd, s->free_data_block_offset, bitmap, s->bitmap_size); + bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap, + s->bitmap_size); // Write new footer (the old one will be overwritten) s->free_data_block_offset += s->block_size + s->bitmap_size; @@ -362,7 +358,7 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) // Write BAT entry to disk bat_offset = s->bat_offset + (4 * index); bat_value = be32_to_cpu(s->pagetable[index]); - ret = bdrv_pwrite(s->hd, bat_offset, &bat_value, 4); + ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4); if (ret < 0) goto fail; @@ -379,21 +375,30 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, BDRVVPCState *s = bs->opaque; int ret; int64_t offset; + int64_t sectors, sectors_per_block; while (nb_sectors > 0) { offset = get_sector_offset(bs, sector_num, 0); + sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; + sectors = sectors_per_block - (sector_num % sectors_per_block); + if (sectors > nb_sectors) { + sectors = nb_sectors; + } + if (offset == -1) { - memset(buf, 0, 512); + memset(buf, 0, sectors * BDRV_SECTOR_SIZE); } else { - ret = bdrv_pread(s->hd, offset, buf, 512); - if (ret != 512) + ret = bdrv_pread(bs->file, offset, buf, + sectors * BDRV_SECTOR_SIZE); + if (ret != sectors * BDRV_SECTOR_SIZE) { return -1; + } } - nb_sectors--; - sector_num++; - buf += 512; + nb_sectors -= sectors; + sector_num += sectors; + buf += sectors * BDRV_SECTOR_SIZE; } return 0; } @@ -403,24 +408,32 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, { BDRVVPCState *s = bs->opaque; int64_t offset; + int64_t sectors, sectors_per_block; int ret; while (nb_sectors > 0) { offset = get_sector_offset(bs, sector_num, 1); + sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; + sectors = sectors_per_block - (sector_num % sectors_per_block); + if (sectors > nb_sectors) { + sectors = nb_sectors; + } + if (offset == -1) { offset = alloc_block(bs, sector_num); if (offset < 0) return -1; } - ret = bdrv_pwrite(s->hd, offset, buf, 512); - if (ret != 512) + ret = bdrv_pwrite(bs->file, offset, buf, sectors * BDRV_SECTOR_SIZE); + if (ret != sectors * BDRV_SECTOR_SIZE) { return -1; + } - nb_sectors--; - sector_num++; - buf += 512; + nb_sectors -= sectors; + sector_num += sectors; + buf += sectors * BDRV_SECTOR_SIZE; } return 0; @@ -470,9 +483,7 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, } } - // Note: Rounding up deviates from the Virtual PC behaviour - // However, we need this to avoid truncating images in qemu-img convert - *cyls = (cyls_times_heads + *heads - 1) / *heads; + *cyls = cyls_times_heads / *heads; return 0; } @@ -484,9 +495,9 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) struct vhd_dyndisk_header* dyndisk_header = (struct vhd_dyndisk_header*) buf; int fd, i; - uint16_t cyls; - uint8_t heads; - uint8_t secs_per_cyl; + uint16_t cyls = 0; + uint8_t heads = 0; + uint8_t secs_per_cyl = 0; size_t block_size, num_bat_entries; int64_t total_sectors = 0; @@ -503,18 +514,23 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) if (fd < 0) return -EIO; - // Calculate matching total_size and geometry - if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl)) - return -EFBIG; + /* Calculate matching total_size and geometry. Increase the number of + sectors requested until we get enough (or fail). */ + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + if (calculate_geometry(total_sectors + i, + &cyls, &heads, &secs_per_cyl)) { + return -EFBIG; + } + } total_sectors = (int64_t) cyls * heads * secs_per_cyl; // Prepare the Hard Disk Footer memset(buf, 0, 1024); - strncpy(footer->creator, "conectix", 8); + memcpy(footer->creator, "conectix", 8); // TODO Check if "qemu" creator_app is ok for VPC - strncpy(footer->creator_app, "qemu", 4); - strncpy(footer->creator_os, "Wi2k", 4); + memcpy(footer->creator_app, "qemu", 4); + memcpy(footer->creator_os, "Wi2k", 4); footer->features = be32_to_cpu(0x02); footer->version = be32_to_cpu(0x00010000); @@ -563,7 +579,7 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) // Prepare the Dynamic Disk Header memset(buf, 0, 1024); - strncpy(dyndisk_header->magic, "cxsparse", 8); + memcpy(dyndisk_header->magic, "cxsparse", 8); dyndisk_header->data_offset = be64_to_cpu(0xFFFFFFFF); dyndisk_header->table_offset = be64_to_cpu(3 * 512); @@ -590,7 +606,6 @@ static void vpc_close(BlockDriverState *bs) #ifdef CACHE qemu_free(s->pageentry_u8); #endif - bdrv_delete(s->hd); } static QEMUOptionParameter vpc_create_options[] = { diff --git a/block/vvfat.c b/block/vvfat.c index de50dc7..6d61c2e 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -512,7 +512,7 @@ static inline uint8_t fat_chksum(const direntry_t* entry) for(i=0;i<11;i++) { unsigned char c; - c = (i < 8) ? entry->name[i] : entry->extension[i-8]; + c = (i <= 8) ? entry->name[i] : entry->extension[i-8]; chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c; } @@ -2799,8 +2799,11 @@ static int enable_write_target(BDRVVVFATState *s) if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0) return -1; s->qcow = bdrv_new(""); - if (s->qcow == NULL || bdrv_open(s->qcow, s->qcow_filename, 0) < 0) + if (s->qcow == NULL || + bdrv_open(s->qcow, s->qcow_filename, BDRV_O_RDWR, bdrv_qcow) < 0) + { return -1; + } #ifndef _WIN32 unlink(s->qcow_filename); @@ -2828,7 +2831,7 @@ static void vvfat_close(BlockDriverState *bs) static BlockDriver bdrv_vvfat = { .format_name = "vvfat", .instance_size = sizeof(BDRVVVFATState), - .bdrv_open = vvfat_open, + .bdrv_file_open = vvfat_open, .bdrv_read = vvfat_read, .bdrv_write = vvfat_write, .bdrv_close = vvfat_close, @@ -2866,7 +2869,7 @@ static void checkpoint(void) { return; /* avoid compiler warnings: */ hexdump(NULL, 100); - remove_mapping(vvv, NULL); + remove_mapping(vvv, 0); print_mapping(NULL); print_direntry(NULL); } -- cgit v1.1