From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:35:08 -0400 Subject: blktrace: from-sector redundant in trace_block_remap Remove redundant from-sector parameter: it's /always/ the bio's sector passed in. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF517C.7000503@hp.com> Signed-off-by: Ingo Molnar --- drivers/md/dm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a994be..b01514a 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* the bio has been remapped so dispatch it */ trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, - clone->bi_sector, sector); + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { -- cgit v1.1 From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:49 -0400 Subject: block: Do away with the notion of hardsect_size Until now we have had a 1:1 mapping between storage device physical block size and the logical block sized used when addressing the device. With SATA 4KB drives coming out that will no longer be the case. The sector size will be 4KB but the logical block size will remain 512-bytes. Hence we need to distinguish between the physical block size and the logical ditto. This patch renames hardsect_size to logical_block_size. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/md/bitmap.c | 4 ++-- drivers/md/dm-exception-store.c | 2 +- drivers/md/dm-log.c | 3 ++- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-table.c | 12 +++++++----- drivers/md/md.c | 2 +- 6 files changed, 14 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 47c68bc..06b0ded 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, target = rdev->sb_start + offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev->bdev, target, - roundup(size, bdev_hardsect_size(rdev->bdev)), + roundup(size, bdev_logical_block_size(rdev->bdev)), page, READ)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will @@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) int size = PAGE_SIZE; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_hardsect_size(rdev->bdev)); + bdev_logical_block_size(rdev->bdev)); /* Just make sure we aren't corrupting data or * metadata */ diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c2..75d8081 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, } /* Validate the chunk size against the device block size */ - if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { + if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc..6fa8ccf 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -413,7 +413,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, * Buffer holds both header and bitset. */ buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + - bitset_size, ti->limits.hardsect_size); + bitset_size, + ti->limits.logical_block_size); if (buf_size > dev->bdev->bd_inode->i_size) { DMWARN("log device %s too small: need %llu bytes", diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd..2662a41 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) */ if (!ps->store->chunk_size) { ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->store->cow->bdev) >> 9); + bdev_logical_block_size(ps->store->cow->bdev) >> 9); ps->store->chunk_mask = ps->store->chunk_size - 1; ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b..65e2d97 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -108,7 +108,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs, lhs->max_hw_segments = min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); - lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); + lhs->logical_block_size = max(lhs->logical_block_size, + rhs->logical_block_size); lhs->max_segment_size = min_not_zero(lhs->max_segment_size, rhs->max_segment_size); @@ -529,7 +530,8 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_hw_segments = min_not_zero(rs->max_hw_segments, q->max_hw_segments); - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + rs->logical_block_size = max(rs->logical_block_size, + queue_logical_block_size(q)); rs->max_segment_size = min_not_zero(rs->max_segment_size, q->max_segment_size); @@ -683,8 +685,8 @@ static void check_for_valid_limits(struct io_restrictions *rs) rs->max_phys_segments = MAX_PHYS_SEGMENTS; if (!rs->max_hw_segments) rs->max_hw_segments = MAX_HW_SEGMENTS; - if (!rs->hardsect_size) - rs->hardsect_size = 1 << SECTOR_SHIFT; + if (!rs->logical_block_size) + rs->logical_block_size = 1 << SECTOR_SHIFT; if (!rs->max_segment_size) rs->max_segment_size = MAX_SEGMENT_SIZE; if (!rs->seg_boundary_mask) @@ -914,7 +916,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_sectors(q, t->limits.max_sectors); q->max_phys_segments = t->limits.max_phys_segments; q->max_hw_segments = t->limits.max_hw_segments; - q->hardsect_size = t->limits.hardsect_size; + q->logical_block_size = t->limits.logical_block_size; q->max_segment_size = t->limits.max_segment_size; q->max_hw_sectors = t->limits.max_hw_sectors; q->seg_boundary_mask = t->limits.seg_boundary_mask; diff --git a/drivers/md/md.c b/drivers/md/md.c index fccc834..4cbc19f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1202,7 +1202,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; -- cgit v1.1 From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:50 -0400 Subject: block: Use accessor functions for queue limits Convert all external users of queue limits to using wrapper functions instead of poking the request queue variables directly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 28 ++++++++++++++-------------- drivers/md/linear.c | 2 +- drivers/md/multipath.c | 4 ++-- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5.c | 4 ++-- 7 files changed, 26 insertions(+), 26 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 65e2d97..e9a73bb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -510,7 +510,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) * combine_restrictions_low() */ rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + min_not_zero(rs->max_sectors, queue_max_sectors(q)); /* * Check if merge fn is supported. @@ -525,25 +525,25 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_phys_segments = min_not_zero(rs->max_phys_segments, - q->max_phys_segments); + queue_max_phys_segments(q)); rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); + min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q)); rs->logical_block_size = max(rs->logical_block_size, queue_logical_block_size(q)); rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); + min_not_zero(rs->max_segment_size, queue_max_segment_size(q)); rs->max_hw_sectors = - min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); + min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q)); rs->seg_boundary_mask = min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); + queue_segment_boundary(q)); - rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); + rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q)); rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); } @@ -914,13 +914,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) * restrictions. */ blk_queue_max_sectors(q, t->limits.max_sectors); - q->max_phys_segments = t->limits.max_phys_segments; - q->max_hw_segments = t->limits.max_hw_segments; - q->logical_block_size = t->limits.logical_block_size; - q->max_segment_size = t->limits.max_segment_size; - q->max_hw_sectors = t->limits.max_hw_sectors; - q->seg_boundary_mask = t->limits.seg_boundary_mask; - q->bounce_pfn = t->limits.bounce_pfn; + blk_queue_max_phys_segments(q, t->limits.max_phys_segments); + blk_queue_max_hw_segments(q, t->limits.max_hw_segments); + blk_queue_logical_block_size(q, t->limits.logical_block_size); + blk_queue_max_segment_size(q, t->limits.max_segment_size); + blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); + blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); + blk_queue_bounce_limit(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38..64f1f3e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->num_sectors = rdev->sectors; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0c..4ee31aa 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * merge_bvec_fn will be involved in multipath.) */ if (q->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(q) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; @@ -467,7 +467,7 @@ static int multipath_run (mddev_t *mddev) * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!test_bit(Faulty, &rdev->flags)) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d755..925507e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -144,7 +144,7 @@ static int create_strip_zones (mddev_t *mddev) */ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!smallest || (rdev1->sectors < smallest->sectors)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df910..e23758b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; @@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620a..750550c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; @@ -2145,8 +2145,8 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4616bc3..7970dc8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3463,10 +3463,10 @@ static int bio_fits_rdev(struct bio *bi) { struct request_queue *q = bdev_get_queue(bi->bi_bdev); - if ((bi->bi_size>>9) > q->max_sectors) + if ((bi->bi_size>>9) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments) + if (bi->bi_phys_segments > queue_max_phys_segments(q)) return 0; if (q->merge_bvec_fn) -- cgit v1.1 From 62e1e389f87a8839ad83b08c44691d1df8320846 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:40:59 +1000 Subject: md: always update level / chunk_size / layout when writing v1.x metadata. We previously didn't update these fields when writing the metadata because they could never change. They can now, so we better write them. v0.90 metadata always updated these fields. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index fccc834..aa79d55 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1375,6 +1375,9 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) sb->raid_disks = cpu_to_le32(mddev->raid_disks); sb->size = cpu_to_le64(mddev->dev_sectors); + sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9); + sb->level = cpu_to_le32(mddev->level); + sb->layout = cpu_to_le32(mddev->layout); if (mddev->bitmap && mddev->bitmap_file == NULL) { sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); -- cgit v1.1 From 2b69c83924396ad1eda36fdd267c9d2f360f5555 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:41:17 +1000 Subject: md: improve errno return when setting array_size Instead of always returns EINVAL if anything goes wrong when setting the array size, add the option of E2BIG if the size requested is too large. This makes it easier for user-space to be sure what went wrong. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index aa79d55..58e0b02 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3683,7 +3683,7 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) if (strict_blocks_to_sectors(buf, §ors) < 0) return -EINVAL; if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -EINVAL; + return -E2BIG; mddev->external_size = 1; } -- cgit v1.1 From be512691036cc989c11d0f418187efbbf14468e6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:41:17 +1000 Subject: md: bitmap: improve bitmap maintenance code. The code for checking which bits in the bitmap can be cleared has 2 problems: 1/ it repeatedly takes and drops a spinlock, where it would make more sense to just hold on to it most of the time. 2/ it doesn't make use of some opportunities to skip large sections of the bitmap This patch fixes those. It will only affect CPU consumption, not correctness. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 47c68bc..56df1ce 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1097,14 +1097,12 @@ void bitmap_daemon_work(struct bitmap *bitmap) } bitmap->allclean = 1; + spin_lock_irqsave(&bitmap->lock, flags); for (j = 0; j < bitmap->chunks; j++) { bitmap_counter_t *bmc; - spin_lock_irqsave(&bitmap->lock, flags); - if (!bitmap->filemap) { + if (!bitmap->filemap) /* error or shutdown */ - spin_unlock_irqrestore(&bitmap->lock, flags); break; - } page = filemap_get_page(bitmap, j); @@ -1121,6 +1119,8 @@ void bitmap_daemon_work(struct bitmap *bitmap) write_page(bitmap, page, 0); bitmap->allclean = 0; } + spin_lock_irqsave(&bitmap->lock, flags); + j |= (PAGE_BITS - 1); continue; } @@ -1181,9 +1181,10 @@ void bitmap_daemon_work(struct bitmap *bitmap) ext2_clear_bit(file_page_offset(j), paddr); kunmap_atomic(paddr, KM_USER0); } - } - spin_unlock_irqrestore(&bitmap->lock, flags); + } else + j |= PAGE_COUNTER_MASK; } + spin_unlock_irqrestore(&bitmap->lock, flags); /* now sync the final page */ if (lastpage != NULL) { -- cgit v1.1 From b6a9ce688f613e2ee5f15e6720e0bb8520efc36e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 09:41:17 +1000 Subject: md: export 'frozen' resync state through sysfs The md resync engine has a 'frozen' state which ensures that no resync/recovery. This is used to avoid races. Export this state through the 'sync_action' sysfs attribute so that user-space can benefit and also avoid some races. Signed-off-by: NeilBrown --- drivers/md/md.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 58e0b02..384e4f0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3306,7 +3306,9 @@ static ssize_t action_show(mddev_t *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + type = "frozen"; + else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) type = "reshape"; @@ -3329,7 +3331,12 @@ action_store(mddev_t *mddev, const char *page, size_t len) if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (cmd_match(page, "idle")) { + if (cmd_match(page, "frozen")) + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + else + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_unregister_thread(mddev->sync_thread); -- cgit v1.1 From 848b3182365fdf5a05bcd5ed949071cac2c894b3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 12:41:08 +1000 Subject: md: raid5: avoid sector values going negative when testing reshape progress. As sector_t in unsigned, we cannot afford to let 'safepos' etc go negative. So replace a -= b; by a -= min(b,a); Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4616bc3..3c3626d 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3811,13 +3811,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= reshape_sectors; + writepos -= min(reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; - readpos -= reshape_sectors; - safepos -= reshape_sectors; + readpos -= min(reshape_sectors, readpos); + safepos -= min(reshape_sectors, safepos); } /* 'writepos' is the most advanced device address we might write. -- cgit v1.1 From 7a91ee1f628ef6bfe3f13067c0ddf9db520cb86b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 12:57:21 +1000 Subject: md: don't update curr_resync_completed without also updating reshape_position. In order for the metadata to always be consistent, we mustn't updated curr_resync_completed without also updating reshape_position. The reshape code updates both at the same time. However since commit 97e4f42d62badb0f9fbc27c013e89bc1336a03bc the common md_do_sync will sometimes update curr_resync_completed but is not in a position to update reshape_position. So if MD_RECOVERY_RESHAPE is set (indicating that a reshape is happening, so reshape_position might change), don't update curr_resync_completed in md_do_sync, leave it to the per-personality reshape code. Signed-off-by: NeilBrown --- drivers/md/md.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 384e4f0..9544565 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6362,12 +6362,13 @@ void md_do_sync(mddev_t *mddev) skipped = 0; - if ((mddev->curr_resync > mddev->curr_resync_completed && - (mddev->curr_resync - mddev->curr_resync_completed) - > (max_sectors >> 4)) || - (j - mddev->curr_resync_completed)*2 - >= mddev->resync_max - mddev->curr_resync_completed - ) { + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + ((mddev->curr_resync > mddev->curr_resync_completed && + (mddev->curr_resync - mddev->curr_resync_completed) + > (max_sectors >> 4)) || + (j - mddev->curr_resync_completed)*2 + >= mddev->resync_max - mddev->curr_resync_completed + )) { /* time to update curr_resync_completed */ blk_unplug(mddev->queue); wait_event(mddev->recovery_wait, -- cgit v1.1 From b492b852cd8c99505708152c29a5e09a787af9de Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 26 May 2009 12:57:36 +1000 Subject: md: don't use locked_ioctl. md has no need for the BKL - it does its own locking. So md_ioctl doesn't need to be a locked_ioctl. Signed-off-by: NeilBrown --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9544565..641b211 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5567,7 +5567,7 @@ static struct block_device_operations md_fops = .owner = THIS_MODULE, .open = md_open, .release = md_release, - .locked_ioctl = md_ioctl, + .ioctl = md_ioctl, .getgeo = md_getgeo, .media_changed = md_media_changed, .revalidate_disk= md_revalidate, -- cgit v1.1 From ed37d83e6aa218192fb28bb6b82498d2a8c74070 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 May 2009 21:39:05 +1000 Subject: md: raid5: change incorrect usage of 'min' macro to 'min_t' A recent patch to raid5.c use min on an int and a sector_t. This isn't allowed. So change it to min_t(sector_t,x,y). Signed-off-by: NeilBrown --- drivers/md/raid5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3c3626d..5d400ae 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3811,13 +3811,13 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped safepos = conf->reshape_safe; sector_div(safepos, data_disks); if (mddev->delta_disks < 0) { - writepos -= min(reshape_sectors, writepos); + writepos -= min_t(sector_t, reshape_sectors, writepos); readpos += reshape_sectors; safepos += reshape_sectors; } else { writepos += reshape_sectors; - readpos -= min(reshape_sectors, readpos); - safepos -= min(reshape_sectors, safepos); + readpos -= min_t(sector_t, reshape_sectors, readpos); + safepos -= min_t(sector_t, reshape_sectors, safepos); } /* 'writepos' is the most advanced device address we might write. -- cgit v1.1 From a05c0205ba031c01bba33a21bf0a35920eb64833 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 3 Jun 2009 09:33:18 +0200 Subject: block: Fix bounce limit setting in DM blk_queue_bounce_limit() is more than a wrapper about the request queue limits.bounce_pfn variable. Introduce blk_queue_bounce_pfn() which can be called by stacking drivers that wish to set the bounce limit explicitly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index e9a73bb..3ca1604 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_segment_size(q, t->limits.max_segment_size); blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); - blk_queue_bounce_limit(q, t->limits.bounce_pfn); + blk_queue_bounce_pfn(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); -- cgit v1.1 From 9df1bb9b516daeece159ab7fb262d01a0359247c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Jun 2009 06:22:57 +0200 Subject: Revert "block: Fix bounce limit setting in DM" This reverts commit a05c0205ba031c01bba33a21bf0a35920eb64833. DM doesn't need to access the bounce_pfn directly. Signed-off-by: Jens Axboe --- drivers/md/dm-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 3ca1604..e9a73bb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_segment_size(q, t->limits.max_segment_size); blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); - blk_queue_bounce_pfn(q, t->limits.bounce_pfn); + blk_queue_bounce_limit(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); -- cgit v1.1 From f001a70cdc61c01452d42e8b32fd7c7842ef62d5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Jun 2009 14:30:31 +1000 Subject: md/raid5: use conf->raid_disks in preference to mddev->raid_disk mddev->raid_disks can be changed and any time by a request from user-space. It is a suggestion as to what number of raid_disks is desired. conf->raid_disks can only be changed by the raid5 module with suitable locks in place. It is a statement as to the current number of raid_disks. There are two places where the latter should be used, but the former is used. This can lead to a crash when reshaping an array. This patch changes to mddev-> to conf-> Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d400ae..75469e6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3288,7 +3288,7 @@ static void unplug_slaves(mddev_t *mddev) int i; rcu_read_lock(); - for (i=0; iraid_disks; i++) { + for (i = 0; i < conf->raid_disks; i++) { mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { struct request_queue *r_queue = bdev_get_queue(rdev->bdev); @@ -4034,7 +4034,7 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski * We don't need to check the 'failed' flag as when that gets set, * recovery aborts. */ - for (i=0; iraid_disks; i++) + for (i = 0; i < conf->raid_disks; i++) if (conf->disks[i].rdev == NULL) still_degraded = 1; -- cgit v1.1 From a8c906ca3f63d05f0d25490cf82276f73c6fe095 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Jun 2009 14:39:59 +1000 Subject: md/raid5 - avoid deadlocks in get_active_stripe during reshape md has functionality to 'quiesce' and array so that all pending IO completed and no new IO starts. This is used to achieve a stable state before making internal changes. Currently this quiescing applies equally to normal IO, resync IO, and reshape IO. However there is a problem with applying it to reshape IO. Reshape can have multiple 'stripe_heads' that must be active together. If the quiesce come between allocating the first and the last of such a collection, then we deadlock, as the last will not be allocated until the quiesce is lifted, the quiesce will not be lifted until the first (which has been allocated) gets used, and that first cannot be used until the last is allocated. It is not necessary to inhibit reshape IO when a quiesce is requested. Those places in the code that require a full quiesce will ensure the reshape thread is not running at all. So allow reshape requests to get access to new stripe_heads without being blocked by a 'quiesce'. This only affects in-place reshapes (i.e. where the array does not grow or shrink) and these are only newly supported. So this patch is not needed in earlier kernels. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 75469e6..59f2ec0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -362,7 +362,7 @@ static void raid5_unplug_device(struct request_queue *q); static struct stripe_head * get_active_stripe(raid5_conf_t *conf, sector_t sector, - int previous, int noblock) + int previous, int noblock, int noquiesce) { struct stripe_head *sh; @@ -372,7 +372,7 @@ get_active_stripe(raid5_conf_t *conf, sector_t sector, do { wait_event_lock_irq(conf->wait_for_stripe, - conf->quiesce == 0, + conf->quiesce == 0 || noquiesce, conf->device_lock, /* nothing */); sh = __find_stripe(conf, sector, conf->generation - previous); if (!sh) { @@ -2671,7 +2671,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, sector_t bn = compute_blocknr(sh, i, 1); sector_t s = raid5_compute_sector(conf, bn, 0, &dd_idx, NULL); - sh2 = get_active_stripe(conf, s, 0, 1); + sh2 = get_active_stripe(conf, s, 0, 1, 1); if (sh2 == NULL) /* so far only the early blocks of this stripe * have been requested. When later blocks @@ -2944,7 +2944,7 @@ static bool handle_stripe5(struct stripe_head *sh) /* Finish reconstruct operations initiated by the expansion process */ if (sh->reconstruct_state == reconstruct_state_result) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3189,7 +3189,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { struct stripe_head *sh2 - = get_active_stripe(conf, sh->sector, 1, 1); + = get_active_stripe(conf, sh->sector, 1, 1, 1); if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { /* sh cannot be written until sh2 has been read. * so arrange for sh to be delayed a little @@ -3675,7 +3675,7 @@ static int make_request(struct request_queue *q, struct bio * bi) (unsigned long long)logical_sector); sh = get_active_stripe(conf, new_sector, previous, - (bi->bi_rw&RWA_MASK)); + (bi->bi_rw&RWA_MASK), 0); if (sh) { if (unlikely(previous)) { /* expansion might have moved on while waiting for a @@ -3873,7 +3873,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { int j; int skipped = 0; - sh = get_active_stripe(conf, stripe_addr+i, 0, 0); + sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); set_bit(STRIPE_EXPANDING, &sh->state); atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old @@ -3922,7 +3922,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped if (last_sector >= mddev->dev_sectors) last_sector = mddev->dev_sectors - 1; while (first_sector <= last_sector) { - sh = get_active_stripe(conf, first_sector, 1, 0); + sh = get_active_stripe(conf, first_sector, 1, 0, 1); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -4022,9 +4022,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski bitmap_cond_end_sync(mddev->bitmap, sector_nr); - sh = get_active_stripe(conf, sector_nr, 0, 1); + sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { - sh = get_active_stripe(conf, sector_nr, 0, 0); + sh = get_active_stripe(conf, sector_nr, 0, 0, 0); /* make sure we don't swamp the stripe cache if someone else * is trying to get access */ @@ -4086,7 +4086,7 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = get_active_stripe(conf, sector, 0, 1); + sh = get_active_stripe(conf, sector, 0, 1, 0); if (!sh) { /* failed to get a stripe - must wait */ -- cgit v1.1 From 0e6e0271a210817e202c8a4bfffbde3e3c0616d1 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Jun 2009 16:32:22 +1000 Subject: md/raid5: fix bug in reshape code when chunk_size decreases. Now that we support changing the chunksize, we calculate "reshape_sectors" to be the max of number of sectors in old and new chunk size. However there is one please where we still use 'chunksize' rather than 'reshape_sectors'. This causes a reshape that reduces the size of chunks to freeze. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 59f2ec0..bb37fb1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3916,7 +3916,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped raid5_compute_sector(conf, stripe_addr*(new_data_disks), 1, &dd_idx, NULL); last_sector = - raid5_compute_sector(conf, ((stripe_addr+conf->chunk_size/512) + raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) *(new_data_disks) - 1), 1, &dd_idx, NULL); if (last_sector >= mddev->dev_sectors) -- cgit v1.1 From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the deivce from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace do the convertion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- drivers/md/dm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e2ee4a7..3fd8b1e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,7 +20,8 @@ #include #include #include -#include + +#include #define DM_MSG_PREFIX "core" @@ -53,8 +54,6 @@ struct dm_target_io { union map_info info; }; -DEFINE_TRACE(block_bio_complete); - /* * For request-based dm. * One of these is allocated per request. -- cgit v1.1