diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile | 2 | ||||
-rw-r--r-- | block/as-iosched.c | 10 | ||||
-rw-r--r-- | block/blk-barrier.c | 31 | ||||
-rw-r--r-- | block/blk-core.c | 167 | ||||
-rw-r--r-- | block/blk-iopoll.c | 227 | ||||
-rw-r--r-- | block/blk-merge.c | 51 | ||||
-rw-r--r-- | block/blk-settings.c | 21 | ||||
-rw-r--r-- | block/blk-sysfs.c | 9 | ||||
-rw-r--r-- | block/blk.h | 1 | ||||
-rw-r--r-- | block/cfq-iosched.c | 82 | ||||
-rw-r--r-- | block/elevator.c | 16 | ||||
-rw-r--r-- | block/genhd.c | 24 | ||||
-rw-r--r-- | block/ioctl.c | 49 |
13 files changed, 517 insertions, 173 deletions
diff --git a/block/Makefile b/block/Makefile index 6c54ed0..ba74ca6 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6..ce8ba57 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 30022b4..6593ab3 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -348,6 +348,9 @@ static void blkdev_discard_end_io(struct bio *bio, int err) clear_bit(BIO_UPTODATE, &bio->bi_flags); } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); } @@ -357,21 +360,20 @@ static void blkdev_discard_end_io(struct bio *bio, int err) * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: DISCARD_FL_* flags to control behaviour * * Description: - * Issue a discard request for the sectors in question. Does not wait. + * Issue a discard request for the sectors in question. */ -int blkdev_issue_discard(struct block_device *bdev, - sector_t sector, sector_t nr_sects, gfp_t gfp_mask) +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags) { - struct request_queue *q; - struct bio *bio; + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = flags & DISCARD_FL_BARRIER ? + DISCARD_BARRIER : DISCARD_NOBARRIER; int ret = 0; - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); if (!q) return -ENXIO; @@ -379,12 +381,14 @@ int blkdev_issue_discard(struct block_device *bdev, return -EOPNOTSUPP; while (nr_sects && !ret) { - bio = bio_alloc(gfp_mask, 0); + struct bio *bio = bio_alloc(gfp_mask, 0); if (!bio) return -ENOMEM; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; + if (flags & DISCARD_FL_WAIT) + bio->bi_private = &wait; bio->bi_sector = sector; @@ -396,10 +400,13 @@ int blkdev_issue_discard(struct block_device *bdev, bio->bi_size = nr_sects << 9; nr_sects = 0; } + bio_get(bio); - submit_bio(DISCARD_BARRIER, bio); + submit_bio(type, bio); + + if (flags & DISCARD_FL_WAIT) + wait_for_completion(&wait); - /* Check if it failed immediately */ if (bio_flagged(bio, BIO_EOPNOTSUPP)) ret = -EOPNOTSUPP; else if (!bio_flagged(bio, BIO_UPTODATE)) diff --git a/block/blk-core.c b/block/blk-core.c index e3299a7..8135228 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -69,7 +69,7 @@ static void drive_stat_acct(struct request *rq, int new_io) part_stat_inc(cpu, part, merges[rw]); else { part_round_stats(cpu, part); - part_inc_in_flight(part); + part_inc_in_flight(part, rw); } part_stat_unlock(); @@ -501,6 +501,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.name = "block"; err = bdi_init(&q->backing_dev_info); if (err) { @@ -1030,7 +1031,7 @@ static void part_round_stats_single(int cpu, struct hd_struct *part, if (part->in_flight) { __part_stat_add(cpu, part, time_in_queue, - part->in_flight * (now - part->stamp)); + part_in_flight(part) * (now - part->stamp)); __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); } part->stamp = now; @@ -1111,31 +1112,27 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_type = REQ_TYPE_FS; /* - * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) + * Inherit FAILFAST from bio (for read-ahead, and explicit + * FAILFAST). FAILFAST flags are identical for req and bio. */ - if (bio_rw_ahead(bio)) - req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | - REQ_FAILFAST_DRIVER); - if (bio_failfast_dev(bio)) - req->cmd_flags |= REQ_FAILFAST_DEV; - if (bio_failfast_transport(bio)) - req->cmd_flags |= REQ_FAILFAST_TRANSPORT; - if (bio_failfast_driver(bio)) - req->cmd_flags |= REQ_FAILFAST_DRIVER; - - if (unlikely(bio_discard(bio))) { + if (bio_rw_flagged(bio, BIO_RW_AHEAD)) + req->cmd_flags |= REQ_FAILFAST_MASK; + else + req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; + + if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { req->cmd_flags |= REQ_DISCARD; - if (bio_barrier(bio)) + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_SOFTBARRIER; req->q->prepare_discard_fn(req->q, req); - } else if (unlikely(bio_barrier(bio))) + } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) req->cmd_flags |= REQ_HARDBARRIER; - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; - if (bio_rw_meta(bio)) + if (bio_rw_flagged(bio, BIO_RW_META)) req->cmd_flags |= REQ_RW_META; - if (bio_noidle(bio)) + if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; @@ -1150,7 +1147,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); + return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); } static int __make_request(struct request_queue *q, struct bio *bio) @@ -1159,11 +1156,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret; unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); - const int sync = bio_sync(bio); - const int unplug = bio_unplug(bio); + const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); + const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); + const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; int rw_flags; - if (bio_barrier(bio) && bio_has_data(bio) && + if (bio_rw_flagged(bio, BIO_RW_BARRIER) && bio_has_data(bio) && (q->next_ordered == QUEUE_ORDERED_NONE)) { bio_endio(bio, -EOPNOTSUPP); return 0; @@ -1177,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) spin_lock_irq(q->queue_lock); - if (unlikely(bio_barrier(bio)) || elv_queue_empty(q)) + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1190,6 +1188,9 @@ static int __make_request(struct request_queue *q, struct bio *bio) trace_block_bio_backmerge(q, bio); + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bytes; @@ -1209,6 +1210,12 @@ static int __make_request(struct request_queue *q, struct bio *bio) trace_block_bio_frontmerge(q, bio); + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { + blk_rq_set_mixed_merge(req); + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= ff; + } + bio->bi_next = req->bio; req->bio = bio; @@ -1456,19 +1463,20 @@ static inline void __generic_make_request(struct bio *bio) if (old_sector != -1) trace_block_remap(q, bio, old_dev, old_sector); - trace_block_bio_queue(q, bio); - old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_discard(bio) && !q->prepare_discard_fn) { + if (bio_rw_flagged(bio, BIO_RW_DISCARD) && + !q->prepare_discard_fn) { err = -EOPNOTSUPP; goto end_io; } + trace_block_bio_queue(q, bio); + ret = q->make_request_fn(q, bio); } while (ret); @@ -1653,6 +1661,50 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); +/** + * blk_rq_err_bytes - determine number of bytes till the next failure boundary + * @rq: request to examine + * + * Description: + * A request could be merge of IOs which require different failure + * handling. This function determines the number of bytes which + * can be failed from the beginning of the request without + * crossing into area which need to be retried further. + * + * Return: + * The number of bytes to fail. + * + * Context: + * queue_lock must be held. + */ +unsigned int blk_rq_err_bytes(const struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + unsigned int bytes = 0; + struct bio *bio; + + if (!(rq->cmd_flags & REQ_MIXED_MERGE)) + return blk_rq_bytes(rq); + + /* + * Currently the only 'mixing' which can happen is between + * different fastfail types. We can safely fail portions + * which have all the failfast bits that the first one has - + * the ones which are at least as eager to fail as the first + * one. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + if ((bio->bi_rw & ff) != ff) + break; + bytes += bio->bi_size; + } + + /* this could lead to infinite loop */ + BUG_ON(blk_rq_bytes(rq) && !bytes); + return bytes; +} +EXPORT_SYMBOL_GPL(blk_rq_err_bytes); + static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { @@ -1686,7 +1738,7 @@ static void blk_account_io_done(struct request *req) part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rw); part_stat_unlock(); } @@ -1806,8 +1858,15 @@ void blk_dequeue_request(struct request *rq) * and to it is freed is accounted as io that is in progress at * the driver side. */ - if (blk_account_rq(rq)) + if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; + /* + * Mark this device as supporting hardware queuing, if + * we have more IOs in flight than 4. + */ + if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) + set_bit(QUEUE_FLAG_CQ, &q->queue_flags); + } } /** @@ -1999,6 +2058,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (blk_fs_request(req) || blk_discard_rq(req)) req->__sector += total_bytes >> 9; + /* mixed attributes always follow the first bio */ + if (req->cmd_flags & REQ_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; + } + /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. @@ -2178,6 +2243,25 @@ bool blk_end_request_cur(struct request *rq, int error) EXPORT_SYMBOL(blk_end_request_cur); /** + * blk_end_request_err - Finish a request till the next failure boundary. + * @rq: the request to finish till the next failure boundary for + * @error: must be negative errno + * + * Description: + * Complete @rq till the next failure boundary. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool blk_end_request_err(struct request *rq, int error) +{ + WARN_ON(error >= 0); + return blk_end_request(rq, error, blk_rq_err_bytes(rq)); +} +EXPORT_SYMBOL_GPL(blk_end_request_err); + +/** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed * @error: %0 for success, < %0 for error @@ -2236,12 +2320,31 @@ bool __blk_end_request_cur(struct request *rq, int error) } EXPORT_SYMBOL(__blk_end_request_cur); +/** + * __blk_end_request_err - Finish a request till the next failure boundary. + * @rq: the request to finish till the next failure boundary for + * @error: must be negative errno + * + * Description: + * Complete @rq till the next failure boundary. Must be called + * with queue lock held. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool __blk_end_request_err(struct request *rq, int error) +{ + WARN_ON(error >= 0); + return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); +} +EXPORT_SYMBOL_GPL(__blk_end_request_err); + void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { - /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and - we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */ - rq->cmd_flags |= (bio->bi_rw & 3); + /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ + rq->cmd_flags |= bio->bi_rw & REQ_RW; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c new file mode 100644 index 0000000..ca56420 --- /dev/null +++ b/block/blk-iopoll.c @@ -0,0 +1,227 @@ +/* + * Functions related to interrupt-poll handling in the block layer. This + * is similar to NAPI for network devices. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/bio.h> +#include <linux/blkdev.h> +#include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/blk-iopoll.h> +#include <linux/delay.h> + +#include "blk.h" + +int blk_iopoll_enabled = 1; +EXPORT_SYMBOL(blk_iopoll_enabled); + +static unsigned int blk_iopoll_budget __read_mostly = 256; + +static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); + +/** + * blk_iopoll_sched - Schedule a run of the iopoll handler + * @iop: The parent iopoll structure + * + * Description: + * Add this blk_iopoll structure to the pending poll list and trigger the + * raise of the blk iopoll softirq. The driver must already have gotten a + * succesful return from blk_iopoll_sched_prep() before calling this. + **/ +void blk_iopoll_sched(struct blk_iopoll *iop) +{ + unsigned long flags; + + local_irq_save(flags); + list_add_tail(&iop->list, &__get_cpu_var(blk_cpu_iopoll)); + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_iopoll_sched); + +/** + * __blk_iopoll_complete - Mark this @iop as un-polled again + * @iop: The parent iopoll structure + * + * Description: + * See blk_iopoll_complete(). This function must be called with interrupts + * disabled. + **/ +void __blk_iopoll_complete(struct blk_iopoll *iop) +{ + list_del(&iop->list); + smp_mb__before_clear_bit(); + clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(__blk_iopoll_complete); + +/** + * blk_iopoll_complete - Mark this @iop as un-polled again + * @iop: The parent iopoll structure + * + * Description: + * If a driver consumes less than the assigned budget in its run of the + * iopoll handler, it'll end the polled mode by calling this function. The + * iopoll handler will not be invoked again before blk_iopoll_sched_prep() + * is called. + **/ +void blk_iopoll_complete(struct blk_iopoll *iopoll) +{ + unsigned long flags; + + local_irq_save(flags); + __blk_iopoll_complete(iopoll); + local_irq_restore(flags); +} +EXPORT_SYMBOL(blk_iopoll_complete); + +static void blk_iopoll_softirq(struct softirq_action *h) +{ + struct list_head *list = &__get_cpu_var(blk_cpu_iopoll); + int rearm = 0, budget = blk_iopoll_budget; + unsigned long start_time = jiffies; + + local_irq_disable(); + + while (!list_empty(list)) { + struct blk_iopoll *iop; + int work, weight; + + /* + * If softirq window is exhausted then punt. + */ + if (budget <= 0 || time_after(jiffies, start_time)) { + rearm = 1; + break; + } + + local_irq_enable(); + + /* Even though interrupts have been re-enabled, this + * access is safe because interrupts can only add new + * entries to the tail of this list, and only ->poll() + * calls can remove this head entry from the list. + */ + iop = list_entry(list->next, struct blk_iopoll, list); + + weight = iop->weight; + work = 0; + if (test_bit(IOPOLL_F_SCHED, &iop->state)) + work = iop->poll(iop, weight); + + budget -= work; + + local_irq_disable(); + + /* + * Drivers must not modify the iopoll state, if they + * consume their assigned weight (or more, some drivers can't + * easily just stop processing, they have to complete an + * entire mask of commands).In such cases this code + * still "owns" the iopoll instance and therefore can + * move the instance around on the list at-will. + */ + if (work >= weight) { + if (blk_iopoll_disable_pending(iop)) + __blk_iopoll_complete(iop); + else + list_move_tail(&iop->list, list); + } + } + + if (rearm) + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + + local_irq_enable(); +} + +/** + * blk_iopoll_disable - Disable iopoll on this @iop + * @iop: The parent iopoll structure + * + * Description: + * Disable io polling and wait for any pending callbacks to have completed. + **/ +void blk_iopoll_disable(struct blk_iopoll *iop) +{ + set_bit(IOPOLL_F_DISABLE, &iop->state); + while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) + msleep(1); + clear_bit(IOPOLL_F_DISABLE, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_disable); + +/** + * blk_iopoll_enable - Enable iopoll on this @iop + * @iop: The parent iopoll structure + * + * Description: + * Enable iopoll on this @iop. Note that the handler run will not be + * scheduled, it will only mark it as active. + **/ +void blk_iopoll_enable(struct blk_iopoll *iop) +{ + BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); + smp_mb__before_clear_bit(); + clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_enable); + +/** + * blk_iopoll_init - Initialize this @iop + * @iop: The parent iopoll structure + * @weight: The default weight (or command completion budget) + * @poll_fn: The handler to invoke + * + * Description: + * Initialize this blk_iopoll structure. Before being actively used, the + * driver must call blk_iopoll_enable(). + **/ +void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) +{ + memset(iop, 0, sizeof(*iop)); + INIT_LIST_HEAD(&iop->list); + iop->weight = weight; + iop->poll = poll_fn; + set_bit(IOPOLL_F_SCHED, &iop->state); +} +EXPORT_SYMBOL(blk_iopoll_init); + +static int __cpuinit blk_iopoll_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + &__get_cpu_var(blk_cpu_iopoll)); + __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata blk_iopoll_cpu_notifier = { + .notifier_call = blk_iopoll_cpu_notify, +}; + +static __init int blk_iopoll_setup(void) +{ + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); + + open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); + register_hotcpu_notifier(&blk_iopoll_cpu_notifier); + return 0; +} +subsys_initcall(blk_iopoll_setup); diff --git a/block/blk-merge.c b/block/blk-merge.c index e199967..99cb5cf 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -311,6 +311,36 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 1; } +/** + * blk_rq_set_mixed_merge - mark a request as mixed merge + * @rq: request to mark as mixed merge + * + * Description: + * @rq is about to be mixed merged. Make sure the attributes + * which can be mixed are set in each bio and mark @rq as mixed + * merged. + */ +void blk_rq_set_mixed_merge(struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + struct bio *bio; + + if (rq->cmd_flags & REQ_MIXED_MERGE) + return; + + /* + * @rq will no longer represent mixable attributes for all the + * contained bios. It will just track those of the first one. + * Distributes the attributs to each bio. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && + (bio->bi_rw & REQ_FAILFAST_MASK) != ff); + bio->bi_rw |= ff; + } + rq->cmd_flags |= REQ_MIXED_MERGE; +} + static void blk_account_io_merge(struct request *req) { if (blk_do_io_stat(req)) { @@ -321,7 +351,7 @@ static void blk_account_io_merge(struct request *req) part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); - part_dec_in_flight(part); + part_dec_in_flight(part, rq_data_dir(req)); part_stat_unlock(); } @@ -350,12 +380,6 @@ static int attempt_merge(struct request_queue *q, struct request *req, if (blk_integrity_rq(req) != blk_integrity_rq(next)) return 0; - /* don't merge requests of different failfast settings */ - if (blk_failfast_dev(req) != blk_failfast_dev(next) || - blk_failfast_transport(req) != blk_failfast_transport(next) || - blk_failfast_driver(req) != blk_failfast_driver(next)) - return 0; - /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -366,6 +390,19 @@ static int attempt_merge(struct request_queue *q, struct request *req, return 0; /* + * If failfast settings disagree or any of the two is already + * a mixed merge, mark both as mixed before proceeding. This + * makes sure that all involved bios have mixable attributes + * set properly. + */ + if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || + (req->cmd_flags & REQ_FAILFAST_MASK) != + (next->cmd_flags & REQ_FAILFAST_MASK)) { + blk_rq_set_mixed_merge(req); + blk_rq_set_mixed_merge(next); + } + + /* * At this point we have either done a back merge * or front merge. We need the smaller start_time of * the merged requests to be the current request diff --git a/block/blk-settings.c b/block/blk-settings.c index 476d870..83413ff 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -428,6 +428,25 @@ void blk_queue_io_min(struct request_queue *q, unsigned int min) EXPORT_SYMBOL(blk_queue_io_min); /** + * blk_limits_io_opt - set optimal request size for a device + * @limits: the queue limits + * @opt: smallest I/O size in bytes + * + * Description: + * Storage devices may report an optimal I/O size, which is the + * device's preferred unit for sustained I/O. This is rarely reported + * for disk drives. For RAID arrays it is usually the stripe width or + * the internal track size. A properly aligned multiple of + * optimal_io_size is the preferred request size for workloads where + * sustained throughput is desired. + */ +void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) +{ + limits->io_opt = opt; +} +EXPORT_SYMBOL(blk_limits_io_opt); + +/** * blk_queue_io_opt - set optimal request size for the queue * @q: the request queue for the device * @opt: optimal request size in bytes @@ -442,7 +461,7 @@ EXPORT_SYMBOL(blk_queue_io_min); */ void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { - q->limits.io_opt = opt; + blk_limits_io_opt(&q->limits, opt); } EXPORT_SYMBOL(blk_queue_io_opt); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 418d636..b78c9c3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -40,7 +40,12 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) { struct request_list *rl = &q->rq; unsigned long nr; - int ret = queue_var_store(&nr, page, count); + int ret; + + if (!q->request_fn) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; @@ -133,7 +138,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) return -EINVAL; spin_lock_irq(q->queue_lock); - blk_queue_max_sectors(q, max_sectors_kb << 1); + q->limits.max_sectors = max_sectors_kb << 1; spin_unlock_irq(q->queue_lock); return ret; diff --git a/block/blk.h b/block/blk.h index 3fae6ad..5ee3d7e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -104,6 +104,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); void blk_recalc_rq_segments(struct request *rq); +void blk_rq_set_mixed_merge(struct request *rq); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fd7080e..1ca813b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -134,13 +134,8 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - /* - * Used to track any pending rt requests so we can pre-empt current - * non-RT cfqq in service when this value is non-zero. - */ - unsigned int busy_rt_queues; - int rq_in_driver; + int rq_in_driver[2]; int sync_flight; /* @@ -191,7 +186,6 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ - CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ @@ -218,7 +212,6 @@ static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ CFQ_CFQQ_FNS(on_rr); CFQ_CFQQ_FNS(wait_request); CFQ_CFQQ_FNS(must_dispatch); -CFQ_CFQQ_FNS(must_alloc); CFQ_CFQQ_FNS(must_alloc_slice); CFQ_CFQQ_FNS(fifo_expire); CFQ_CFQQ_FNS(idle_window); @@ -239,6 +232,11 @@ static struct cfq_queue *cfq_get_queue(struct cfq_data *, int, static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); +static inline int rq_in_driver(struct cfq_data *cfqd) +{ + return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; +} + static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, int is_sync) { @@ -257,7 +255,7 @@ static inline void cic_set_cfqq(struct cfq_io_context *cic, */ static inline int cfq_bio_sync(struct bio *bio) { - if (bio_data_dir(bio) == READ || bio_sync(bio)) + if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO)) return 1; return 0; @@ -648,8 +646,6 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -673,8 +669,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; } /* @@ -760,9 +754,9 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver++; + cfqd->rq_in_driver[rq_is_sync(rq)]++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); + rq_in_driver(cfqd)); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -770,11 +764,12 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; + const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; + WARN_ON(!cfqd->rq_in_driver[sync]); + cfqd->rq_in_driver[sync]--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); + rq_in_driver(cfqd)); } static void cfq_remove_request(struct request *rq) @@ -1080,7 +1075,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * still requests with the driver, don't idle */ - if (cfqd->rq_in_driver) + if (rq_in_driver(cfqd)) return; /* @@ -1115,6 +1110,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); cfq_remove_request(rq); cfqq->dispatched++; elv_dispatch_sort(q, rq); @@ -1179,20 +1175,6 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) goto expire; /* - * If we have a RT cfqq waiting, then we pre-empt the current non-rt - * cfqq. - */ - if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { - /* - * We simulate this as cfqq timed out so that it gets to bank - * the remaining of its time slice. - */ - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - goto new_queue; - } - - /* * The active queue has requests and isn't expired, allow it to * dispatch. */ @@ -1312,6 +1294,12 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) return 0; /* + * Drain async requests before we start sync IO + */ + if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + return 0; + + /* * If this is an async queue and we have sync IO in flight, let it wait */ if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) @@ -1362,7 +1350,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) cfq_slice_expired(cfqd, 0); } - cfq_log(cfqd, "dispatched a request"); + cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); return 1; } @@ -1427,7 +1415,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1436,7 +1424,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1562,7 +1550,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2130,11 +2118,11 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = cfqd->rq_in_driver; + if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) + cfqd->rq_in_driver_peak = rq_in_driver(cfqd); if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) @@ -2161,9 +2149,9 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_update_hw_tag(cfqd); - WARN_ON(!cfqd->rq_in_driver); + WARN_ON(!cfqd->rq_in_driver[sync]); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; + cfqd->rq_in_driver[sync]--; cfqq->dispatched--; if (cfq_cfqq_sync(cfqq)) @@ -2197,7 +2185,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_arm_slice_timer(cfqd); } - if (!cfqd->rq_in_driver) + if (!rq_in_driver(cfqd)) cfq_schedule_dispatch(cfqd); } @@ -2229,8 +2217,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) static inline int __cfq_may_queue(struct cfq_queue *cfqq) { - if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) && - !cfq_cfqq_must_alloc_slice(cfqq)) { + if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { cfq_mark_cfqq_must_alloc_slice(cfqq); return ELV_MQUEUE_MUST; } @@ -2317,7 +2304,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) } cfqq->allocated[rw]++; - cfq_clear_cfqq_must_alloc(cfqq); atomic_inc(&cfqq->ref); spin_unlock_irqrestore(q->queue_lock, flags); @@ -2668,7 +2654,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/block/elevator.c b/block/elevator.c index 2d511f9..1975b61 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -79,7 +79,8 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) /* * Don't merge file system requests and discard requests */ - if (bio_discard(bio) != bio_discard(rq->bio)) + if (bio_rw_flagged(bio, BIO_RW_DISCARD) != + bio_rw_flagged(rq->bio, BIO_RW_DISCARD)) return 0; /* @@ -100,19 +101,6 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_integrity(bio) != blk_integrity_rq(rq)) return 0; - /* - * Don't merge if failfast settings don't match. - * - * FIXME: The negation in front of each condition is necessary - * because bio and request flags use different bit positions - * and the accessors return those bits directly. This - * ugliness will soon go away. - */ - if (!bio_failfast_dev(bio) != !blk_failfast_dev(rq) || - !bio_failfast_transport(bio) != !blk_failfast_transport(rq) || - !bio_failfast_driver(bio) != !blk_failfast_driver(rq)) - return 0; - if (!elv_iosched_allow_merge(rq, bio)) return 0; diff --git a/block/genhd.c b/block/genhd.c index f4c64c2..2ad91dd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -869,6 +869,7 @@ static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -888,6 +889,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -901,7 +903,7 @@ static struct attribute_group disk_attr_group = { .attrs = disk_attrs, }; -static struct attribute_group *disk_attr_groups[] = { +static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, NULL }; @@ -1053,7 +1055,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[1]), (unsigned long long)part_stat_read(hd, sectors[1]), jiffies_to_msecs(part_stat_read(hd, ticks[1])), - hd->in_flight, + part_in_flight(hd), jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)) ); @@ -1215,6 +1217,16 @@ void put_disk(struct gendisk *disk) EXPORT_SYMBOL(put_disk); +static void set_disk_ro_uevent(struct gendisk *gd, int ro) +{ + char event[] = "DISK_RO=1"; + char *envp[] = { event, NULL }; + + if (!ro) + event[8] = '0'; + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + void set_device_ro(struct block_device *bdev, int flag) { bdev->bd_part->policy = flag; @@ -1227,8 +1239,12 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_INCL_PART0); + if (disk->part0.policy != flag) { + set_disk_ro_uevent(disk, flag); + disk->part0.policy = flag; + } + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) part->policy = flag; disk_part_iter_exit(&piter); diff --git a/block/ioctl.c b/block/ioctl.c index 500e4c7..d3e6b58 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -112,22 +112,9 @@ static int blkdev_reread_part(struct block_device *bdev) return res; } -static void blk_ioc_discard_endio(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - complete(bio->bi_private); -} - static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, uint64_t len) { - struct request_queue *q = bdev_get_queue(bdev); - int ret = 0; - if (start & 511) return -EINVAL; if (len & 511) @@ -137,40 +124,8 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (bdev->bd_inode->i_size >> 9)) return -EINVAL; - - if (!q->prepare_discard_fn) - return -EOPNOTSUPP; - - while (len && !ret) { - DECLARE_COMPLETION_ONSTACK(wait); - struct bio *bio; - - bio = bio_alloc(GFP_KERNEL, 0); - - bio->bi_end_io = blk_ioc_discard_endio; - bio->bi_bdev = bdev; - bio->bi_private = &wait; - bio->bi_sector = start; - - if (len > queue_max_hw_sectors(q)) { - bio->bi_size = queue_max_hw_sectors(q) << 9; - len -= queue_max_hw_sectors(q); - start += queue_max_hw_sectors(q); - } else { - bio->bi_size = len << 9; - len = 0; - } - submit_bio(DISCARD_NOBARRIER, bio); - - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; + return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, + DISCARD_FL_WAIT); } static int put_ushort(unsigned long arg, unsigned short val) |