From dd831006d5be7f74c3fe7aef82380c51c3637960 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: misc cleanups in barrier code

Make the following cleanups in preparation of barrier/flush update.

* blk_do_ordered() declaration is moved from include/linux/blkdev.h to
  block/blk.h.

* blk_do_ordered() now returns pointer to struct request, with %NULL
  meaning "try the next request" and ERR_PTR(-EAGAIN) "try again
  later".  The third case will be dropped with further changes.

* In the initialization of proxy barrier request, data direction is
  already set by init_request_from_bio().  Drop unnecessary explicit
  REQ_WRITE setting and move init_request_from_bio() above REQ_FUA
  flag setting.

* add_request() is collapsed into __make_request().

These changes don't make any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 6e7dc87..874eb4e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,6 +51,8 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
+struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
@@ -58,8 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			if (blk_do_ordered(q, &rq))
-				return rq;
+			rq = blk_do_ordered(q, rq);
+			if (rq)
+				return !IS_ERR(rq) ? rq : NULL;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
-- 
cgit v1.1


From 28e7d1845216538303bb95d679d8fd4de50e2f1a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: drop barrier ordering by queue draining

Filesystems will take all the responsibilities for ordering requests
around commit writes and will only indicate how the commit writes
themselves should be handled by block layers.  This patch drops
barrier ordering by queue draining from block layer.  Ordering by
draining implementation was somewhat invasive to request handling.
List of notable changes follow.

* Each queue has 1 bit color which is flipped on each barrier issue.
  This is used to track whether a given request is issued before the
  current barrier or not.  REQ_ORDERED_COLOR flag and coloring
  implementation in __elv_add_request() are removed.

* Requests which shouldn't be processed yet for draining were stalled
  by returning -EAGAIN from blk_do_ordered() according to the test
  result between blk_ordered_req_seq() and blk_blk_ordered_cur_seq().
  This logic is removed.

* Draining completion logic in elv_completed_request() removed.

* All barrier sequence requests were queued to request queue and then
  trckled to lower layer according to progress and thus maintaining
  request orders during requeue was necessary.  This is replaced by
  queueing the next request in the barrier sequence only after the
  current one is complete from blk_ordered_complete_seq(), which
  removes the need for multiple proxy requests in struct request_queue
  and the request sorting logic in the ELEVATOR_INSERT_REQUEUE path of
  elv_insert().

* As barriers no longer have ordering constraints, there's no need to
  dump the whole elevator onto the dispatch queue on each barrier.
  Insert barriers at the front instead.

* If other barrier requests come to the front of the dispatch queue
  while one is already in progress, they are stored in
  q->pending_barriers and restored to dispatch queue one-by-one after
  each barrier completion from blk_ordered_complete_seq().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 874eb4e..08081e4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -62,7 +62,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			rq = list_entry_rq(q->queue_head.next);
 			rq = blk_do_ordered(q, rq);
 			if (rq)
-				return !IS_ERR(rq) ? rq : NULL;
+				return rq;
 		}
 
 		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
-- 
cgit v1.1


From dd4c133f387c48f526022860ad70354637a80f4c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:16 +0200
Subject: block: rename barrier/ordered to flush

With ordering requirements dropped, barrier and ordered are misnomers.
Now all block layer does is sequencing FLUSH and FUA.  Rename them to
flush.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 08081e4..24b92bd 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -51,7 +51,7 @@ static inline void blk_clear_rq_complete(struct request *rq)
  */
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-struct request *blk_do_ordered(struct request_queue *q, struct request *rq);
+struct request *blk_do_flush(struct request_queue *q, struct request *rq);
 
 static inline struct request *__elv_next_request(struct request_queue *q)
 {
@@ -60,7 +60,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
-			rq = blk_do_ordered(q, rq);
+			rq = blk_do_flush(q, rq);
 			if (rq)
 				return rq;
 		}
-- 
cgit v1.1


From 4fed947cb311e5aa51781d316cefca836352f6ce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Sep 2010 11:56:17 +0200
Subject: block: implement REQ_FLUSH/FUA based interface for FLUSH/FUA requests

Now that the backend conversion is complete, export sequenced
FLUSH/FUA capability through REQ_FLUSH/FUA flags.  REQ_FLUSH means the
device cache should be flushed before executing the request.  REQ_FUA
means that the data in the request should be on non-volatile media on
completion.

Block layer will choose the correct way of implementing the semantics
and execute it.  The request may be passed to the device directly if
the device can handle it; otherwise, it will be sequenced using one or
more proxy requests.  Devices will never see REQ_FLUSH and/or FUA
which it doesn't support.

Also, unlike the original REQ_HARDBARRIER, REQ_FLUSH/FUA requests are
never failed with -EOPNOTSUPP.  If the underlying device doesn't
support FLUSH/FUA, the block layer simply make those noop.  IOW, it no
longer distinguishes between writeback cache which doesn't support
cache flush and writethrough/no cache.  Devices which have WB cache
w/o flush are very difficult to come by these days and there's nothing
much we can do anyway, so it doesn't make sense to require everyone to
implement -EOPNOTSUPP handling.  This will simplify filesystems and
block drivers as they can drop -EOPNOTSUPP retry logic for barriers.

* QUEUE_ORDERED_* are removed and QUEUE_FSEQ_* are moved into
  blk-flush.c.

* REQ_FLUSH w/o data can also be directly passed to drivers without
  sequencing but some drivers assume that zero length requests don't
  have rq->bio which isn't true for these requests requiring the use
  of proxy requests.

* REQ_COMMON_MASK now includes REQ_FLUSH | REQ_FUA so that they are
  copied from bio to request.

* WRITE_BARRIER is marked deprecated and WRITE_FLUSH, WRITE_FUA and
  WRITE_FLUSH_FUA are added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 24b92bd..a09c18b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,6 +60,9 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 	while (1) {
 		while (!list_empty(&q->queue_head)) {
 			rq = list_entry_rq(q->queue_head.next);
+			if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
+			    rq == &q->flush_rq)
+				return rq;
 			rq = blk_do_flush(q, rq);
 			if (rq)
 				return rq;
-- 
cgit v1.1


From 13f05c8d8e98bbdce89158bfdb2e380940695a88 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 10 Sep 2010 20:50:10 +0200
Subject: block/scsi: Provide a limit on the number of integrity segments

Some controllers have a hardware limit on the number of protection
information scatter-gather list segments they can handle.

Introduce a max_integrity_segments limit in the block layer and provide
a new scsi_host_template setting that allows HBA drivers to provide a
value suitable for the hardware.

Add support for honoring the integrity segment limit when merging both
bios and requests.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@carl.home.kernel.dk>
---
 block/blk.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 6e7dc87..6738831 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -132,14 +132,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 	return q->nr_congestion_off;
 }
 
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-
-#define rq_for_each_integrity_segment(bvl, _rq, _iter)		\
-	__rq_for_each_bio(_iter.bio, _rq)			\
-		bip_for_each_vec(bvl, _iter.bio->bi_integrity, _iter.i)
-
-#endif /* BLK_DEV_INTEGRITY */
-
 static inline int blk_cpu_to_group(int cpu)
 {
 #ifdef CONFIG_SCHED_MC
-- 
cgit v1.1


From 7681bfeeccff5efa9eb29bf09249a3c400b15327 Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Tue, 19 Oct 2010 09:05:00 +0200
Subject: block: fix accounting bug on cross partition merges

/proc/diskstats would display a strange output as follows.

$ cat /proc/diskstats |grep sda
   8       0 sda 90524 7579 102154 20464 0 0 0 0 0 14096 20089
   8       1 sda1 19085 1352 21841 4209 0 0 0 0 4294967064 15689 4293424691
                                                ~~~~~~~~~~
   8       2 sda2 71252 3624 74891 15950 0 0 0 0 232 23995 1562390
   8       3 sda3 54 487 2188 92 0 0 0 0 0 88 92
   8       4 sda4 4 0 8 0 0 0 0 0 0 0 0
   8       5 sda5 81 2027 2130 138 0 0 0 0 0 87 137

Its reason is the wrong way of accounting hd_struct->in_flight. When a bio is
merged into a request belongs to different partition by ELEVATOR_FRONT_MERGE.

The detailed root cause is as follows.

Assuming that there are two partition, sda1 and sda2.

1. A request for sda2 is in request_queue. Hence sda1's hd_struct->in_flight
   is 0 and sda2's one is 1.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

2. A bio belongs to sda1 is issued and is merged into the request mentioned on
   step1 by ELEVATOR_BACK_MERGE. The first sector of the request is changed
   from sda2 region to sda1 region. However the two partition's
   hd_struct->in_flight are not changed.

        | hd_struct->in_flight
   ---------------------------
   sda1 |          0
   sda2 |          1
   ---------------------------

3. The request is finished and blk_account_io_done() is called. In this case,
   sda2's hd_struct->in_flight, not a sda1's one, is decremented.

        | hd_struct->in_flight
   ---------------------------
   sda1 |         -1
   sda2 |          1
   ---------------------------

The patch fixes the problem by caching the partition lookup
inside the request structure, hence making sure that the increment
and decrement will always happen on the same partition struct. This
also speeds up IO with accounting enabled, since it cuts down on
the number of lookups we have to do.

When reloading partition tables, quiesce IO to ensure that no
request references to the partition struct exists. When it is safe
to free the partition table, the IO for that device is restarted
again.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 6738831..1340cce 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -110,10 +110,6 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
-void elv_quiesce_start(struct request_queue *q);
-void elv_quiesce_end(struct request_queue *q);
-
-
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
-- 
cgit v1.1


From f253b86b4ad1b3220544e75880510fd455ebd23f Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Sun, 24 Oct 2010 22:06:02 +0200
Subject: Revert "block: fix accounting bug on cross partition merges"

This reverts commit 7681bfeeccff5efa9eb29bf09249a3c400b15327.

Conflicts:

	include/linux/genhd.h

It has numerous issues with the cleanup path and non-elevator
devices. Revert it for now so we can come up with a clean
version without rushing things.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'block/blk.h')

diff --git a/block/blk.h b/block/blk.h
index 1e675e5..2db8f32 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -116,6 +116,10 @@ void blk_queue_congestion_threshold(struct request_queue *q);
 
 int blk_dev_init(void);
 
+void elv_quiesce_start(struct request_queue *q);
+void elv_quiesce_end(struct request_queue *q);
+
+
 /*
  * Return the threshold (number of used requests) at which the queue is
  * considered to be congested.  It include a little hysteresis to keep the
-- 
cgit v1.1