From f53881e6463124c910b17a5ea722a5cf5ab67fb3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 17 Oct 2011 13:42:43 +0200
Subject: block: make gendisk hold a reference to its queue

commit f992ae801a7dec34a4ed99a6598bbbbfb82af4fb upstream.

The following command sequence triggers an oops.

# mount /dev/sdb1 /mnt
# echo 1 > /sys/class/scsi_device/0\:0\:1\:0/device/delete
# umount /mnt

 general protection fault: 0000 [#1] PREEMPT SMP
 CPU 2
 Modules linked in:

 Pid: 791, comm: umount Not tainted 3.1.0-rc3-work+ #8 Bochs Bochs
 RIP: 0010:[<ffffffff810d0879>]  [<ffffffff810d0879>] __lock_acquire+0x389/0x1d60
...
 Call Trace:
  [<ffffffff810d2845>] lock_acquire+0x95/0x140
  [<ffffffff81aed87b>] _raw_spin_lock+0x3b/0x50
  [<ffffffff811573bc>] bdi_lock_two+0x5c/0x70
  [<ffffffff811c2f6c>] bdev_inode_switch_bdi+0x4c/0xf0
  [<ffffffff811c3fcb>] __blkdev_put+0x11b/0x1d0
  [<ffffffff811c4010>] __blkdev_put+0x160/0x1d0
  [<ffffffff811c40df>] blkdev_put+0x5f/0x190
  [<ffffffff8118f18d>] kill_block_super+0x4d/0x80
  [<ffffffff8118f4a5>] deactivate_locked_super+0x45/0x70
  [<ffffffff8119003a>] deactivate_super+0x4a/0x70
  [<ffffffff811ac4ad>] mntput_no_expire+0xed/0x130
  [<ffffffff811acf2e>] sys_umount+0x7e/0x3a0
  [<ffffffff81aeeeab>] system_call_fastpath+0x16/0x1b

This is because bdev holds on to disk but disk doesn't pin the
associated queue.  If a SCSI device is removed while the device is
still open, the sdev puts the base reference to the queue on release.
When the bdev is finally released, the associated queue is already
gone along with the bdi and bdev_inode_switch_bdi() ends up
dereferencing already freed bdi.

Even if it were not for this bug, disk not holding onto the associated
queue is very unusual and error-prone.

Fix it by making add_disk() take an extra reference to its queue and
put it on disk_release() and ensuring that disk and its fops owner are
put in that order after all accesses to the disk and queue are
complete.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/genhd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'block')
diff --git a/block/genhd.c b/block/genhd.c
index 3608289..8c0829a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -611,6 +611,12 @@ void add_disk(struct gendisk *disk)
 	register_disk(disk);
 	blk_register_queue(disk);
 
+	/*
+	 * Take an extra ref on queue which will be put on disk_release()
+	 * so that it sticks around as long as @disk is there.
+	 */
+	WARN_ON_ONCE(blk_get_queue(disk->queue));
+
 	retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
 				   "bdi");
 	WARN_ON(retval);
@@ -1103,6 +1109,8 @@ static void disk_release(struct device *dev)
 	disk_replace_part_tbl(disk, NULL);
 	free_part_stats(&disk->part0);
 	free_part_info(&disk->part0);
+	if (disk->queue)
+		blk_put_queue(disk->queue);
 	kfree(disk);
 }
 struct class block_class = {
-- 
cgit v1.1


From acc1887e7e0da5d19fb77b2a5e5c5db2d57667f9 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sun, 13 Nov 2011 19:58:09 +0100
Subject: block: Always check length of all iov entries in
 blk_rq_map_user_iov()

commit 6b76106d8ef31111d6fc469564b83b5f5542794f upstream.

Even after commit 5478755616ae2ef1ce144dded589b62b2a50d575
("block: check for proper length of iov entries earlier ...")
we still won't check for zero-length entries after an unaligned
entry.  Remove the break-statement, so all entries are checked.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/blk-map.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-map.c b/block/blk-map.c
index e663ac2..164cd00 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -204,10 +204,11 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 		if (!iov[i].iov_len)
 			return -EINVAL;
 
-		if (uaddr & queue_dma_alignment(q)) {
+		/*
+		 * Keep going so we check length of all segments
+		 */
+		if (uaddr & queue_dma_alignment(q))
 			unaligned = 1;
-			break;
-		}
 	}
 
 	if (unaligned || (q->dma_pad_mask & len) || map_data)
-- 
cgit v1.1


From 8f8a594251e5260f53d746887890d0d2185ceebb Mon Sep 17 00:00:00 2001
From: majianpeng <majianpeng@gmail.com>
Date: Wed, 30 Nov 2011 15:47:48 +0100
Subject: cfq-iosched: free cic_index if blkio_alloc_blkg_stats fails

commit 2984ff38ccf6cbc02a7a996a36c7d6f69f3c6146 upstream.

If we fail allocating the blkpg stats, we free cfqd and cfgq.
But we need to free the IDA cfqd->cic_index as well.

Signed-off-by: majianpeng <majianpeng@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/cfq-iosched.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ae21919..aae9447 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4015,6 +4015,11 @@ static void *cfq_init_queue(struct request_queue *q)
 
 	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
 		kfree(cfqg);
+
+		spin_lock(&cic_index_lock);
+		ida_remove(&cic_index_ida, cfqd->cic_index);
+		spin_unlock(&cic_index_lock);
+
 		kfree(cfqd);
 		return NULL;
 	}
-- 
cgit v1.1


From cec3c159f674367def095745a72ac6150f889243 Mon Sep 17 00:00:00 2001
From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Date: Fri, 2 Dec 2011 10:07:07 +0100
Subject: cfq-iosched: fix cfq_cic_link() race confition

commit 5eb46851de3904cd1be9192fdacb8d34deadc1fc upstream.

cfq_cic_link() has race condition. When some processes which shared ioc
issue I/O to same block device simultaneously, cfq_cic_link() returns -EEXIST
sometimes. The race condition might stop I/O by following steps:

step  1: Process A: Issue an I/O to /dev/sda
step  2: Process A: Get an ioc (iocA here) in get_io_context() which does not
		    linked with a cic for the device
step  3: Process A: Get a new cic for the device (cicA here) in
		    cfq_alloc_io_context()

step  4: Process B: Issue an I/O to /dev/sda
step  5: Process B: Get iocA in get_io_context() since process A and B share the
		    same ioc
step  6: Process B: Get a new cic for the device (cicB here) in
		    cfq_alloc_io_context() since iocA has not been linked with a
		    cic for the device yet

step  7: Process A: Link cicA to iocA in cfq_cic_link()
step  8: Process A: Dispatch I/O to driver and finish it

step  9: Process B: Try to link cicB to iocA in cfq_cic_link()
		    But it fails with showing "cfq: cic link failed!" kernel
		    message, since iocA has already linked with cicA at step 7.
step 10: Process B: Wait for finishig I/O in get_request_wait()
		    The function does not wake up, when there is no I/O to the
		    device.

When cfq_cic_link() returns -EEXIST, it means ioc has already linked with cic.
So when cfq_cic_link() return -EEXIST, retry cfq_cic_lookup().

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/cfq-iosched.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index aae9447..23500ac 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3169,7 +3169,7 @@ static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
 		}
 	}
 
-	if (ret)
+	if (ret && ret != -EEXIST)
 		printk(KERN_ERR "cfq: cic link failed!\n");
 
 	return ret;
@@ -3185,6 +3185,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 {
 	struct io_context *ioc = NULL;
 	struct cfq_io_context *cic;
+	int ret;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
@@ -3192,6 +3193,7 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (!ioc)
 		return NULL;
 
+retry:
 	cic = cfq_cic_lookup(cfqd, ioc);
 	if (cic)
 		goto out;
@@ -3200,7 +3202,12 @@ cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
 	if (cic == NULL)
 		goto err;
 
-	if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
+	ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask);
+	if (ret == -EEXIST) {
+		/* someone has linked cic to ioc already */
+		cfq_cic_free(cic);
+		goto retry;
+	} else if (ret)
 		goto err_free;
 
 out:
-- 
cgit v1.1


From d27bf91d1a9ea58a32bf9dd949e93bb1dc1fc9bf Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 23 Nov 2011 10:59:13 +0100
Subject: block: initialize request_queue's numa node during

commit 5151412dd4338b273afdb107c3772528e9e67d92 upstream.

struct request_queue is allocated with __GFP_ZERO so its "node" field is
zero before initialization.  This causes an oops if node 0 is offline in
the page allocator because its zonelists are not initialized.  From Dave
Young's dmesg:

	SRAT: Node 1 PXM 2 0-d0000000
	SRAT: Node 1 PXM 2 100000000-330000000
	SRAT: Node 0 PXM 1 330000000-630000000
	Initmem setup node 1 0000000000000000-000000000affb000
	...
	Built 1 zonelists in Node order, mobility grouping on.
	...
	BUG: unable to handle kernel paging request at 0000000000001c08
	IP: [<ffffffff8111c355>] __alloc_pages_nodemask+0xb5/0x870

and __alloc_pages_nodemask+0xb5 translates to a NULL pointer on
zonelist->_zonerefs.

The fix is to initialize q->node at the time of allocation so the correct
node is passed to the slab allocator later.

Since blk_init_allocated_queue_node() is no longer needed, merge it with
blk_init_allocated_queue().

[rientjes@google.com: changelog, initializing q->node]
Reported-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Tested-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/blk-core.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

(limited to 'block')

diff --git a/block/blk-core.c b/block/blk-core.c
index 847d04e..35ae52d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -418,6 +418,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
 	q->backing_dev_info.name = "block";
+	q->node = node_id;
 
 	err = bdi_init(&q->backing_dev_info);
 	if (err) {
@@ -502,7 +503,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	if (!uninit_q)
 		return NULL;
 
-	q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+	q = blk_init_allocated_queue(uninit_q, rfn, lock);
 	if (!q)
 		blk_cleanup_queue(uninit_q);
 
@@ -514,18 +515,9 @@ struct request_queue *
 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 			 spinlock_t *lock)
 {
-	return blk_init_allocated_queue_node(q, rfn, lock, -1);
-}
-EXPORT_SYMBOL(blk_init_allocated_queue);
-
-struct request_queue *
-blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
-			      spinlock_t *lock, int node_id)
-{
 	if (!q)
 		return NULL;
 
-	q->node = node_id;
 	if (blk_init_free_list(q))
 		return NULL;
 
@@ -555,7 +547,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
 
 	return NULL;
 }
-EXPORT_SYMBOL(blk_init_allocated_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue);
 
 int blk_get_queue(struct request_queue *q)
 {
-- 
cgit v1.1


From 3b8373b85c761b2a12bdaf9fcee4c7a3eefa8459 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Jan 2012 16:01:27 +0100
Subject: block: add and use scsi_blk_cmd_ioctl

commit 577ebb374c78314ac4617242f509e2f5e7156649 upstream.

Introduce a wrapper around scsi_cmd_ioctl that takes a block device.

The function will then be enhanced to detect partition block devices
and, in that case, subject the ioctls to whitelisting.

Cc: linux-scsi@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Cc: James Bottomley <JBottomley@parallels.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/scsi_ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 4f4230b..57ac937 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -691,6 +691,13 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
 
+int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
+		       unsigned int cmd, void __user *arg)
+{
+	return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
+}
+EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
+
 static int __init blk_scsi_ioctl_init(void)
 {
 	blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
-- 
cgit v1.1


From 8bd8442fec18284924e17a0fa8ef89d98b0a6d71 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Jan 2012 16:01:28 +0100
Subject: block: fail SCSI passthrough ioctls on partition devices

commit 0bfc96cb77224736dfa35c3c555d37b3646ef35e upstream.

[ Changes with respect to 3.3: return -ENOTTY from scsi_verify_blk_ioctl
  and -ENOIOCTLCMD from sd_compat_ioctl. ]

Linux allows executing the SG_IO ioctl on a partition or LVM volume, and
will pass the command to the underlying block device.  This is
well-known, but it is also a large security problem when (via Unix
permissions, ACLs, SELinux or a combination thereof) a program or user
needs to be granted access only to part of the disk.

This patch lets partitions forward a small set of harmless ioctls;
others are logged with printk so that we can see which ioctls are
actually sent.  In my tests only CDROM_GET_CAPABILITY actually occurred.
Of course it was being sent to a (partition on a) hard disk, so it would
have failed with ENOTTY and the patch isn't changing anything in
practice.  Still, I'm treating it specially to avoid spamming the logs.

In principle, this restriction should include programs running with
CAP_SYS_RAWIO.  If for example I let a program access /dev/sda2 and
/dev/sdb, it still should not be able to read/write outside the
boundaries of /dev/sda2 independent of the capabilities.  However, for
now programs with CAP_SYS_RAWIO will still be allowed to send the
ioctls.  Their actions will still be logged.

This patch does not affect the non-libata IDE driver.  That driver
however already tests for bd != bd->bd_contains before issuing some
ioctl; it could be restricted further to forbid these ioctls even for
programs running with CAP_SYS_ADMIN/CAP_SYS_RAWIO.

Cc: linux-scsi@vger.kernel.org
Cc: Jens Axboe <axboe@kernel.dk>
Cc: James Bottomley <JBottomley@parallels.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
[ Make it also print the command name when warning - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 block/scsi_ioctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'block')

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 57ac937..5ef1f4c 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -24,6 +24,7 @@
 #include <linux/capability.h>
 #include <linux/completion.h>
 #include <linux/cdrom.h>
+#include <linux/ratelimit.h>
 #include <linux/slab.h>
 #include <linux/times.h>
 #include <asm/uaccess.h>
@@ -691,9 +692,53 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod
 }
 EXPORT_SYMBOL(scsi_cmd_ioctl);
 
+int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
+{
+	if (bd && bd == bd->bd_contains)
+		return 0;
+
+	/* Actually none of these is particularly useful on a partition,
+	 * but they are safe.
+	 */
+	switch (cmd) {
+	case SCSI_IOCTL_GET_IDLUN:
+	case SCSI_IOCTL_GET_BUS_NUMBER:
+	case SCSI_IOCTL_GET_PCI:
+	case SCSI_IOCTL_PROBE_HOST:
+	case SG_GET_VERSION_NUM:
+	case SG_SET_TIMEOUT:
+	case SG_GET_TIMEOUT:
+	case SG_GET_RESERVED_SIZE:
+	case SG_SET_RESERVED_SIZE:
+	case SG_EMULATED_HOST:
+		return 0;
+	case CDROM_GET_CAPABILITY:
+		/* Keep this until we remove the printk below.  udev sends it
+		 * and we do not want to spam dmesg about it.   CD-ROMs do
+		 * not have partitions, so we get here only for disks.
+		 */
+		return -ENOTTY;
+	default:
+		break;
+	}
+
+	/* In particular, rule out all resets and host-specific ioctls.  */
+	printk_ratelimited(KERN_WARNING
+			   "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
+
+	return capable(CAP_SYS_RAWIO) ? 0 : -ENOTTY;
+}
+EXPORT_SYMBOL(scsi_verify_blk_ioctl);
+
 int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
 		       unsigned int cmd, void __user *arg)
 {
+	int ret;
+
+	ret = scsi_verify_blk_ioctl(bd, cmd);
+	if (ret < 0)
+		return ret;
+
 	return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
 }
 EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
-- 
cgit v1.1


From b38d6b8e1914ed83cc58673341781f4cba4d9d06 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 8 Feb 2012 20:02:03 +0100
Subject: bsg: fix sysfs link remove warning

commit 37b40adf2d1b4a5e51323be73ccf8ddcf3f15dd3 upstream.

We create "bsg" link if q->kobj.sd is not NULL, so remove it only
when the same condition is true.

Fixes:

WARNING: at fs/sysfs/inode.c:323 sysfs_hash_and_remove+0x2b/0x77()
sysfs: can not remove 'bsg', no directory
Call Trace:
  [<c0429683>] warn_slowpath_common+0x6a/0x7f
  [<c0537a68>] ? sysfs_hash_and_remove+0x2b/0x77
  [<c042970b>] warn_slowpath_fmt+0x2b/0x2f
  [<c0537a68>] sysfs_hash_and_remove+0x2b/0x77
  [<c053969a>] sysfs_remove_link+0x20/0x23
  [<c05d88f1>] bsg_unregister_queue+0x40/0x6d
  [<c0692263>] __scsi_remove_device+0x31/0x9d
  [<c069149f>] scsi_forget_host+0x41/0x52
  [<c0689fa9>] scsi_remove_host+0x71/0xe0
  [<f7de5945>] quiesce_and_remove_host+0x51/0x83 [usb_storage]
  [<f7de5a1e>] usb_stor_disconnect+0x18/0x22 [usb_storage]
  [<c06c29de>] usb_unbind_interface+0x4e/0x109
  [<c067a80f>] __device_release_driver+0x6b/0xa6
  [<c067a861>] device_release_driver+0x17/0x22
  [<c067a46a>] bus_remove_device+0xd6/0xe6
  [<c06785e2>] device_del+0xf2/0x137
  [<c06c101f>] usb_disable_device+0x94/0x1a0

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/bsg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bsg.c b/block/bsg.c
index 0c8b64a..792ead6 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -985,7 +985,8 @@ void bsg_unregister_queue(struct request_queue *q)
 
 	mutex_lock(&bsg_mutex);
 	idr_remove(&bsg_minor_idr, bcd->minor);
-	sysfs_remove_link(&q->kobj, "bsg");
+	if (q->kobj.sd)
+		sysfs_remove_link(&q->kobj, "bsg");
 	device_unregister(bcd->class_dev);
 	bcd->class_dev = NULL;
 	kref_put(&bcd->ref, bsg_kref_release_function);
-- 
cgit v1.1


From aaa136d348dc723f45667aa8b39eb58c7e7c4383 Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Fri, 2 Mar 2012 10:43:28 +0100
Subject: block: fix __blkdev_get and add_disk race condition

commit 9f53d2fe815b4011ff930a7b6db98385d45faa68 upstream.

The following situation might occur:

__blkdev_get:			add_disk:

				register_disk()
get_gendisk()

disk_block_events()
	disk->ev == NULL

				disk_add_events()

__disk_unblock_events()
	disk->ev != NULL
	--ev->block

Then we unblock events, when they are suppose to be blocked. This can
trigger events related block/genhd.c warnings, but also can crash in
sd_check_events() or other places.

I'm able to reproduce crashes with the following scripts (with
connected usb dongle as sdb disk).

<snip>
DEV=/dev/sdb
ENABLE=/sys/bus/usb/devices/1-2/bConfigurationValue

function stop_me()
{
	for i in `jobs -p` ; do kill $i 2> /dev/null ; done
	exit
}

trap stop_me SIGHUP SIGINT SIGTERM

for ((i = 0; i < 10; i++)) ; do
	while true; do fdisk -l $DEV  2>&1 > /dev/null ; done &
done

while true ; do
echo 1 > $ENABLE
sleep 1
echo 0 > $ENABLE
done
</snip>

I use the script to verify patch fixing oops in sd_revalidate_disk
http://marc.info/?l=linux-scsi&m=132935572512352&w=2
Without Jun'ichi Nomura patch titled "Fix NULL pointer dereference in
sd_revalidate_disk" or this one, script easily crash kernel within
a few seconds. With both patches applied I do not observe crash.
Unfortunately after some time (dozen of minutes), script will hung in:

[ 1563.906432]  [<c08354f5>] schedule_timeout_uninterruptible+0x15/0x20
[ 1563.906437]  [<c04532d5>] msleep+0x15/0x20
[ 1563.906443]  [<c05d60b2>] blk_drain_queue+0x32/0xd0
[ 1563.906447]  [<c05d6e00>] blk_cleanup_queue+0xd0/0x170
[ 1563.906454]  [<c06d278f>] scsi_free_queue+0x3f/0x60
[ 1563.906459]  [<c06d7e6e>] __scsi_remove_device+0x6e/0xb0
[ 1563.906463]  [<c06d4aff>] scsi_forget_host+0x4f/0x60
[ 1563.906468]  [<c06cd84a>] scsi_remove_host+0x5a/0xf0
[ 1563.906482]  [<f7f030fb>] quiesce_and_remove_host+0x5b/0xa0 [usb_storage]
[ 1563.906490]  [<f7f03203>] usb_stor_disconnect+0x13/0x20 [usb_storage]

Anyway I think this patch is some step forward.

As drawback, I do not teardown on sysfs file create error, because I do
not know how to nullify disk->ev (since it can be used). However add_disk
error handling practically does not exist too, and things will work
without this sysfs file, except events will not be exported to user
space.

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/genhd.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 8c0829a..091429d 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -36,6 +36,7 @@ static DEFINE_IDR(ext_devt_idr);
 
 static struct device_type disk_type;
 
+static void disk_alloc_events(struct gendisk *disk);
 static void disk_add_events(struct gendisk *disk);
 static void disk_del_events(struct gendisk *disk);
 static void disk_release_events(struct gendisk *disk);
@@ -602,6 +603,8 @@ void add_disk(struct gendisk *disk)
 	disk->major = MAJOR(devt);
 	disk->first_minor = MINOR(devt);
 
+	disk_alloc_events(disk);
+
 	/* Register BDI before referencing it from bdev */ 
 	bdi = &disk->queue->backing_dev_info;
 	bdi_register_dev(bdi, disk_devt(disk));
@@ -1740,9 +1743,9 @@ module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
 		&disk_events_dfl_poll_msecs, 0644);
 
 /*
- * disk_{add|del|release}_events - initialize and destroy disk_events.
+ * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
  */
-static void disk_add_events(struct gendisk *disk)
+static void disk_alloc_events(struct gendisk *disk)
 {
 	struct disk_events *ev;
 
@@ -1755,16 +1758,6 @@ static void disk_add_events(struct gendisk *disk)
 		return;
 	}
 
-	if (sysfs_create_files(&disk_to_dev(disk)->kobj,
-			       disk_events_attrs) < 0) {
-		pr_warn("%s: failed to create sysfs files for events\n",
-			disk->disk_name);
-		kfree(ev);
-		return;
-	}
-
-	disk->ev = ev;
-
 	INIT_LIST_HEAD(&ev->node);
 	ev->disk = disk;
 	spin_lock_init(&ev->lock);
@@ -1773,8 +1766,21 @@ static void disk_add_events(struct gendisk *disk)
 	ev->poll_msecs = -1;
 	INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
 
+	disk->ev = ev;
+}
+
+static void disk_add_events(struct gendisk *disk)
+{
+	if (!disk->ev)
+		return;
+
+	/* FIXME: error handling */
+	if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
+		pr_warn("%s: failed to create sysfs files for events\n",
+			disk->disk_name);
+
 	mutex_lock(&disk_events_mutex);
-	list_add_tail(&ev->node, &disk_events);
+	list_add_tail(&disk->ev->node, &disk_events);
 	mutex_unlock(&disk_events_mutex);
 
 	/*
-- 
cgit v1.1


From 25705e3a3e9f91280b51124a28e74418aff00c7d Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Fri, 2 Mar 2012 10:51:00 +0100
Subject: Block: use a freezable workqueue for disk-event polling

commit 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 upstream.

This patch (as1519) fixes a bug in the block layer's disk-events
polling.  The polling is done by a work routine queued on the
system_nrt_wq workqueue.  Since that workqueue isn't freezable, the
polling continues even in the middle of a system sleep transition.

Obviously, polling a suspended drive for media changes and such isn't
a good thing to do; in the case of USB mass-storage devices it can
lead to real problems requiring device resets and even re-enumeration.

The patch fixes things by creating a new system-wide, non-reentrant,
freezable workqueue and using it for disk-events polling.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 block/genhd.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 091429d..f6ecddb 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1487,9 +1487,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
 	intv = disk_events_poll_jiffies(disk);
 	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	else if (intv)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv);
 out_unlock:
 	spin_unlock_irqrestore(&ev->lock, flags);
 }
@@ -1530,7 +1530,7 @@ void disk_check_events(struct gendisk *disk)
 	spin_lock_irqsave(&ev->lock, flags);
 	if (!ev->block) {
 		cancel_delayed_work(&ev->dwork);
-		queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	}
 	spin_unlock_irqrestore(&ev->lock, flags);
 }
@@ -1568,7 +1568,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
 
 	/* uncondtionally schedule event check and wait for it to finish */
 	disk_block_events(disk);
-	queue_delayed_work(system_nrt_wq, &ev->dwork, 0);
+	queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0);
 	flush_delayed_work(&ev->dwork);
 	__disk_unblock_events(disk, false);
 
@@ -1605,7 +1605,7 @@ static void disk_events_workfn(struct work_struct *work)
 
 	intv = disk_events_poll_jiffies(disk);
 	if (!ev->block && intv)
-		queue_delayed_work(system_nrt_wq, &ev->dwork, intv);
+		queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv);
 
 	spin_unlock_irq(&ev->lock);
 
-- 
cgit v1.1