From 6fd70fafa11fd88d5beee360395f4c0b28d7af43 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 3 Aug 2011 23:12:18 +0400 Subject: CIFS: Fix missing a decrement of inFlight value commit 0193e072268fe62c4b19ad4b05cd0d4b23c43bb9 upstream. if we failed on getting mid entry in cifs_call_async. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/transport.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 147aa22..c1b9c4b 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -362,6 +362,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid = AllocMidQEntry(hdr, server); if (mid == NULL) { mutex_unlock(&server->srv_mutex); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); return -ENOMEM; } -- cgit v1.1 From cae28d950cc7319a7e88deccbb0c4492f0830e6e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Aug 2011 09:02:40 -0400 Subject: cifs: cope with negative dentries in cifs_get_root commit 80975d21aae2136ccae1ce914a1602dc1d8b0795 upstream. The loop around lookup_one_len doesn't handle the case where it might return a negative dentry, which can cause an oops on the next pass through the loop. Check for that and break out of the loop with an error of -ENOENT if there is one. Fixes the panic reported here: https://bugzilla.redhat.com/show_bug.cgi?id=727927 Reported-by: TR Bentley Reported-by: Iain Arnell Cc: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index bc4b12c..fc7e57b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -581,6 +581,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) mutex_unlock(&dir->i_mutex); dput(dentry); dentry = child; + if (!dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); -- cgit v1.1 From 407529f61453bf0e683aa4584bfe2a5b864f1525 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Aug 2011 10:28:01 -0400 Subject: cifs: convert prefixpath delimiters in cifs_build_path_to_root commit f9e8c45002cacad536b338dfa9e910e341a49c31 upstream. Regression from 2.6.39... The delimiters in the prefixpath are not being converted based on whether posix paths are in effect. 
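For reference, the conversion that the fix below delegates to convert_delimiter() amounts to the following stand-alone sketch (illustration only, not the kernel helper itself): depending on which separator style is in effect, every occurrence of the other separator in the assembled path is rewritten.

	static void sketch_convert_delimiter(char *path, char new_sep)
	{
		char old_sep = (new_sep == '/') ? '\\' : '/';

		/* walk the whole path, including any DFS prefix, and
		 * normalize every separator to the chosen style */
		for (; *path; path++)
			if (*path == old_sep)
				*path = new_sep;
	}

With CIFS_MOUNT_POSIX_PATHS set the separator is '/', otherwise '\\', so after the patch the prefixpath ends up in the same style as the rest of the built path.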
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=727834 Reported-and-Tested-by: Iain Arnell Reported-by: Patrick Oltmann Cc: Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b018c8..a7b2dcd 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, if (full_path == NULL) return full_path; - if (dfsplen) { + if (dfsplen) strncpy(full_path, tcon->treeName, dfsplen); - /* switch slash direction in prepath depending on whether - * windows or posix style path names - */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { - int i; - for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; - } - } - } strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); full_path[dfsplen + pplen] = 0; /* add trailing null */ return full_path; } -- cgit v1.1 From ed60157d262ebc0a032362013c58665d490edeee Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 22 Jul 2011 08:14:15 -0700 Subject: Ecryptfs: Add mount option to check uid of device being mounted = expect uid commit 764355487ea220fdc2faf128d577d7f679b91f97 upstream. Close a TOCTOU race for mounts done via ecryptfs-mount-private. The mount source (device) can be raced when the ownership test is done in userspace. Provide Ecryptfs a means to force the uid check at mount time. Signed-off-by: John Johansen Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/main.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9f1bb74..b4a6bef 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { @@ -191,6 +192,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; @@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat( * ecryptfs_parse_options * @sb: The ecryptfs super block * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output @@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) { char *p; int rc = 0; @@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; + *check_ruid = 0; + if (!options) { rc = -EINVAL; goto out; @@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct 
ecryptfs_sb_info *sbi, char *options) mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags const char *err = "Getting sb failed"; struct inode *inode; struct path path; + uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); @@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = ecryptfs_parse_options(sbi, raw_data); + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; @@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags "known incompatibilities\n"); goto out_free; } + + if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + path.dentry->d_inode->i_uid, current_uid()); + goto out_free; + } + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; -- cgit v1.1 From 8c9729a2ca6291a80c4d0473ae64f2b7a547cc9e Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 4 Aug 2011 22:58:51 -0500 Subject: eCryptfs: Return error when lower file pointer is NULL commit f61500e000eedc0c7a0201200a7f00ba5529c002 upstream. When an eCryptfs inode's lower file has been closed, and the pointer has been set to NULL, return an error when trying to do a lower read or write rather than calling BUG(). https://bugzilla.kernel.org/show_bug.cgi?id=37292 Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/read_write.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 85d4309..3745f7c 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -39,15 +39,16 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { - struct ecryptfs_inode_info *inode_info; + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - inode_info = ecryptfs_inode_to_private(ecryptfs_inode); - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_write(inode_info->lower_file, data, size, &offset); + rc = vfs_write(lower_file, data, size, &offset); set_fs(fs_save); mark_inode_dirty_sync(ecryptfs_inode); return rc; @@ -225,15 +226,16 @@ out: int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode) { - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(ecryptfs_inode); + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_read(inode_info->lower_file, data, size, &offset); + rc = vfs_read(lower_file, data, size, &offset); set_fs(fs_save); return rc; } -- cgit v1.1 From 1ae2a2c0515870d784f1ea101b079bd0962b6cd5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 11 Aug 2011 09:51:46 -0500 Subject: ext3: Properly count journal credits for long 
symlinks commit d2db60df1e7eb39cf0f378dfc4dd8813666d46ef upstream. Commit ae54870a1dc9 ("ext3: Fix lock inversion in ext3_symlink()") recalculated the number of credits needed for a long symlink, in the process of splitting it into two transactions. However, the first credit calculation under-counted because if selinux is enabled, credits are needed to create the selinux xattr as well. Overrunning the reservation will result in an OOPS in journal_dirty_metadata() due to this assert: J_ASSERT_JH(jh, handle->h_buffer_credits > 0); Fix this by increasing the reservation size. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Acked-by: "Theodore Ts'o" Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ext3/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 34b6d9b..e5a7111 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2210,9 +2210,11 @@ static int ext3_symlink (struct inode * dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory -- cgit v1.1 From 8b180803c66ccc825d2969c1ea9929fcb7fba98e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 11 Aug 2011 09:54:31 -0500 Subject: ext4: Properly count journal credits for long symlinks commit 8c20871998c082f6fbc963f1449a5ba5140ee39a upstream. Commit df5e6223407e ("ext4: fix deadlock in ext4_symlink() in ENOSPC conditions") recalculated the number of credits needed for a long symlink, in the process of splitting it into two transactions. However, the first credit calculation under-counted because if selinux is enabled, credits are needed to create the selinux xattr as well. Overrunning the reservation will result in an OOPS in jbd2_journal_dirty_metadata() due to this assert: J_ASSERT_JH(jh, handle->h_buffer_credits > 0); Fix this by increasing the reservation size. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Acked-by: "Theodore Ts'o" Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b754b77..458a394 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2264,9 +2264,11 @@ static int ext4_symlink(struct inode *dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory -- cgit v1.1 From b732c7ad1f6e6056dd898de7987665d4c490c78d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Aug 2011 11:50:24 -0400 Subject: cifs: demote cERROR in build_path_from_dentry to cFYI commit fa71f447065f676157ba6a2c121ba419818fc559 upstream. 
Running the cthon tests on a recent kernel caused this message to pop occasionally: CIFS VFS: did not end path lookup where expected namelen is 0 Some added debugging showed that namelen and dfsplen were both 0 when this occurred. That means that the read_seqretry returned true. Assuming that the comment inside the if statement is true, this should be harmless and just means that we raced with a rename. If that is the case, then there's no need for alarm and we can demote this to cFYI. While we're at it, print the dfsplen too so that we can see what happened here if the message pops during debugging. Cc: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d8d26f3..16cdd6d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -110,8 +110,8 @@ cifs_bp_rename_retry: } rcu_read_unlock(); if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { - cERROR(1, "did not end path lookup where expected namelen is %d", - namelen); + cFYI(1, "did not end path lookup where expected. namelen=%d " + "dfsplen=%d", namelen, dfsplen); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ -- cgit v1.1 From f10df4139384fdede3be489dbfea077fc3c26ad9 Mon Sep 17 00:00:00 2001 From: Timo Warns Date: Wed, 17 Aug 2011 17:59:56 +0200 Subject: befs: Validate length of long symbolic links. commit 338d0f0a6fbc82407864606f5b64b75aeb3c70f2 upstream. Signed-off-by: Timo Warns Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/befs/linuxvfs.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 54b8c28..720d885 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); link = ERR_PTR(-EIO); } else { - link[len - 1] = '\0'; + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } } } else { link = befs_ino->i_data.symlink; -- cgit v1.1 From b3ff2fd377a0b593678af0082b6a2e4ecc3eec84 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 18 Aug 2011 04:41:55 +0000 Subject: possible memory corruption on mount commit 13589c437daf4c8e429b3236c0b923de1c9420d8 upstream. CIFS cleanup_volume_info_contents() looks like having a memory corruption problem. When UNCip is set to "&vol->UNC[2]" in cifs_parse_mount_options(), it should not be kfree()-ed in cleanup_volume_info_contents(). Introduced in commit b946845a9dc523c759cae2b6a0f6827486c3221a Signed-off-by: J.R. 
Okajima Reviewed-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ccc1afa..e0ea721 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2838,7 +2838,8 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); - kfree(volume_info->UNCip); + if (volume_info->UNCip != volume_info->UNC + 2) + kfree(volume_info->UNCip); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); -- cgit v1.1 From 3be1216c9e9976074d49414a53abf572ec4b4a24 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 3 Aug 2011 21:52:51 -0700 Subject: pnfs-obj: Fix the comp_index != 0 case commit 9af7db3228acc286c50e3a0f054ec982efdbc6c6 upstream. There were bugs in the case of partial layout where olo_comp_index is not zero. This used to work and was tested but one of the later cleanup SQUASHMEs broke it and was not tested since. Also add a dprint that specify those received layout parameters. Everything else was already printed. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/objlayout/objio_osd.c | 16 +++++++--------- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3 +++ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8ff2ea3..799e8cf 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; struct osd_request *or = ios->per_dev[i].or; - unsigned dev; int ret; if (!or) @@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - dev = ios->per_dev[i].dev; - objlayout_io_set_result(&ios->ol_state, dev, - &ios->layout->comps[dev].oc_object_id, + objlayout_io_set_result(&ios->ol_state, i, + &ios->layout->comps[i].oc_object_id, osd_pri_2_pnfs_err(osi.osd_err_pri), ios->per_dev[i].offset, ios->per_dev[i].length, @@ -650,7 +648,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, int ret = 0; while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev]; + struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -670,8 +668,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, cur_len = stripe_unit; } - if (max_comp < dev) - max_comp = dev; + if (max_comp < dev - first_dev) + max_comp = dev - first_dev; } else { cur_len = stripe_unit; } @@ -806,7 +804,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; unsigned dev = per_dev->dev; struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; + &ios->layout->comps[cur_comp]; struct osd_obj_id obj = { .partition = cred->oc_object_id.oid_partition_id, .id = cred->oc_object_id.oid_object_id, @@ -904,7 +902,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct osd_request *or = NULL; struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; + &ios->layout->comps[cur_comp]; struct osd_obj_id obj = { .partition = 
cred->oc_object_id.oid_partition_id, .id = cred->oc_object_id.oid_object_id, diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index 16fc758..b3918f7 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, p = _osd_xdr_decode_data_map(p, &layout->olo_map); layout->olo_comps_index = be32_to_cpup(p++); layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + iter->total_comps = layout->olo_num_comps; return 0; } -- cgit v1.1 From b861a2580da034f6a57517c687ded68e20f99763 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 3 Aug 2011 21:54:33 -0700 Subject: pnfs-obj: Bug when we are running out of bio commit 20618b21da0796115e81906d24ff1601552701b7 upstream. When we have a situation that the number of pages we want to encode is bigger then the size of the bio. (Which can currently happen only when all IO is going to a single device .e.g group_width==1) then the IO is submitted short and we report back only the amount of bytes we actually wrote/read and all is fine. BUT ... There was a bug that the current length counter was advanced before the fail to add the extra page, and we come to a situation that the CDB length was one-page longer then the actual bio size, which is of course rejected by the osd-target. While here also fix the bio size calculation, in the case that we received more then one group of devices. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/objlayout/objio_osd.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 799e8cf..1d1dc1e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -587,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, } static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, + unsigned pgbase, struct _objio_per_comp *per_dev, int len, gfp_t gfp_flags) { unsigned pg = *cur_pg; + int cur_len = len; struct request_queue *q = osd_request_queue(_io_od(ios, per_dev->dev)); - per_dev->length += cur_len; - if (per_dev->bio == NULL) { - unsigned stripes = ios->layout->num_comps / - ios->layout->mirrors_p1; - unsigned pages_in_stripe = stripes * + unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / - stripes; + ios->layout->group_width; if (BIO_MAX_PAGES_KMALLOC < bio_size) bio_size = BIO_MAX_PAGES_KMALLOC; @@ -630,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; return 0; } -- cgit v1.1 From f4bc412bc2f46d644375403b601f42d8487949da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Aug 2011 14:46:29 -0400 Subject: NFSv4.1: Fix the callback 'highest_used_slotid' behaviour commit 55a673990ec04cf63005318bcf08c2b0046e5778 upstream. Currently, there is no guarantee that we will call nfs4_cb_take_slot() even though nfs4_callback_compound() will consistently call nfs4_cb_free_slot() provided the cb_process_state has set the 'clp' field. 
The result is that we can trigger the BUG_ON() upon the next call to nfs4_cb_take_slot(). This patch fixes the above problem by using the slot id that was taken in the CB_SEQUENCE operation as a flag for whether or not we need to call nfs4_cb_free_slot(). It also fixes an atomicity problem: we need to set tbl->highest_used_slotid atomically with the check for NFS4_SESSION_DRAINING, otherwise we end up racing with the various tests in nfs4_begin_drain_session(). Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback.h | 2 +- fs/nfs/callback_proc.c | 20 ++++++++++++++------ fs/nfs/callback_xdr.c | 24 +++++++----------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b257383..07df5f1 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,6 +38,7 @@ enum nfs4_callback_opnum { struct cb_process_state { __be32 drc_status; struct nfs_client *clp; + int slotid; }; struct cb_compound_hdr_arg { @@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall( void *dummy, struct cb_process_state *cps); extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); -extern void nfs4_cb_take_slot(struct nfs_client *clp); struct cb_devicenotifyitem { uint32_t cbd_notify_type; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index d4d1954..31407ec 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -333,7 +333,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Normal */ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { slot->seq_nr++; - return htonl(NFS4_OK); + goto out_ok; } /* Replay */ @@ -352,11 +352,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Wraparound */ if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { slot->seq_nr = 1; - return htonl(NFS4_OK); + goto out_ok; } /* Misordered request */ return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); } /* @@ -418,26 +421,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res, struct cb_process_state *cps) { + struct nfs4_slot_table *tbl; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); - cps->clp = NULL; - clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); if (clp == NULL) goto out; + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { - status = NFS4ERR_DELAY; + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); goto out; } status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); if (status) goto out; + cps->slotid = args->csa_slotid; + /* * Check for pending referring calls. 
If a match is found, a * related callback was received before the response to the original @@ -454,7 +463,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - nfs4_cb_take_slot(clp); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c6c86a7..918ad64 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * Let the state manager know callback processing done. * A single slot, so highest used slotid is either 0 or -1 */ - tbl->highest_used_slotid--; + tbl->highest_used_slotid = -1; nfs4_check_drain_bc_complete(session); spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { - if (clp && clp->cl_session) - nfs4_callback_free_slot(clp->cl_session); -} - -/* A single slot, so highest used slotid is either 0 or -1 */ -void nfs4_cb_take_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; - - spin_lock(&tbl->slot_tbl_lock); - tbl->highest_used_slotid++; - BUG_ON(tbl->highest_used_slotid != 0); - spin_unlock(&tbl->slot_tbl_lock); + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); } #else /* CONFIG_NFS_V4_1 */ @@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { } #endif /* CONFIG_NFS_V4_1 */ @@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_process_state cps = { .drc_status = 0, .clp = NULL, + .slotid = -1, }; unsigned int nops = 0; @@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); - nfs4_cb_free_slot(cps.clp); + nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; -- cgit v1.1 From e25d2c749d25fc559f374766af66d267c97e0877 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Aug 2011 14:46:52 -0400 Subject: NFSv4.1: Return NFS4ERR_BADSESSION to callbacks during session resets commit 910ac68a2b80c7de95bc8488734067b1bb15d583 upstream. If the client is in the process of resetting the session when it receives a callback, then returning NFS4ERR_DELAY may cause a deadlock with the DESTROY_SESSION call. Basically, if the client returns NFS4ERR_DELAY in response to the CB_SEQUENCE call, then the server is entitled to believe that the client is busy because it is already processing that call. In that case, the server is perfectly entitled to respond with a NFS4ERR_BACK_CHAN_BUSY to any DESTROY_SESSION call. Fix this by having the client reply with a NFS4ERR_BADSESSION in response to the callback if it is resetting the session. 
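Taken together with the DRAINING check introduced in the previous patch, the status selection in nfs4_callback_sequence() now reads roughly as follows (kernel context assumed; client lookup, slot validation and the non-draining unlock path omitted):

	spin_lock(&tbl->slot_tbl_lock);
	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
		spin_unlock(&tbl->slot_tbl_lock);
		status = htonl(NFS4ERR_DELAY);
		/* Draining because of a session reset: report the session as
		 * already gone, so the server's DESTROY_SESSION is not
		 * refused with NFS4ERR_BACK_CHAN_BUSY. */
		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
			status = htonl(NFS4ERR_BADSESSION);
		goto out;
	}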
Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback_proc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 31407ec..aaa09e9 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -437,6 +437,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. + */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); goto out; } -- cgit v1.1 From a2ea18615b6929ccc884e651cd1c0e04941548bf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 14:52:27 +0000 Subject: Btrfs: detect wether a device supports discard commit d5e2003c2bcda93a8f2e668eb4642d70c9c38301 upstream. We have a problem where if a user specifies discard but doesn't actually support it we will return EOPNOTSUPP from btrfs_discard_extent. This is a problem because this gets called (in a fashion) from the tree log recovery code, which has a nice little BUG_ON(ret) after it, which causes us to fail the tree log replay. So instead detect wether our devices support discard when we're adding them and then don't issue discards if we know that the device doesn't support it. And just for good measure set ret = 0 in btrfs_issue_discard just in case we still get EOPNOTSUPP so we don't screw anybody up like this again. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 12 ++++++++++-- fs/btrfs/volumes.c | 17 +++++++++++++++++ fs/btrfs/volumes.h | 2 ++ 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456..7e20a65 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1784,6 +1784,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < multi->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, stripe->length); @@ -1791,11 +1794,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. 
+ */ + ret = 0; } kfree(multi); } - if (discarded_bytes && ret == -EOPNOTSUPP) - ret = 0; if (actual_bytes) *actual_bytes = discarded_bytes; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19450bc..43baaf0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -500,6 +500,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->rw_devices--; } + if (device->can_discard) + fs_devices->num_can_discard--; + new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); @@ -508,6 +511,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; + new_device->can_discard = 0; list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -547,6 +551,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { + struct request_queue *q; struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; @@ -603,6 +608,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, seeding = 0; } + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; @@ -1542,6 +1553,7 @@ error: int btrfs_init_new_device(struct btrfs_root *root, char *device_path) { + struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -1611,6 +1623,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) lock_chunks(root); + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; device->writeable = 1; device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); @@ -1646,6 +1661,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_devices++; root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; if (!blk_queue_nonrot(bdev_get_queue(bdev))) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61..6d866db 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -48,6 +48,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; int missing; + int can_discard; spinlock_t io_lock; @@ -104,6 +105,7 @@ struct btrfs_fs_devices { u64 rw_devices; u64 missing_devices; u64 total_rw_bytes; + u64 num_can_discard; struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex -- cgit v1.1 From 966ef7daecee021ffded11d584e57160ee0395c2 Mon Sep 17 00:00:00 2001 From: liubo Date: Sat, 6 Aug 2011 08:35:23 +0000 Subject: Btrfs: fix an oops of log replay commit 34f3e4f23ca3d259fe078f62a128d97ca83508ef upstream. When btrfs recovers from a crash, it may hit the oops below: ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:4580! [...] RIP: 0010:[] [] btrfs_add_link+0x161/0x1c0 [btrfs] [...] Call Trace: [] ? 
btrfs_inode_ref_index+0x31/0x80 [btrfs] [] add_inode_ref+0x319/0x3f0 [btrfs] [] replay_one_buffer+0x2c7/0x390 [btrfs] [] walk_down_log_tree+0x32a/0x480 [btrfs] [] walk_log_tree+0xf5/0x240 [btrfs] [] btrfs_recover_log_trees+0x250/0x350 [btrfs] [] ? btrfs_recover_log_trees+0x350/0x350 [btrfs] [] open_ctree+0x1442/0x17d0 [btrfs] [...] This comes from that while replaying an inode ref item, we forget to check those old conflicting DIR_ITEM and DIR_INDEX items in fs/file tree, then we will come to conflict corners which lead to BUG_ON(). Signed-off-by: Liu Bo Tested-by: Andy Lutomirski Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f..7fa128d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir; - int ret; struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *dir; struct inode *inode; - char *name; - int namelen; unsigned long ref_ptr; unsigned long ref_end; + char *name; + int namelen; + int ret; int search_done = 0; /* @@ -909,6 +910,25 @@ again: } btrfs_release_path(path); + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + insert: /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, -- cgit v1.1 From 2fb522e963f57a6c0206f67a355b8131b16ef607 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Sat, 13 Aug 2011 11:25:18 -0400 Subject: ext4: Fix ext4_should_writeback_data() for no-journal mode commit 441c850857148935babe000fc2ba1455fe54a6a9 upstream. ext4_should_writeback_data() had an incorrect sequence of tests to determine if it should return 0 or 1: in particular, even in no-journal mode, 0 was being returned for a non-regular-file inode. This meant that, in non-journal mode, we would use ext4_journalled_aops for directories, symlinks, and other non-regular files. However, calling journalled aop callbacks when there is no valid handle, can cause problems. This would cause a kernel crash with Jan Kara's commit 2d859db3e4 ("ext4: fix data corruption in inodes with journalled data"), because we now dereference 'handle' in ext4_journalled_write_end(). I also added BUG_ONs to check for a valid handle in the obviously journal-only aops callbacks. I tested this running xfstests with a scratch device in these modes: - no-journal - data=ordered - data=writeback - data=journal All work fine; the data=journal run has many failures and a crash in xfstests 074, but this is no different from a vanilla kernel. 
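After the fix, the test order in ext4_should_writeback_data() is effectively the following (reproduced here in readable form; the actual hunk follows below):

	static inline int ext4_should_writeback_data(struct inode *inode)
	{
		/* No journal at all: every inode, regular or not, uses the
		 * ordinary writeback path -- this check must come first. */
		if (EXT4_JOURNAL(inode) == NULL)
			return 1;
		/* With a journal present, non-regular files are not treated
		 * as writeback data, as before. */
		if (!S_ISREG(inode->i_mode))
			return 0;
		if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
			return 0;
		if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			return 1;
		return 0;
	}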
Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.h | 4 ++-- fs/ext4/inode.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index bb85757..5802fa1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { - if (!S_ISREG(inode->i_mode)) - return 0; if (EXT4_JOURNAL(inode) == NULL) return 1; + if (!S_ISREG(inode->i_mode)) + return 0; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3126c0..0b60a46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1849,6 +1849,8 @@ static int ext4_journalled_write_end(struct file *file, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + BUG_ON(!ext4_handle_valid(handle)); + if (copied < len) { if (!PageUptodate(page)) copied = 0; @@ -2564,6 +2566,8 @@ static int __ext4_journalled_writepage(struct page *page, goto out; } + BUG_ON(!ext4_handle_valid(handle)); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, do_journal_get_write_access); -- cgit v1.1 From 2526f368949bccda6e8ed1bf74a4e955e3af42af Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Sat, 13 Aug 2011 12:17:13 -0400 Subject: ext4: call ext4_ioend_wait and ext4_flush_completed_IO in ext4_evict_inode commit 2581fdc810889fdea97689cb62481201d579c796 upstream. Flush inode's i_completed_io_list before calling ext4_io_wait to prevent the following deadlock scenario: A page fault happens while some process is writing inode A. During page fault, shrink_icache_memory is called that in turn evicts another inode B. Inode B has some pending io_end work so it calls ext4_ioend_wait() that waits for inode B's i_ioend_count to become zero. However, inode B's ioend work was queued behind some of inode A's ioend work on the same cpu's ext4-dio-unwritten workqueue. As the ext4-dio-unwritten thread on that cpu is processing inode A's ioend work, it tries to grab inode A's i_mutex lock. Since the i_mutex lock of inode A is still hold before the page fault happened, we enter a deadlock. Also moves ext4_flush_completed_IO and ext4_ioend_wait from ext4_destroy_inode() to ext4_evict_inode(). During inode deleteion, ext4_evict_inode() is called before ext4_destroy_inode() and in ext4_evict_inode(), we may call ext4_truncate() without holding i_mutex lock. As a result, there is a race between flush_completed_IO that is called from ext4_ext_truncate() and ext4_end_io_work, which may cause corruption on an io_end structure. This change moves ext4_flush_completed_IO and ext4_ioend_wait from ext4_destroy_inode() to ext4_evict_inode() to resolve the race between ext4_truncate() and ext4_end_io_work during inode deletion. 
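In code, the ordering this patch establishes at the top of ext4_evict_inode() is (simplified; the hunk follows below):

	trace_ext4_evict_inode(inode);

	/* Process this inode's i_completed_io_list synchronously, under
	 * i_mutex, instead of leaving it to the shared ext4-dio-unwritten
	 * workqueue, which may itself be blocked on another inode's
	 * i_mutex. */
	mutex_lock(&inode->i_mutex);
	ext4_flush_completed_IO(inode);
	mutex_unlock(&inode->i_mutex);

	/* Only then wait for the remaining io_end references to drop,
	 * before any truncation work can race with ext4_end_io_work. */
	ext4_ioend_wait(inode);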
Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 6 ++++++ fs/ext4/super.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0b60a46..773de46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -189,6 +189,12 @@ void ext4_evict_inode(struct inode *inode) int err; trace_ext4_evict_inode(inode); + + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + ext4_ioend_wait(inode); + if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa..111ed9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -892,7 +892,6 @@ static void ext4_i_callback(struct rcu_head *head) static void ext4_destroy_inode(struct inode *inode) { - ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", -- cgit v1.1 From 4eddd2a50f2548c6c83081fde8fdbc3de07626f6 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 13 Aug 2011 12:30:59 -0400 Subject: ext4: Resolve the hang of direct i/o read in handling EXT4_IO_END_UNWRITTEN. commit 32c80b32c053dc52712dedac5e4d0aa7c93fc353 upstream. EXT4_IO_END_UNWRITTEN flag set and the increase of i_aiodio_unwritten should be done simultaneously since ext4_end_io_nolock always clear the flag and decrease the counter in the same time. We don't increase i_aiodio_unwritten when setting EXT4_IO_END_UNWRITTEN so it will go nagative and causes some process to wait forever. Part of the patch came from Eric in his e-mail, but it doesn't fix the problem met by Michael actually. http://marc.info/?l=linux-ext4&m=131316851417460&w=2 Reported-and-Tested-by: Michael Tokarev Signed-off-by: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 9 ++++++++- fs/ext4/page-io.c | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 773de46..5390b4d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3645,8 +3645,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_END_UNWRITTEN; + /* + * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, + * but being more careful is always safe for the future change. 
+ */ inode = io_end->inode; + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76..97e5e98 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -338,8 +338,10 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh)) - io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); -- cgit v1.1 From 45df4b8977852ea12d6ed19f6c87e6765f6c31e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Aug 2011 12:58:21 -0400 Subject: ext4: fix nomblk_io_submit option so it correctly converts uninit blocks commit 9dd75f1f1a02d656a11a7b9b9e6c2759b9c1e946 upstream. Bug discovered by Jan Kara: Finally, commit 1449032be17abb69116dbc393f67ceb8bd034f92 returned back the old IO submission code but apparently it forgot to return the old handling of uninitialized buffers so we unconditionnaly call block_write_full_page() without specifying end_io function. So AFAICS we never convert unwritten extents to written in some cases. For example when I mount the fs as: mount -t ext4 -o nomblk_io_submit,dioread_nolock /dev/ubdb /mnt and do int fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, 0600); char buf[1024]; memset(buf, 'a', sizeof(buf)); fallocate(fd, 0, 0, 16384); write(fd, buf, sizeof(buf)); I get a file full of zeros (after remounting the filesystem so that pagecache is dropped) instead of seeing the first KB contain 'a's. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5390b4d..b864839 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2156,7 +2156,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); - else + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else err = block_write_full_page(page, noalloc_get_block_write, mpd->wbc); -- cgit v1.1 From 4ca4e8168092fcf2c352b25556e786762668a2a4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 24 Aug 2011 10:20:17 +0200 Subject: fuse: check size of FUSE_NOTIFY_INVAL_ENTRY message commit c2183d1e9b3f313dd8ba2b1b0197c8d9fb86a7ae upstream. FUSE_NOTIFY_INVAL_ENTRY didn't check the length of the write so the message processing could overrun and result in a "kernel BUG at fs/fuse/dev.c:629!" 
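The added check, with its surrounding context in fuse_notify_inval_entry(), boils down to the following (kernel context assumed):

	if (outarg.namelen > FUSE_NAME_MAX)
		goto err;

	err = -EINVAL;
	/* The request must carry exactly the fixed-size header plus
	 * namelen name bytes and a trailing NUL; a shorter write would
	 * otherwise let the name copy below run past the message. */
	if (size != sizeof(outarg) + outarg.namelen + 1)
		goto err;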
Reported-by: Han-Wen Nienhuys Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 640fc22..168a80f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1358,6 +1358,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, if (outarg.namelen > FUSE_NAME_MAX) goto err; + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); -- cgit v1.1 From 97e5a85664252fd3f3f84751c7779c326df9c851 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 10 Sep 2011 17:20:21 +1000 Subject: Avoid dereferencing a 'request_queue' after last close. commit 94007751bb02797ba87bac7aacee2731ac2039a3 upstream. On the last close of an 'md' device which as been stopped, the device is destroyed and in particular the request_queue is freed. The free is done in a separate thread so it might happen a short time later. __blkdev_put calls bdev_inode_switch_bdi *after* ->release has been called. Since commit f758eeabeb96f878c860e8f110f94ec8820822a9 bdev_inode_switch_bdi will dereference the 'old' bdi, which lives inside a request_queue, to get a spin lock. This causes the last close on an md device to sometime take a spin_lock which lives in freed memory - which results in an oops. So move the called to bdev_inode_switch_bdi before the call to ->release. Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Andrew Morton Cc: Wu Fengguang Acked-by: Wu Fengguang Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 610e8e0..194cf66 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1419,6 +1419,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -1432,8 +1437,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; -- cgit v1.1 From e38b21e76bb89bba3e59da494de586aac23ad430 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 11 Jul 2011 16:40:59 +0000 Subject: fs/9p: When doing inode lookup compare qid details and inode mode bits. commit fd2421f54423f307ecd31bdebdca6bc317e0c492 upstream. This make sure we don't use wrong inode from the inode hash. The inode number of the file deleted is reused by the next file system object created and if we only use inode number for inode hash lookup we could end up with wrong struct inode. Also compare inode generation number. Not all Linux file system provide st_gen in userspace. 
So it could be 0; Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/cache.c | 20 +++++++++----------- fs/9p/cache.h | 9 --------- fs/9p/v9fs.c | 2 +- fs/9p/v9fs.h | 2 +- fs/9p/vfs_inode.c | 36 +++++++++++++++++++++++++++++++++--- fs/9p/vfs_inode_dotl.c | 41 +++++++++++++++++++++++++++++++++++++---- 6 files changed, 81 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 5b335c5..945aa5f 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -108,11 +108,10 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->fscache_key->path, - sizeof(v9inode->fscache_key->path)); + memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, - v9inode->fscache_key->path); - return sizeof(v9inode->fscache_key->path); + v9inode->qid.path); + return sizeof(v9inode->qid.path); } static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, @@ -129,11 +128,10 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, void *buffer, uint16_t buflen) { const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->fscache_key->version, - sizeof(v9inode->fscache_key->version)); + memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, - v9inode->fscache_key->version); - return sizeof(v9inode->fscache_key->version); + v9inode->qid.version); + return sizeof(v9inode->qid.version); } static enum @@ -143,11 +141,11 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; - if (buflen != sizeof(v9inode->fscache_key->version)) + if (buflen != sizeof(v9inode->qid.version)) return FSCACHE_CHECKAUX_OBSOLETE; - if (memcmp(buffer, &v9inode->fscache_key->version, - sizeof(v9inode->fscache_key->version))) + if (memcmp(buffer, &v9inode->qid.version, + sizeof(v9inode->qid.version))) return FSCACHE_CHECKAUX_OBSOLETE; return FSCACHE_CHECKAUX_OKAY; diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 049507a..40cc54c 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -93,15 +93,6 @@ static inline void v9fs_uncache_page(struct inode *inode, struct page *page) BUG_ON(PageFsCache(page)); } -static inline void v9fs_fscache_set_key(struct inode *inode, - struct p9_qid *qid) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - spin_lock(&v9inode->fscache_lock); - v9inode->fscache_key = qid; - spin_unlock(&v9inode->fscache_lock); -} - static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index c82b017..8b7c6be 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -487,8 +487,8 @@ static void v9fs_inode_init_once(void *foo) struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - v9inode->fscache_key = NULL; #endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->vfs_inode); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index e5ebedf..5d7392e 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -125,8 +125,8 @@ struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE spinlock_t fscache_lock; struct fscache_cookie *fscache; - struct p9_qid *fscache_key; #endif + struct p9_qid qid; unsigned int cache_validity; struct p9_fid 
*writeback_fid; struct mutex v_mutex; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7f6c677..88dbf07 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -216,7 +216,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) return NULL; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - v9inode->fscache_key = NULL; spin_lock_init(&v9inode->fscache_lock); #endif v9inode->writeback_fid = NULL; @@ -433,6 +432,37 @@ void v9fs_evict_inode(struct inode *inode) } } +static int v9fs_test_inode(struct inode *inode, void *data) +{ + int umode; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st->mode); + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_qid *qid, struct p9_wstat *st) @@ -443,7 +473,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, struct v9fs_session_info *v9ses = sb->s_fs_info; i_ino = v9fs_qid2ino(qid); - inode = iget_locked(sb, i_ino); + inode = iget5_locked(sb, i_ino, v9fs_test_inode, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -453,6 +483,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, * FIXME!! we may need support for stale inodes * later. 
*/ + inode->i_ino = i_ino; umode = p9mode2unixmode(v9ses, st->mode); retval = v9fs_init_inode(v9ses, inode, umode); if (retval) @@ -460,7 +491,6 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, v9fs_stat2inode(st, inode, sb); #ifdef CONFIG_9P_FSCACHE - v9fs_fscache_set_key(inode, &st->qid); v9fs_cache_inode_get_cookie(inode); #endif unlock_new_inode(inode); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 691c78f..caa63ef 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -86,6 +86,38 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) return dentry; } +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + return 0; + + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct p9_qid *qid, struct p9_fid *fid, @@ -97,7 +129,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct v9fs_session_info *v9ses = sb->s_fs_info; i_ino = v9fs_qid2ino(qid); - inode = iget_locked(sb, i_ino); + inode = iget5_locked(sb, i_ino, v9fs_test_inode_dotl, + v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -107,13 +140,13 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, * FIXME!! we may need support for stale inodes * later. */ + inode->i_ino = i_ino; retval = v9fs_init_inode(v9ses, inode, st->st_mode); if (retval) goto error; v9fs_stat2inode_dotl(st, inode); #ifdef CONFIG_9P_FSCACHE - v9fs_fscache_set_key(inode, &st->qid); v9fs_cache_inode_get_cookie(inode); #endif retval = v9fs_get_acl(inode, fid); @@ -136,7 +169,7 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct p9_stat_dotl *st; struct inode *inode = NULL; - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); if (IS_ERR(st)) return ERR_CAST(st); @@ -547,7 +580,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_blocks = stat->st_blocks; } if (stat->st_result_mask & P9_STATS_GEN) - inode->i_generation = stat->st_gen; + inode->i_generation = stat->st_gen; /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. -- cgit v1.1 From 6170eea647404a2e5726ae81cc8ccc45db89812d Mon Sep 17 00:00:00 2001 From: Prem Karat Date: Fri, 6 May 2011 18:24:18 +0530 Subject: fs/9p: Fix invalid mount options/args commit a2dd43bb0d7b9ce28f8a39254c25840c0730498e upstream. Without this fix, if any invalid mount options/args are passed while mouting the 9p fs, no error (-EINVAL) is returned and default arg value is assigned. This fix returns -EINVAL when an invalid arguement is found while parsing mount options. 
Signed-off-by: Prem Karat Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 8b7c6be..ef96618 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -78,6 +78,25 @@ static const match_table_t tokens = { {Opt_err, NULL} }; +/* Interpret mount options for cache mode */ +static int get_cache_mode(char *s) +{ + int version = -EINVAL; + + if (!strcmp(s, "loose")) { + version = CACHE_LOOSE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + } else if (!strcmp(s, "fscache")) { + version = CACHE_FSCACHE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "none")) { + version = CACHE_NONE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + } else + printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + return version; +} + /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information @@ -97,7 +116,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) /* setup defaults */ v9ses->afid = ~0; v9ses->debug = 0; - v9ses->cache = 0; + v9ses->cache = CACHE_NONE; #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif @@ -171,13 +190,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) "problem allocating copy of cache arg\n"); goto free_and_return; } + ret = get_cache_mode(s); + if (ret == -EINVAL) { + kfree(s); + goto free_and_return; + } - if (strcmp(s, "loose") == 0) - v9ses->cache = CACHE_LOOSE; - else if (strcmp(s, "fscache") == 0) - v9ses->cache = CACHE_FSCACHE; - else - v9ses->cache = CACHE_NONE; + v9ses->cache = ret; kfree(s); break; @@ -200,9 +219,15 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) } else { v9ses->flags |= V9FS_ACCESS_SINGLE; v9ses->uid = simple_strtoul(s, &e, 10); - if (*e != '\0') - v9ses->uid = ~0; + if (*e != '\0') { + ret = -EINVAL; + printk(KERN_INFO "9p: Unknown access " + "argument %s.\n", s); + kfree(s); + goto free_and_return; + } } + kfree(s); break; -- cgit v1.1 From 926fa0b4b9cd6537f4750642165647aa20cfeae4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 6 Jul 2011 16:32:31 +0530 Subject: fs/9p: Always ask new inode in create commit ed80fcfac2565fa866d93ba14f0e75de17a8223e upstream. This make sure we don't end up reusing the unlinked inode object. The ideal way is to use inode i_generation. But i_generation is not available in userspace always. 
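For context, a sketch of the mechanism being leaned on (prototype as in fs/inode.c; the callback mirrors the one added below): iget5_locked() runs the supplied test callback against every in-core inode hashed at the same value and only allocates a new inode when none matches, so a test that always fails forces a fresh inode and can never hand back a stale one left over from a racing unlink.

struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
			   int (*test)(struct inode *, void *),
			   int (*set)(struct inode *, void *), void *data);

/* A test callback that never matches: create paths get a brand-new
 * in-core inode even if one with the same qid-derived hash still exists. */
static int v9fs_test_new_inode(struct inode *inode, void *data)
{
	return 0;
}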
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs.h | 27 +++++++++++++++++++++++---- fs/9p/vfs_inode.c | 22 +++++++++++++++++----- fs/9p/vfs_inode_dotl.c | 30 +++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 5d7392e..e78956c 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -153,13 +153,13 @@ extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb); + struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb); + struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 @@ -201,8 +201,27 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_from_fid_dotl(v9ses, fid, sb); + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else - return v9fs_inode_from_fid(v9ses, fid, sb); + return v9fs_inode_from_fid(v9ses, fid, sb, 0); } + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); +} + #endif diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 88dbf07..35d4121 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -454,6 +454,11 @@ static int v9fs_test_inode(struct inode *inode, void *data) return 1; } +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + static int v9fs_set_inode(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); @@ -465,15 +470,22 @@ static int v9fs_set_inode(struct inode *inode, void *data) static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_qid *qid, - struct p9_wstat *st) + struct p9_wstat *st, + int new) { int retval, umode; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; i_ino = v9fs_qid2ino(qid); - inode = iget5_locked(sb, i_ino, v9fs_test_inode, v9fs_set_inode, st); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -504,7 +516,7 @@ error: struct inode * v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) + struct super_block *sb, int new) { struct p9_wstat *st; struct inode *inode = NULL; @@ -513,7 +525,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, if (IS_ERR(st)) return ERR_CAST(st); - inode = v9fs_qid_iget(sb, &st->qid, st); + inode = v9fs_qid_iget(sb, &st->qid, st, new); 
p9stat_free(st); kfree(st); return inode; @@ -615,7 +627,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index caa63ef..bec75f0 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -108,6 +108,12 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) return 1; } +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + static int v9fs_set_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); @@ -121,16 +127,22 @@ static int v9fs_set_inode_dotl(struct inode *inode, void *data) static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct p9_qid *qid, struct p9_fid *fid, - struct p9_stat_dotl *st) + struct p9_stat_dotl *st, + int new) { int retval; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; i_ino = v9fs_qid2ino(qid); - inode = iget5_locked(sb, i_ino, v9fs_test_inode_dotl, - v9fs_set_inode_dotl, st); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -164,7 +176,7 @@ error: struct inode * v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) + struct super_block *sb, int new) { struct p9_stat_dotl *st; struct inode *inode = NULL; @@ -173,7 +185,7 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, if (IS_ERR(st)) return ERR_CAST(st); - inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st); + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); kfree(st); return inode; } @@ -263,7 +275,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = NULL; goto error; } - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -383,7 +395,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, goto error; } - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -636,7 +648,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -789,7 +801,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; } - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", -- cgit v1.1 From 7c0e1afbe36ec6d067638daafe6e7ee30085202b Mon Sep 17 00:00:00 2001 
From: Al Viro Date: Sat, 23 Jul 2011 02:28:13 -0400 Subject: 9p: close ACL leaks commit 1ec95bf34d976b38897d1977b155a544d77b05e7 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/9p/acl.c | 22 +++++++++++++--------- fs/9p/acl.h | 6 +++--- fs/9p/vfs_inode_dotl.c | 9 ++++++--- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 535ab6e..4a866cd 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -185,12 +185,15 @@ int v9fs_acl_chmod(struct dentry *dentry) } int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, struct posix_acl *pacl) + struct posix_acl **dpacl, struct posix_acl **pacl) { - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); - posix_acl_release(dpacl); - posix_acl_release(pacl); + if (dentry) { + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl); + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl); + } + posix_acl_release(*dpacl); + posix_acl_release(*pacl); + *dpacl = *pacl = NULL; return 0; } @@ -212,11 +215,11 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep, struct posix_acl *clone; if (S_ISDIR(mode)) - *dpacl = acl; + *dpacl = posix_acl_dup(acl); clone = posix_acl_clone(acl, GFP_NOFS); - retval = -ENOMEM; + posix_acl_release(acl); if (!clone) - goto cleanup; + return -ENOMEM; retval = posix_acl_create_masq(clone, &mode); if (retval < 0) { @@ -225,11 +228,12 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep, } if (retval > 0) *pacl = clone; + else + posix_acl_release(clone); } *modep = mode; return 0; cleanup: - posix_acl_release(acl); return retval; } diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 7ef3ac9..c47ea9c 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -19,7 +19,7 @@ extern int v9fs_get_acl(struct inode *, struct p9_fid *); extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, - struct posix_acl *, struct posix_acl *); + struct posix_acl **, struct posix_acl **); extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else @@ -33,8 +33,8 @@ static inline int v9fs_acl_chmod(struct dentry *dentry) return 0; } static inline int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, - struct posix_acl *pacl) + struct posix_acl **dpacl, + struct posix_acl **pacl) { return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index bec75f0..185ce37 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -287,7 +287,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); @@ -328,6 +328,7 @@ error: err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -421,12 +422,13 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, d_instantiate(dentry, inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -826,10 +828,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, d_instantiate(dentry, 
inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } -- cgit v1.1 From e279cdca3ca67dcd8e24051629163f9d5d838894 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Jul 2011 18:06:32 +0000 Subject: fs/9p: Add fid before dentry instantiation commit 5441ae5eb3614d3c28f77073370738a2820c88e4 upstream. d_instantiate marks the dentry positive. So a parallel lookup and mkdir of the directory can find dentry that doesn't have fid attached. This can result in both the code path doing v9fs_fid_add which results in v9fs_dentry leak. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_inode.c | 4 +--- fs/9p/vfs_inode_dotl.c | 8 ++++---- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 35d4121..ad7aae4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -633,13 +633,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; - + d_instantiate(dentry, inode); return ofid; - error: if (ofid) p9_client_clunk(ofid); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 185ce37..0a17235 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -281,10 +281,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, &dacl, &pacl); @@ -403,10 +403,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* @@ -657,10 +657,10 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* Not in cached mode. No need to populate inode with stat */ @@ -810,10 +810,10 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* -- cgit v1.1 From 29a3e8657d2a2640384166e3fe29a086d235fc33 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Jul 2011 18:06:33 +0000 Subject: fs/9p: Don't update file type when updating file attributes commit 45089142b1497dab2327d60f6c71c40766fc3ea4 upstream. We should only update attributes that we can change on stat2inode. Also do file type initialization in v9fs_init_inode. 
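Reduced to a sketch (an illustrative helper, not a function the patch adds): on attribute refresh only the permission bits reported by the server are taken over, while the file-type bits already recorded in the inode are preserved.

#include <sys/types.h>

/* S_IALLUGO is 07777: suid/sgid/sticky plus the rwx bits. */
static mode_t merge_server_mode(mode_t inode_mode, mode_t server_mode)
{
	mode_t mode = server_mode & 07777;	/* permissions from the server */
	mode |= inode_mode & ~(mode_t)07777;	/* S_IFMT (file type) kept     */
	return mode;
}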
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs_vfs.h | 4 +-- fs/9p/vfs_inode.c | 91 +++++++++++++++++++++++++++----------------------- fs/9p/vfs_inode_dotl.c | 23 ++++++++----- fs/9p/vfs_super.c | 2 +- 4 files changed, 68 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 4014160..bd86d22 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode); +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode); + struct inode *inode, int mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index ad7aae4..1ab561d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) /** * p9mode2unixmode- convert plan9 mode bits to unix mode bits * @v9ses: v9fs session information - * @mode: mode to convert + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. * */ - -static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) +static int p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; + int mode = stat->mode; - res = mode & 0777; + res = mode & S_IALLUGO; + *rdev = 0; if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) && (v9ses->nodev == 0)) res |= S_IFIFO; else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) - && (v9ses->nodev == 0)) - res |= S_IFBLK; - else + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strncpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %u %u", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c %s\n", type, + stat->extension); + }; + *rdev = MKDEV(major, minor); + } else res |= S_IFREG; if (v9fs_proto_dotu(v9ses)) { @@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMSETVTX) == P9_DMSETVTX) res |= S_ISVTX; } - return res; } @@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode) + struct inode *inode, int mode, dev_t rdev) { int err = 0; inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; - inode->i_rdev = 0; + inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &v9fs_addr_operations; @@ -335,7 +354,7 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode) +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) { int err; struct inode *inode; @@ -348,7 +367,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } - err = v9fs_init_inode(v9ses, 
inode, mode); + err = v9fs_init_inode(v9ses, inode, mode, rdev); if (err) { iput(inode); return ERR_PTR(err); @@ -435,11 +454,12 @@ void v9fs_evict_inode(struct inode *inode) static int v9fs_test_inode(struct inode *inode, void *data) { int umode; + dev_t rdev; struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_wstat *st = (struct p9_wstat *)data; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - umode = p9mode2unixmode(v9ses, st->mode); + umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) return 0; @@ -473,6 +493,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_wstat *st, int new) { + dev_t rdev; int retval, umode; unsigned long i_ino; struct inode *inode; @@ -496,8 +517,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, * later. */ inode->i_ino = i_ino; - umode = p9mode2unixmode(v9ses, st->mode); - retval = v9fs_init_inode(v9ses, inode, umode); + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); if (retval) goto error; @@ -990,7 +1011,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return PTR_ERR(st); v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(dentry->d_inode, stat); p9stat_free(st); kfree(st); @@ -1074,6 +1095,7 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { + mode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; @@ -1109,31 +1131,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_nlink = i_nlink; } } - inode->i_mode = p9mode2unixmode(v9ses, stat->mode); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { - char type = 0; - int major = -1; - int minor = -1; - - strncpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - inode->i_mode &= ~S_IFBLK; - inode->i_mode |= S_IFCHR; - break; - case 'b': - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); - }; - inode->i_rdev = MKDEV(major, minor); - init_special_inode(inode, inode->i_mode, inode->i_rdev); - } else - inode->i_rdev = 0; - + mode = stat->mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ @@ -1399,6 +1399,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { + int umode; + dev_t rdev; loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; @@ -1407,6 +1409,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) st = p9_client_stat(fid); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -1418,6 +1426,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: p9stat_free(st); kfree(st); return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0a17235..818bf6f 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, * later. 
*/ inode->i_ino = i_ino; - retval = v9fs_init_inode(v9ses, inode, st->st_mode); + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); if (retval) goto error; @@ -414,7 +415,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, * inode with stat. We need to get an inode * so that we can set the acl with dentry */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -540,6 +541,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { + mode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -552,11 +554,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; inode->i_nlink = stat->st_nlink; - inode->i_mode = stat->st_mode; - inode->i_rdev = new_decode_dev(stat->st_rdev); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -664,7 +665,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, fid = NULL; } else { /* Not in cached mode. No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -820,7 +821,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, * Not in cached mode. No need to populate inode with stat. * socket syscall returns a fd, so we need instantiate */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, rdev); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -886,6 +887,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -897,6 +903,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: kfree(st); return 0; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index feef6cd..c70251d 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode(sb, S_IFDIR | mode); + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; -- cgit v1.1 From a111278ea956e42e2d1ac4b4f04a2c71d322235b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 3 Aug 2011 19:55:32 +0530 Subject: fs/9p: Add OS dependent open flags in 9p protocol commit f88657ce3f9713a0c62101dffb0e972a979e77b9 upstream. 
Some of the flags are OS/arch dependent we add a 9p protocol value which maps to asm-generic/fcntl.h values in Linux Based on the original patch from Venkateswararao Jujjuri [extra comments from author as to why this needs to go to stable: Earlier for different operation such as open we used the values of open flag as defined by the OS. But some of these flags such as O_DIRECT are arch dependent. So if we have the 9p client and server running on different architectures, we end up with client sending client architecture value of these open flag and server will try to map these values to what its architecture states. For ex: O_DIRECT on a x86 client maps to #define O_DIRECT 00040000 Where as on sparc server it will maps to #define O_DIRECT 0x100000 Hence we need to map these open flags to OS/arch independent flag values. Getting these changes to an early version of kernel ensures us that we work with different combination of client and server. We should ideally backport this patch to all possible kernel version.] Signed-off-by: Aneesh Kumar K.V Signed-off-by: Harsh Prateek Bora Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs_vfs.h | 2 ++ fs/9p/vfs_file.c | 2 +- fs/9p/vfs_inode_dotl.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bd86d22..f9a28ea 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -82,4 +82,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; return; } + +int v9fs_open_to_dotl_flags(int flags); #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index ffed558..56907e4 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) - omode = file->f_flags; + omode = v9fs_open_to_dotl_flags(file->f_flags); else omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 818bf6f..c873172 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -191,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, return inode; } +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_TRUNC, P9_DOTL_TRUNC }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. 
+ * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created @@ -259,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, "Failed to get acl values in creat %d\n", err); goto error; } - err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", -- cgit v1.1 From 8bdb14f9c33bb0ad8b37a9465c968ccce4ecda3b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 16 Aug 2011 22:19:28 +0530 Subject: fs/9p: Always ask new inode in lookup for cache mode disabled commit 73f507171cfa407b19f254aef95cbb058c8180cf upstream. This make sure we don't end up reusing the unlinked inode object. The ideal way is to use inode i_generation. But i_generation is not available in userspace always. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_inode.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 1ab561d..c72e20c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -799,6 +799,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nameidata) { + struct dentry *res; struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; @@ -830,22 +831,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(result); } - - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { result = PTR_ERR(inode); inode = NULL; goto error; } - result = v9fs_fid_add(dentry, fid); if (result < 0) goto error_iput; - inst_out: - d_add(dentry, inode); - return NULL; - + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. + */ + res = d_materialise_unique(dentry, inode); + if (!IS_ERR(res)) + return res; + result = PTR_ERR(res); error_iput: iput(inode); error: -- cgit v1.1 From 101e357617f790d36b999dada08c437fd2431c3c Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Sun, 21 Aug 2011 00:21:18 +0530 Subject: fs/9p: Use protocol-defined value for lock/getlock 'type' field. commit 51b8b4fb32271d39fbdd760397406177b2b0fd36 upstream. 
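The reason for the mapping, with a sketch of the translation (the switch below mirrors the hunk that follows): F_RDLCK/F_WRLCK/F_UNLCK are OS-defined numeric values, while 9P2000.L fixes its own on-the-wire P9_LOCK_TYPE_* values, so the client must translate explicitly rather than pass the VFS constant through and hope the server's OS agrees.

/* Kernel-side sketch; F_* come from <linux/fcntl.h>, P9_LOCK_TYPE_* from
 * <net/9p/9p.h>. */
static int vfs_to_p9_lock_type(int fl_type)
{
	switch (fl_type) {
	case F_RDLCK:
		return P9_LOCK_TYPE_RDLCK;
	case F_WRLCK:
		return P9_LOCK_TYPE_WRLCK;
	default:
		return P9_LOCK_TYPE_UNLCK;
	}
}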
Signed-off-by: Jim Garlick Signed-off-by: Aneesh Kumar K.V Signed-off-by: Harsh Prateek Bora Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_file.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 56907e4..9d6e168 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); - flock.type = fl->fl_type; + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } flock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) flock.length = 0; @@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) /* convert posix lock to p9 tgetlock args */ memset(&glock, 0, sizeof(glock)); - glock.type = fl->fl_type; + glock.type = P9_LOCK_TYPE_UNLCK; glock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) glock.length = 0; @@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) return res; - if (glock.type != F_UNLCK) { - fl->fl_type = glock.type; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { fl->fl_start = glock.start; if (glock.length == 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = glock.proc_id; - } else - fl->fl_type = F_UNLCK; - + } return res; } -- cgit v1.1 From 97abc52eb26f2ae8d033100c4c024e6138bdfd60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Sep 2011 18:55:41 +0100 Subject: restore pinning the victim dentry in vfs_rmdir()/vfs_rename_dir() commit 1d2ef5901483004d74947bbf78d5146c24038fe7 upstream. We used to get the victim pinned by dentry_unhash() prior to commit 64252c75a219 ("vfs: remove dget() from dentry_unhash()") and ->rmdir() and ->rename() instances relied on that; most of them don't care, but ones that used d_delete() themselves do. As the result, we are getting rmdir() oopses on NFS now. Just grab the reference before locking the victim and drop it explicitly after unlocking, same as vfs_rename_other() does. 
Signed-off-by: Al Viro Tested-by: Simon Kirby Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 14ab8d3..b456c7a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2582,6 +2582,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; + dget(dentry); mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; @@ -2602,6 +2603,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) out: mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); if (!error) d_delete(dentry); return error; @@ -3005,6 +3007,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; + dget(new_dentry); if (target) mutex_lock(&target->i_mutex); @@ -3025,6 +3028,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, out: if (target) mutex_unlock(&target->i_mutex); + dput(new_dentry); if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); -- cgit v1.1 From 862bee39ef540bd3a7772b899b4fa7f3036abf0d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Aug 2011 07:21:28 -0400 Subject: cifs: fix possible memory corruption in CIFSFindNext commit 9438fabb73eb48055b58b89fc51e0bc4db22fabd upstream. The name_len variable in CIFSFindNext is a signed int that gets set to the resume_name_len in the cifs_search_info. The resume_name_len however is unsigned and for some infolevels is populated directly from a 32 bit value sent by the server. If the server sends a very large value for this, then that value could look negative when converted to a signed int. That would make that value pass the PATH_MAX check later in CIFSFindNext. The name_len would then be used as a length value for a memcpy. It would then be treated as unsigned again, and the memcpy scribbles over a ton of memory. Fix this by making the name_len an unsigned value in CIFSFindNext. Reported-by: Darren Lavender Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifssmb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1a9fe7f..07132c4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4079,7 +4079,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, T2_FNEXT_RSP_PARMS *parms; char *response_data; int rc = 0; - int bytes_returned, name_len; + int bytes_returned; + unsigned int name_len; __u16 params, byte_count; cFYI(1, "In FindNext"); -- cgit v1.1 From 562960c731b9ffba8cd200be8d38f5f793b23d74 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 29 Aug 2011 18:54:12 +0000 Subject: Fix the conflict between rwpidforward and rw mount options commit c9c7fa0064f4afe1d040e72f24c2256dd8ac402d upstream. Both these options are started with "rw" - that's why the first one isn't switched on even if it is specified. Fix this by adding a length check for "rw" option check. 
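A minimal userspace reduction of the bug, not part of the patch (strncasecmp() stands in for the kernel's strnicmp()): a two-character prefix comparison makes every option that merely begins with "rw" — "rwpidforward" included — look like the "rw" option, which is what the added strlen() check rules out:

#include <stdio.h>
#include <string.h>
#include <strings.h>

static int is_rw_old(const char *data)
{
	return strncasecmp(data, "rw", 2) == 0;			/* too permissive */
}

static int is_rw_new(const char *data)
{
	return strncasecmp(data, "rw", 2) == 0 && strlen(data) == 2;
}

int main(void)
{
	printf("old: rwpidforward -> %d\n", is_rw_old("rwpidforward"));	/* 1 */
	printf("new: rwpidforward -> %d\n", is_rw_new("rwpidforward"));	/* 0 */
	printf("new: rw           -> %d\n", is_rw_new("rw"));		/* 1 */
	return 0;
}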
Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e0ea721..2451627 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1258,7 +1258,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* ignore */ } else if (strnicmp(data, "guest", 5) == 0) { /* ignore */ - } else if (strnicmp(data, "rw", 2) == 0) { + } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { /* ignore */ } else if (strnicmp(data, "ro", 2) == 0) { /* ignore */ @@ -1361,7 +1361,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->server_ino = 1; } else if (strnicmp(data, "noserverino", 9) == 0) { vol->server_ino = 0; - } else if (strnicmp(data, "rwpidforward", 4) == 0) { + } else if (strnicmp(data, "rwpidforward", 12) == 0) { vol->rwpidforward = 1; } else if (strnicmp(data, "cifsacl", 7) == 0) { vol->cifs_acl = 1; -- cgit v1.1 From a19dcc747653657eeea06b417d8e4593f8f44536 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 20 Sep 2011 15:19:38 -0700 Subject: make /proc/$pid/numa_maps gather_stats() take variable page size commit eb4866d0066ffd5446751c102d64feb3318d8bd1 upstream. We need to teach the numa_maps code about transparent huge pages. The first step is to teach gather_stats() that the pte it is dealing with might represent more than one page. Note that will we use this in a moment for transparent huge pages since they have use a single pmd_t which _acts_ as a "surrogate" for a bunch of smaller pte_t's. I'm a _bit_ unhappy that this interface counts in hugetlbfs page sizes for hugetlbfs pages and PAGE_SIZE for normal pages. That means that to figure out how many _bytes_ "dirty=1" means, you must first know the hugetlbfs page size. That's easier said than done especially if you don't have visibility in to the mount. But, that's probably a discussion for another day especially since it would change behavior to fix it. But, just in case anyone wonders why this patch only passes a '1' in the hugetlb case... 
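To make that interpretation point concrete — a hypothetical userspace helper, not something the patch or the kernel provides: converting a /proc/$pid/numa_maps counter such as dirty=N into bytes requires knowing which page size it was counted in, the hugetlbfs page size for hugetlbfs mappings and PAGE_SIZE for everything else.

/* Hypothetical helper: page_size is sysconf(_SC_PAGESIZE); hugepage_size
 * must be discovered separately, e.g. from the hugetlbfs mount in use. */
static unsigned long long numa_maps_count_to_bytes(unsigned long long count,
						   int is_hugetlb,
						   unsigned long long hugepage_size,
						   unsigned long long page_size)
{
	return count * (is_hugetlb ? hugepage_size : page_size);
}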
Signed-off-by: Dave Hansen Acked-by: Hugh Dickins Acked-by: David Rientjes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 25b6a88..61342a4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -877,30 +877,31 @@ struct numa_maps_private { struct numa_maps md; }; -static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) { int count = page_mapcount(page); - md->pages++; + md->pages += nr_pages; if (pte_dirty || PageDirty(page)) - md->dirty++; + md->dirty += nr_pages; if (PageSwapCache(page)) - md->swapcache++; + md->swapcache += nr_pages; if (PageActive(page) || PageUnevictable(page)) - md->active++; + md->active += nr_pages; if (PageWriteback(page)) - md->writeback++; + md->writeback += nr_pages; if (PageAnon(page)) - md->anon++; + md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)]++; + md->node[page_to_nid(page)] += nr_pages; } static int gather_pte_stats(pmd_t *pmd, unsigned long addr, @@ -931,7 +932,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (!node_isset(nid, node_states[N_HIGH_MEMORY])) continue; - gather_stats(page, md, pte_dirty(*pte)); + gather_stats(page, md, pte_dirty(*pte), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); @@ -952,7 +953,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, return 0; md = walk->private; - gather_stats(page, md, pte_dirty(*pte)); + gather_stats(page, md, pte_dirty(*pte), 1); return 0; } -- cgit v1.1 From b879781180172696903fac604ba44c3204e0c1e4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 20 Sep 2011 15:19:39 -0700 Subject: break out numa_maps gather_pte_stats() checks commit 3200a8aaab0c9ccdc0f59b0dac2d4a47029137fa upstream. gather_pte_stats() does a number of checks on a target page to see whether it should even be considered for statistics. This breaks that code out in to a separate function so that we can use it in the transparent hugepage case in the next patch. 
Signed-off-by: Dave Hansen Acked-by: Hugh Dickins Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 61342a4..9dca07e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -904,6 +904,29 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, md->node[page_to_nid(page)] += nr_pages; } +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + return NULL; + + return page; +} + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -915,23 +938,9 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { - struct page *page; - int nid; - - if (!pte_present(*pte)) - continue; - - page = vm_normal_page(md->vma, addr, *pte); + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); if (!page) continue; - - if (PageReserved(page)) - continue; - - nid = page_to_nid(page); - if (!node_isset(nid, node_states[N_HIGH_MEMORY])) - continue; - gather_stats(page, md, pte_dirty(*pte), 1); } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit v1.1 From e2d598ab82b589fc40c4656a10004e328a90b6dd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 20 Sep 2011 15:19:41 -0700 Subject: teach /proc/$pid/numa_maps about transparent hugepages commit 32ef43848f283e0ef945d3c67e851c143fea3970 upstream. This is modeled after the smaps code. It detects transparent hugepages and then does a single gather_stats() for the page as a whole. This has two benifits: 1. It is more efficient since it does many pages in a single shot. 2. It does not have to break down the huge page. 
Signed-off-by: Dave Hansen Acked-by: Hugh Dickins Acked-by: David Rientjes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9dca07e..5afaa58 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -936,6 +936,26 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *pte; md = walk->private; + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(md->vma->anon_vma, pmd); + } else { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, md->vma, addr); -- cgit v1.1 From ac693061b11c33d5a5c5ec1925de7abd3fcb0971 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 6 Jun 2010 10:38:15 -0600 Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync stage commit 6e6938b6d3130305a5960c86b1a9b21e58cf6144 upstream. sync(2) is performed in two stages: the WB_SYNC_NONE sync and the WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and do livelock prevention for it, too. Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance using page tagging") is a partial fix in that it only fixed the WB_SYNC_ALL phase livelock. Although ext4 is tested to no longer livelock with commit f446daaea9, it may due to some "redirty_tail() after pages_skipped" effect which is by no means a guarantee for _all_ the file systems. Note that writeback_inodes_sb() is called by not only sync(), they are treated the same because the other callers also need livelock prevention. Impact: It changes the order in which pages/inodes are synced to disk. Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode until finished with the current inode. 
Acked-by: Jan Kara CC: Dave Chinner Signed-off-by: Wu Fengguang Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 4 ++-- fs/fs-writeback.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b864839..c94774c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2756,7 +2756,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2988,7 +2988,7 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); while (!ret && wbc->nr_to_write > 0) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f015a0..5ed2ce9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,6 +36,7 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -650,6 +651,7 @@ static long wb_writeback(struct bdi_writeback *wb, { struct writeback_control wbc = { .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, .older_than_this = NULL, .for_kupdate = work->for_kupdate, .for_background = work->for_background, @@ -657,7 +659,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; - long write_chunk; + long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; if (wbc.for_kupdate) { @@ -683,9 +685,7 @@ static long wb_writeback(struct bdi_writeback *wb, * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ @@ -1188,10 +1188,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); -- cgit v1.1 From cfdf7986b6398049c35f8eb6e236d2387ee7ae14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 27 Apr 2011 19:05:21 -0600 Subject: writeback: update dirtied_when for synced inode to prevent livelock commit 94c3dcbb0b0cdfd82cedd21705424d8044edc42c upstream. Explicitly update .dirtied_when on synced inodes, so that they are no longer considered for writeback in the next round. It can prevent both of the following livelock schemes: - while true; do echo data >> f; done - while true; do touch f; done (in theory) The exact livelock condition is, during sync(1): (1) no new inodes are dirtied (2) an inode being actively dirtied On (2), the inode will be tagged and synced with .nr_to_write=LONG_MAX. When finished, it will be redirty_tail()ed because it's still dirty and (.nr_to_write > 0). redirty_tail() won't update its ->dirtied_when on condition (1). 
The sync work will then revisit it on the next queue_io() and find it eligible again because its old ->dirtied_when predates the sync work start time. We'll do more aggressive "keep writeback as long as we wrote something" logic in wb_writeback(). The "use LONG_MAX .nr_to_write" trick in commit b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") will no longer be enough to stop sync livelock. Reviewed-by: Jan Kara Signed-off-by: Wu Fengguang Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5ed2ce9..fe190a8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -419,6 +419,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() -- cgit v1.1 From 890ecd3d30e0ba8b1b676eaf6c925f65483f5f0d Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Sun, 18 Sep 2011 10:20:46 -0400 Subject: btrfs: fix d_off in the first dirent commit 3765fefaee2da83f10829fa64a74e6b7360350cb upstream. Since the d_off in the first dirent for "." (that originates from the 4th argument "offset" of filldir() for the 2nd dirent for "..") is wrongly assigned in btrfs_real_readdir(), telldir returns same offset for different locations. | # mkfs.btrfs /dev/sdb1 | # mount /dev/sdb1 fs0 | # cd fs0 | # touch file0 file1 | # ../test | telldir: 0 | readdir: d_off = 2, d_name = "." | telldir: 2 | readdir: d_off = 2, d_name = ".." | telldir: 2 | readdir: d_off = 3, d_name = "file0" | telldir: 3 | readdir: d_off = 2147483647, d_name = "file1" | telldir: 2147483647 To fix this problem, pass filp->f_pos (which is loff_t) instead. | # ../test | telldir: 0 | readdir: d_off = 1, d_name = "." | telldir: 1 | readdir: d_off = 2, d_name = ".." | telldir: 2 | readdir: d_off = 3, d_name = "file0" : At the moment the "offset" for "." is unused because there is no preceding dirent, however it is better to pass filp->f_pos to follow grammatical usage. Signed-off-by: Hidetoshi Seto Signed-off-by: Chris Mason Cc: Grazvydas Ignotas Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3601f0a..d42e6bf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4124,7 +4124,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, /* special case for "." 
*/ if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR); + over = filldir(dirent, ".", 1, + filp->f_pos, btrfs_ino(inode), DT_DIR); if (over) return 0; filp->f_pos = 1; @@ -4133,7 +4134,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, if (filp->f_pos == 1) { u64 pino = parent_ino(filp->f_path.dentry); over = filldir(dirent, "..", 2, - 2, pino, DT_DIR); + filp->f_pos, pino, DT_DIR); if (over) return 0; filp->f_pos = 2; -- cgit v1.1 From 2b7eea63de50d738ae12a1bf84b76ef91c007a0e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 Jul 2011 16:08:41 -0700 Subject: exec: do not call request_module() twice from search_binary_handler() commit 912193521b719fbfc2f16776febf5232fe8ba261 upstream. Currently, search_binary_handler() tries to load binary loader module using request_module() if a loader for the requested program is not yet loaded. But second attempt of request_module() does not affect the result of search_binary_handler(). If request_module() triggered recursion, calling request_module() twice causes 2 to the power of MAX_KMOD_CONCURRENT (= 50) repetitions. It is not an infinite loop but is sufficient for users to consider as a hang up. Therefore, this patch changes not to call request_module() twice, making 1 to the power of MAX_KMOD_CONCURRENT repetitions in case of recursion. Signed-off-by: Tetsuo Handa Reported-by: Richard Weinberger Tested-by: Richard Weinberger Cc: Al Viro Signed-off-by: Andrew Morton Cc: Maxim Uvarov Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6075a1e..044c13f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1411,6 +1411,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) printable(bprm->buf[2]) && printable(bprm->buf[3])) break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); #endif } -- cgit v1.1 From 37b2b419970e9b380a07c15d60a26987ce09ad15 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 Sep 2011 09:38:03 +0200 Subject: fuse: fix memory leak commit 5dfcc87fd79dfb96ed155b524337dbd0da4f5993 upstream. kmemleak is reporting that 32 bytes are being leaked by FUSE: unreferenced object 0xe373b270 (size 32): comm "fusermount", pid 1207, jiffies 4294707026 (age 2675.187s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x27/0x50 [] kmem_cache_alloc+0xc5/0x180 [] fuse_alloc_forget+0x1e/0x20 [] fuse_alloc_inode+0xb0/0xd0 [] alloc_inode+0x1c/0x80 [] iget5_locked+0x8f/0x1a0 [] fuse_iget+0x72/0x1a0 [] fuse_get_root_inode+0x8a/0x90 [] fuse_fill_super+0x3ef/0x590 [] mount_nodev+0x3f/0x90 [] fuse_mount+0x15/0x20 [] mount_fs+0x1c/0xc0 [] vfs_kern_mount+0x41/0x90 [] do_kern_mount+0x39/0xd0 [] do_mount+0x2e5/0x660 [] sys_mount+0x66/0xa0 This leak report is consistent and happens once per boot on 3.1.0-rc5-dirty. This happens if a FORGET request is queued after the fuse device was released. 
Reported-by: Sitsofe Wheeler Signed-off-by: Miklos Szeredi Tested-by: Sitsofe Wheeler Signed-off-by: Linus Torvalds Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 168a80f..5cb8614 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nlookup = nlookup; spin_lock(&fc->lock); - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } spin_unlock(&fc->lock); } -- cgit v1.1 From c53c89aba3ebdfc3e9acdb18bb5ee9d2f8a328d0 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Mon, 18 Jul 2011 08:06:23 -0700 Subject: hfsplus: ensure bio requests are not smaller than the hardware sectors commit 6596528e391ad978a6a120142cba97a1d7324cb6 upstream. Currently all bio requests are 512 bytes, which may fail for media whose physical sector size is larger than this. Ensure these requests are not smaller than the block device logical block size. BugLink: http://bugs.launchpad.net/bugs/734883 Signed-off-by: Seth Forshee Signed-off-by: Christoph Hellwig Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/hfsplus_fs.h | 16 ++++++++-- fs/hfsplus/part_tbl.c | 32 ++++++++++--------- fs/hfsplus/super.c | 12 +++---- fs/hfsplus/wrapper.c | 83 +++++++++++++++++++++++++++++++++++++------------ 4 files changed, 101 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d685752..4e7f64b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -110,7 +111,9 @@ struct hfsplus_vh; struct hfs_btree; struct hfsplus_sb_info { + void *s_vhdr_buf; struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; struct hfsplus_vh *s_backup_vhdr; struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; @@ -258,6 +261,15 @@ struct hfsplus_readdir_data { struct hfsplus_cat_key key; }; +/* + * Find minimum acceptible I/O size for an hfsplus sb. 
+ */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write @@ -436,8 +448,8 @@ int hfsplus_compare_dentry(const struct dentry *parent, /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); int hfs_part_find(struct super_block *, sector_t *, sector_t *); -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 40ad88c..eb355d8 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -88,11 +88,12 @@ static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, return -ENOENT; } -static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, - sector_t *part_start, sector_t *part_size) +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); int res; int i = 0; @@ -107,11 +108,14 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, if (++i >= size) return -ENOENT; - res = hfsplus_submit_bio(sb->s_bdev, - *part_start + HFS_PMAP_BLK + i, - pm, READ); - if (res) - return res; + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); return -ENOENT; @@ -124,15 +128,15 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { - void *data; + void *buf, *data; int res; - data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!data) + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) return -ENOMEM; - res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, - data, READ); + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); if (res) goto out; @@ -141,13 +145,13 @@ int hfs_part_find(struct super_block *sb, res = hfs_parse_old_pmap(sb, data, part_start, part_size); break; case HFS_NEW_PMAP_MAGIC: - res = hfs_parse_new_pmap(sb, data, part_start, part_size); + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); break; default: res = -ENOENT; break; } out: - kfree(data); + kfree(buf); return res; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 84a47b7..ab4857b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -197,17 +197,17 @@ int hfsplus_sync_fs(struct super_block *sb, int wait) write_backup = 1; } - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, WRITE_SYNC); + sbi->s_vhdr_buf, NULL, WRITE_SYNC); if (!error) error = error2; if (!write_backup) goto out; - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr, WRITE_SYNC); + sbi->s_backup_vhdr_buf, NULL, 
WRITE_SYNC); if (!error) error2 = error; out: @@ -251,8 +251,8 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); iput(sbi->hidden_dir); - kfree(sbi->s_vhdr); - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 4ac88ff..e3881a1 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -31,25 +31,67 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) complete(bio->bi_private); } -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw) +/* + * hfsplus_submit_bio - Perfrom block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. + */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) { DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; + unsigned int io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. + */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; - bio->bi_bdev = bdev; + bio->bi_bdev = sb->s_bdev; bio->bi_end_io = hfsplus_end_io_sync; bio->bi_private = &wait; - /* - * We always submit one sector at a time, so bio_add_page must not fail. - */ - if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE, - offset_in_page(data)) != HFSPLUS_SECTOR_SIZE) - BUG(); + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } submit_bio(rw, bio); wait_for_completion(&wait); @@ -57,8 +99,9 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; +out: bio_put(bio); - return ret; + return ret < 0 ? 
ret : 0; } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) @@ -147,17 +190,17 @@ int hfsplus_read_wrapper(struct super_block *sb) } error = -ENOMEM; - sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_vhdr) + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) goto out; - sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_backup_vhdr) + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) goto out_free_vhdr; reread: - error = hfsplus_submit_bio(sb->s_bdev, - part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); if (error) goto out_free_backup_vhdr; @@ -186,9 +229,9 @@ reread: goto reread; } - error = hfsplus_submit_bio(sb->s_bdev, - part_start + part_size - 2, - sbi->s_backup_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); if (error) goto out_free_backup_vhdr; -- cgit v1.1 From f24d5457bd706667951021530dd0590a1f3e9464 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sun, 21 Aug 2011 19:30:15 +0400 Subject: CIFS: Fix ERR_PTR dereference in cifs_get_root commit 5b980b01212199833ee8023770fa4cbf1b85e9f4 upstream. Move the negative dentry check to the beginning of the loop. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fc7e57b..53e7d72 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -566,6 +566,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) struct inode *dir = dentry->d_inode; struct dentry *child; + if (!dir) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + break; + } + /* skip separators */ while (*s == sep) s++; @@ -581,10 +587,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) mutex_unlock(&dir->i_mutex); dput(dentry); dentry = child; - if (!dentry->d_inode) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - } } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); -- cgit v1.1 From eedc6389bb43769e82f2779dabd45865d2b892e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:15 -0400 Subject: xfs: start periodic workers later commit 2bcf6e970f5a88fa05dced5eeb0326e13d93c4a1 upstream Start the periodic sync workers only after we have finished xfs_mountfs and thus fully set up the filesystem structures. Without this we can call into xfs_qm_sync before the quotainfo structure is set up if the mount takes unusually long, and probably hit other incomplete states as well. Also clean up the xfs_fs_fill_super error path by using consistent label names, and removing an impossible to reach case.
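The label cleanup mentioned above is the usual goto-unwind idiom: the error labels run in exact reverse order of acquisition, and each label names the next resource that still has to be released, so adding a setup step only adds one label. A generic, self-contained sketch of that style, with made-up resources rather than the XFS ones:

#include <stdio.h>
#include <stdlib.h>

static int setup_everything(void)
{
	char *bufa, *bufb;
	FILE *log;
	int error;

	bufa = malloc(64);
	if (!bufa) {
		error = -1;
		goto out;
	}

	bufb = malloc(64);
	if (!bufb) {
		error = -2;
		goto out_free_bufa;
	}

	log = fopen("/tmp/example.log", "w");
	if (!log) {
		error = -3;
		goto out_free_bufb;
	}

	/* Success. (A real setup function would hand these resources on to
	 * its caller; the sketch just releases them again.) */
	fclose(log);
	free(bufb);
	free(bufa);
	return 0;

	/* Unwind in reverse order of acquisition; each label names the first
	 * thing that still needs undoing at that point. */
out_free_bufb:
	free(bufb);
out_free_bufa:
	free(bufa);
out:
	return error;
}

int main(void)
{
	return setup_everything() ? EXIT_FAILURE : EXIT_SUCCESS;
}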
Signed-off-by: Christoph Hellwig Reported-by: Arkadiusz Miskiewicz Reviewed-by: Alex Elder Signed-off-by: Greg Kroah-Hartman --- fs/xfs/linux-2.6/xfs_super.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a1a881e..3ebb458 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1412,37 +1412,35 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; - xfs_inode_shrinker_register(mp); error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; + + error = xfs_syncd_init(mp); + if (error) + goto out_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto fail_unmount; + goto out_syncd_stop; } if (is_bad_inode(root)) { error = EINVAL; - goto fail_vnrele; + goto out_syncd_stop; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { error = ENOMEM; - goto fail_vnrele; + goto out_iput; } return 0; - out_syncd_stop: - xfs_inode_shrinker_unregister(mp); - xfs_syncd_stop(mp); out_filestream_unmount: + xfs_inode_shrinker_unregister(mp); xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1456,17 +1454,12 @@ xfs_fs_fill_super( out: return -error; - fail_vnrele: - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } else { - iput(root); - } - - fail_unmount: - xfs_inode_shrinker_unregister(mp); + out_iput: + iput(root); + out_syncd_stop: xfs_syncd_stop(mp); + out_unmount: + xfs_inode_shrinker_unregister(mp); /* * Blow away any referenced inode in the filestreams cache. -- cgit v1.1 From f5d5ee3686ab928549370e4e584a2c32ef708a37 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 18 Oct 2011 10:23:16 -0400 Subject: xfs: use a cursor for bulk AIL insertion commit 1d8c95a363bf8cd4d4182dd19c01693b635311c2 upstream xfs: use a cursor for bulk AIL insertion Delayed logging can insert tens of thousands of log items into the AIL at the same LSN. When the committing of log commit records occur, we can get insertions occurring at an LSN that is not at the end of the AIL. If there are thousands of items in the AIL on the tail LSN, each insertion has to walk the AIL to find the correct place to insert the new item into the AIL. This can consume large amounts of CPU time and block other operations from occurring while the traversals are in progress. To avoid this repeated walk, use a AIL cursor to record where we should be inserting the new items into the AIL without having to repeat the walk. The cursor infrastructure already provides this functionality for push walks, so is a simple extension of existing code. While this will not avoid the initial walk, it will avoid repeating it tens of thousands of times during a single checkpoint commit. This version includes logic improvements from Christoph Hellwig. 
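Put differently, the cursor simply remembers where the previous insertion landed, so a burst of insertions at one LSN does not re-walk the whole sorted list each time. A much simplified userspace sketch of a cursor-assisted insert into an LSN-sorted list (names are made up; the real xfs_trans_ail.c version also has to handle locking and cursor invalidation):

#include <stdio.h>
#include <stdlib.h>

struct item {
	long lsn;
	struct item *prev, *next;
};

struct ail {
	struct item head;     /* circular sentinel; list sorted by ascending lsn */
	struct item *cursor;  /* most recently inserted item, or NULL */
};

static void ail_init(struct ail *a)
{
	a->head.prev = a->head.next = &a->head;
	a->cursor = NULL;
}

/* Insert after the last item whose lsn is <= the new item's lsn. */
static void ail_insert(struct ail *a, struct item *n)
{
	struct item *pos = a->cursor;

	/* Only trust the cursor for repeated inserts at the same lsn;
	 * otherwise walk backwards from the tail to find the spot. */
	if (!pos || pos->lsn != n->lsn)
		for (pos = a->head.prev;
		     pos != &a->head && pos->lsn > n->lsn;
		     pos = pos->prev)
			;

	n->next = pos->next;
	n->prev = pos;
	pos->next->prev = n;
	pos->next = n;
	a->cursor = n;          /* the next equal-lsn insert starts here */
}

int main(void)
{
	struct ail a;
	long lsns[] = { 10, 20, 20, 20, 30 };
	struct item *it;
	size_t i;

	ail_init(&a);
	for (i = 0; i < sizeof(lsns) / sizeof(lsns[0]); i++) {
		it = calloc(1, sizeof(*it));
		it->lsn = lsns[i];
		ail_insert(&a, it);
	}
	for (it = a.head.next; it != &a.head; it = it->next)
		printf("%ld\n", it->lsn);   /* prints 10 20 20 20 30 */
	return 0;
}

The first insert at a new LSN still pays for one walk from the tail; every further insert at that LSN is constant time, which is the saving the commit describes for checkpoints that insert thousands of items at the same LSN.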
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans.c | 27 ++++++++++-- fs/xfs/xfs_trans_ail.c | 109 ++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_trans_priv.h | 10 +++-- 3 files changed, 118 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c83f63b..efc147f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1426,6 +1426,7 @@ xfs_trans_committed( static inline void xfs_log_item_batch_insert( struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t commit_lsn) @@ -1434,7 +1435,7 @@ xfs_log_item_batch_insert( spin_lock(&ailp->xa_lock); /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ - xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); for (i = 0; i < nr_items; i++) IOP_UNPIN(log_items[i], 0); @@ -1452,6 +1453,13 @@ xfs_log_item_batch_insert( * as an iclog write error even though we haven't started any IO yet. Hence in * this case all we need to do is IOP_COMMITTED processing, followed by an * IOP_UNPIN(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. */ void xfs_trans_committed_bulk( @@ -1463,8 +1471,13 @@ xfs_trans_committed_bulk( #define LOG_ITEM_BATCH_SIZE 32 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; int i = 0; + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + /* unpin all the log items */ for (lv = log_vector; lv; lv = lv->lv_next ) { struct xfs_log_item *lip = lv->lv_item; @@ -1493,7 +1506,9 @@ xfs_trans_committed_bulk( /* * Not a bulk update option due to unusual item_lsn. * Push into AIL immediately, rechecking the lsn once - * we have the ail lock. Then unpin the item. + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. */ spin_lock(&ailp->xa_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) @@ -1507,7 +1522,7 @@ xfs_trans_committed_bulk( /* Item is a candidate for bulk AIL insert. */ log_items[i++] = lv->lv_item; if (i >= LOG_ITEM_BATCH_SIZE) { - xfs_log_item_batch_insert(ailp, log_items, + xfs_log_item_batch_insert(ailp, &cur, log_items, LOG_ITEM_BATCH_SIZE, commit_lsn); i = 0; } @@ -1515,7 +1530,11 @@ xfs_trans_committed_bulk( /* make sure we insert the remainder! */ if (i) - xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 5fc2380..9a69dc0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -272,9 +272,9 @@ xfs_trans_ail_cursor_clear( } /* - * Return the item in the AIL with the current lsn. - * Return the current tree generation number for use - * in calls to xfs_trans_next_ail(). 
+ * Initialise the cursor to the first item in the AIL with the given @lsn. + * This searches the list from lowest LSN to highest. Pass a @lsn of zero + * to initialise the cursor to the first item in the AIL. */ xfs_log_item_t * xfs_trans_ail_cursor_first( @@ -300,31 +300,97 @@ out: } /* - * splice the log item list into the AIL at the given LSN. + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. If there is no item with + * the value of @lsn, then it sets the cursor to the last item with an LSN lower + * than @lsn. + */ +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. */ static void xfs_ail_splice( - struct xfs_ail *ailp, - struct list_head *list, - xfs_lsn_t lsn) + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) { - xfs_log_item_t *next_lip; + struct xfs_log_item *lip = cur ? cur->item : NULL; + struct xfs_log_item *next_lip; - /* If the list is empty, just insert the item. */ - if (list_empty(&ailp->xa_ail)) { - list_splice(list, &ailp->xa_ail); - return; + /* + * Get a new cursor if we don't have a placeholder or the existing one + * has been invalidated. + */ + if (!lip || (__psint_t)lip & 1) { + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + if (!lip) { + /* The list is empty, so just splice and return. */ + if (cur) + cur->item = NULL; + list_splice(list, &ailp->xa_ail); + return; + } } - list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) - break; + /* + * Our cursor points to the item we want to insert _after_, so we have + * to update the cursor to point to the end of the list we are splicing + * in so that it points to the correct location for the next splice. + * i.e. before the splice + * + * lsn -> lsn -> lsn + x -> lsn + x ... + * ^ + * | cursor points here + * + * After the splice we have: + * + * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... + * ^ ^ + * | cursor points here | needs to move here + * + * So we set the cursor to the last item in the list to be spliced + * before we execute the splice, resulting in the cursor pointing to + * the correct item after the splice occurs. 
+ */ + if (cur) { + next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); + cur->item = next_lip; } - - ASSERT(&next_lip->li_ail == &ailp->xa_ail || - XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0); - - list_splice_init(list, &next_lip->li_ail); + list_splice(list, &lip->li_ail); } /* @@ -645,6 +711,7 @@ xfs_trans_unlocked_item( void xfs_trans_ail_update_bulk( struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock) @@ -674,7 +741,7 @@ xfs_trans_ail_update_bulk( list_add(&lip->li_ail, &tmp); } - xfs_ail_splice(ailp, &tmp, lsn); + xfs_ail_splice(ailp, cur, &tmp, lsn); if (!mlip_changed) { spin_unlock(&ailp->xa_lock); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 6b164e9..c0cb408 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -82,6 +82,7 @@ struct xfs_ail { extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock); static inline void @@ -90,7 +91,7 @@ xfs_trans_ail_update( struct xfs_log_item *lip, xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, @@ -111,10 +112,13 @@ xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); -struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn); -struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp, +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); -- cgit v1.1 From 838599d118dd286b831ba45ae380c5870ff82fe9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:17 -0400 Subject: xfs: do not update xa_last_pushed_lsn for locked items commit bc6e588a8971aa74c02e42db4d6e0248679f3738 upstream If an item was locked we should not update xa_last_pushed_lsn and thus skip it when restarting the AIL scan as we need to be able to lock and write it out as soon as possible. Otherwise heavy lock contention might starve AIL pushing too easily, especially given the larger backoff once we moved xa_last_pushed_lsn all the way to the target lsn. 
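The bookkeeping rule, in isolation: the saved resume point may only move past items that were actually handled, so a skipped (locked) item stays in range and is retried on the next scan. A toy sketch of that rule with invented item states (not the xfs_ail_worker code):

#include <stdio.h>

enum push_result { ITEM_SUCCESS, ITEM_LOCKED };

struct work_item {
	long lsn;
	enum push_result result;
};

/*
 * Scan items in LSN order and return the new resume point. A locked item is
 * counted as stuck and deliberately does not advance the resume point, so a
 * restarted scan will retry it instead of skipping ahead.
 */
static long scan(const struct work_item *items, int nr, long resume, int *stuck)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (items[i].lsn <= resume)
			continue;
		if (items[i].result == ITEM_SUCCESS)
			resume = items[i].lsn;
		else                    /* ITEM_LOCKED */
			(*stuck)++;     /* resume is left where it was */
	}
	return resume;
}

int main(void)
{
	const struct work_item items[] = {
		{ 1, ITEM_SUCCESS }, { 2, ITEM_SUCCESS }, { 3, ITEM_LOCKED },
	};
	int stuck = 0;
	long resume = scan(items, 3, 0, &stuck);

	/* prints "resume=2 stuck=1": the locked item at lsn 3 will be retried */
	printf("resume=%ld stuck=%d\n", resume, stuck);
	return 0;
}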
Signed-off-by: Christoph Hellwig Reported-by: Stefan Priebe Tested-by: Stefan Priebe Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans_ail.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9a69dc0..4b74b88 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -491,7 +491,6 @@ xfs_ail_worker( case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); - ailp->xa_last_pushed_lsn = lsn; stuck++; break; -- cgit v1.1 From e7bde7c73957cfda82963127421069d13a44e921 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:18 -0400 Subject: xfs: force the log if we encounter pinned buffers in .iop_pushbuf commit 17b38471c3c07a49f0bbc2ecc2e92050c164e226 upstream We need to check for pinned buffers even in .iop_pushbuf given that inode items flush into the same buffers that may be pinned directly due operations on the unlinked inode list operating directly on buffers. To do this add a return value to .iop_pushbuf that tells the AIL push about this and use the existing log force mechanisms to unpin it. Signed-off-by: Christoph Hellwig Reported-by: Stefan Priebe Tested-by: Stefan Priebe Signed-off-by: Greg Kroah-Hartman --- fs/xfs/quota/xfs_dquot_item.c | 10 +++++++--- fs/xfs/xfs_buf_item.c | 3 ++- fs/xfs/xfs_inode_item.c | 10 +++++++--- fs/xfs/xfs_trans.h | 2 +- fs/xfs/xfs_trans_ail.c | 9 +++++++-- 5 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 9e0e2fa..8126fc2 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait( * search the buffer cache can be a time consuming thing, and AIL lock is a * spinlock. */ -STATIC void +STATIC bool xfs_qm_dquot_logitem_pushbuf( struct xfs_log_item *lip) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_buf *bp; + bool ret = true; ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf( if (completion_done(&dqp->q_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_dqunlock(dqp); - return; + return true; } bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7b7e005..a7342e8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -632,7 +632,7 @@ xfs_buf_item_push( * the xfsbufd to get this buffer written. We have to unlock the buffer * to allow the xfsbufd to write it, too. */ -STATIC void +STATIC bool xfs_buf_item_pushbuf( struct xfs_log_item *lip) { @@ -646,6 +646,7 @@ xfs_buf_item_pushbuf( xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); + return true; } STATIC void diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b1e88d5..391044c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -713,13 +713,14 @@ xfs_inode_item_committed( * marked delayed write. If that's the case, we'll promote it and that will * allow the caller to write the buffer by triggering the xfsbufd to run. 
*/ -STATIC void +STATIC bool xfs_inode_item_pushbuf( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; struct xfs_buf *bp; + bool ret = true; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); @@ -730,7 +731,7 @@ xfs_inode_item_pushbuf( if (completion_done(&ip->i_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); - return; + return true; } bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, @@ -738,10 +739,13 @@ xfs_inode_item_pushbuf( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759..53597f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -350,7 +350,7 @@ typedef struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_push)(xfs_log_item_t *); - void (*iop_pushbuf)(xfs_log_item_t *); + bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); } xfs_item_ops_t; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 4b74b88..afc4aa0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -478,8 +478,13 @@ xfs_ail_worker( case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); - IOP_PUSHBUF(lip); - ailp->xa_last_pushed_lsn = lsn; + + if (!IOP_PUSHBUF(lip)) { + stuck++; + flush_log = 1; + } else { + ailp->xa_last_pushed_lsn = lsn; + } push_xfsbufd = 1; break; -- cgit v1.1 From c7eead1e118fb7e34ee8f5063c3c090c054c3820 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:19 -0400 Subject: xfs: revert to using a kthread for AIL pushing commit 0030807c66f058230bcb20d2573bcaf28852e804 upstream Currently we have a few issues with the way the workqueue code is used to implement AIL pushing: - it accidentally uses the same workqueue as the syncer action, and thus can be prevented from running if there are enough sync actions active in the system. - it doesn't use the HIGHPRI flag to queue at the head of the queue of work items At this point I'm not confident enough in getting all the workqueue flags and tweaks right to provide a perfectly reliable execution context for AIL pushing, which is the most important piece in XFS to make forward progress when the log fills. Revert back to use a kthread per filesystem which fixes all the above issues at the cost of having a task struct and stack around for each mounted filesystem. In addition this also gives us much better ways to diagnose any issues involving hung AIL pushing and removes a small amount of code. 
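Stripped of the XFS specifics, the patch below is the standard kthread pattern: kthread_run() to start the thread, a loop that sleeps until there is work or kthread_should_stop() becomes true, and kthread_stop() at teardown. A minimal, generic module sketch of that pattern, with made-up names and an arbitrary sleep interval (it builds only as a kernel module and is not the xfsaild code):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/err.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
	/* Sleep, wake, do a unit of work, until someone calls kthread_stop(). */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(msecs_to_jiffies(50));
		/* ...do one pass of background work here... */
	}
	return 0;
}

static int __init worker_init(void)
{
	worker = kthread_run(worker_fn, NULL, "example_worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);
	return 0;
}

static void __exit worker_exit(void)
{
	kthread_stop(worker);   /* wakes the thread and waits for it to exit */
}

module_init(worker_init);
module_exit(worker_exit);
MODULE_LICENSE("GPL");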
Signed-off-by: Christoph Hellwig Reported-by: Stefan Priebe Tested-by: Stefan Priebe Signed-off-by: Greg Kroah-Hartman --- fs/xfs/linux-2.6/xfs_linux.h | 2 ++ fs/xfs/linux-2.6/xfs_super.c | 13 +------- fs/xfs/xfs_trans_ail.c | 73 +++++++++++++++++++++++++------------------- fs/xfs/xfs_trans_priv.h | 8 +---- 4 files changed, 45 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 8633521..8731516 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -70,6 +70,8 @@ #include #include #include +#include +#include #include #include diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 3ebb458..347cae9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1660,24 +1660,13 @@ xfs_init_workqueues(void) */ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); if (!xfs_syncd_wq) - goto out; - - xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); - if (!xfs_ail_wq) - goto out_destroy_syncd; - + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); -out: - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { - destroy_workqueue(xfs_ail_wq); destroy_workqueue(xfs_syncd_wq); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index afc4aa0..a4c281b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,8 +28,6 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - #ifdef DEBUG /* * Check that the list is sorted as it should be. @@ -406,16 +404,10 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } -/* - * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself - * to run at a later time if there is more work to do to complete the push. - */ -STATIC void -xfs_ail_worker( - struct work_struct *work) +static long +xfsaild_push( + struct xfs_ail *ailp) { - struct xfs_ail *ailp = container_of(to_delayed_work(work), - struct xfs_ail, xa_work); xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor *cur = &ailp->xa_cursors; xfs_log_item_t *lip; @@ -556,20 +548,6 @@ out_done: /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; - /* - * We clear the XFS_AIL_PUSHING_BIT first before checking - * whether the target has changed. If the target has changed, - * this pushes the requeue race directly onto the result of the - * atomic test/set bit, so we are guaranteed that either the - * the pusher that changed the target or ourselves will requeue - * the work (but not both). - */ - clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); - smp_rmb(); - if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || - test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - return; - tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* @@ -592,9 +570,30 @@ out_done: tout = 20; } - /* There is more to do, requeue us. */ - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, - msecs_to_jiffies(tout)); + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? 
+ msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + try_to_freeze(); + + tout = xfsaild_push(ailp); + } + + return 0; } /* @@ -629,8 +628,9 @@ xfs_ail_push( */ smp_wmb(); xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); - if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); + smp_wmb(); + + wake_up_process(ailp->xa_task); } /* @@ -865,9 +865,18 @@ xfs_trans_ail_init( ailp->xa_mount = mp; INIT_LIST_HEAD(&ailp->xa_ail); spin_lock_init(&ailp->xa_lock); - INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + mp->m_ail = ailp; return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; } void @@ -876,6 +885,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - cancel_delayed_work_sync(&ailp->xa_work); + kthread_stop(ailp->xa_task); kmem_free(ailp); } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index c0cb408..fe2e3cb 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -64,23 +64,17 @@ struct xfs_ail_cursor { */ struct xfs_ail { struct xfs_mount *xa_mount; + struct task_struct *xa_task; struct list_head xa_ail; xfs_lsn_t xa_target; struct xfs_ail_cursor xa_cursors; spinlock_t xa_lock; - struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; - unsigned long xa_flags; }; -#define XFS_AIL_PUSHING_BIT 0 - /* * From xfs_trans_ail.c */ - -extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, -- cgit v1.1 From d2a0110b7fb182c2c81144b834ddc7b3d645fe1e Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 15 Sep 2011 10:48:27 -0400 Subject: hfsplus: Fix kfree of wrong pointers in hfsplus_fill_super() error path commit f588c960fcaa6fa8bf82930bb819c9aca4eb9347 upstream. Commit 6596528e391a ("hfsplus: ensure bio requests are not smaller than the hardware sectors") changed the pointers used for volume header allocations but failed to free the correct pointers in the error path of hfsplus_fill_super() and hfsplus_read_wrapper. The second hunk came from a separate patch by Pavel Ivanov. Reported-by: Pavel Ivanov Signed-off-by: Seth Forshee Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/super.c | 4 ++-- fs/hfsplus/wrapper.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ab4857b..c3a76fd 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -508,8 +508,8 @@ out_close_cat_tree: out_close_ext_tree: hfs_btree_close(sbi->ext_tree); out_free_vhdr: - kfree(sbi->s_vhdr); - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e3881a1..7b8112d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -275,9 +275,9 @@ reread: return 0; out_free_backup_vhdr: - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_backup_vhdr_buf); out_free_vhdr: - kfree(sbi->s_vhdr); + kfree(sbi->s_vhdr_buf); out: return error; } -- cgit v1.1
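The rule this fix restores follows from the earlier 6596528e391a change: s_vhdr and s_backup_vhdr may now point into the middle of the larger s_vhdr_buf and s_backup_vhdr_buf allocations, so only the *_buf pointers are ever valid arguments to kfree(). A userspace sketch of that buffer-versus-data split, with made-up names and sizes (not the hfsplus code):

#include <stdio.h>
#include <stdlib.h>

#define SECTOR_SIZE   512u
#define MIN_IO_SIZE  4096u  /* pretend logical block size of the device */

struct volume {
	void *vhdr_buf;  /* what was allocated; the only pointer free() may see */
	void *vhdr;      /* where the 512-byte header starts inside vhdr_buf */
};

static int read_header(struct volume *vol, unsigned long long sector)
{
	unsigned long long start = sector * SECTOR_SIZE;
	size_t offset = (size_t)(start & (MIN_IO_SIZE - 1)); /* distance from the aligned I/O */

	vol->vhdr_buf = malloc(MIN_IO_SIZE);
	if (!vol->vhdr_buf)
		return -1;
	/* a real implementation would read MIN_IO_SIZE bytes from the aligned position here */
	vol->vhdr = (char *)vol->vhdr_buf + offset;
	return 0;
}

int main(void)
{
	struct volume vol;

	if (read_header(&vol, 2))    /* sector 2: header sits 1024 bytes into the 4 KiB buffer */
		return EXIT_FAILURE;
	printf("buf=%p hdr=%p\n", vol.vhdr_buf, vol.vhdr);
	free(vol.vhdr_buf);          /* freeing vol.vhdr instead would corrupt the heap */
	return 0;
}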