From 6fd70fafa11fd88d5beee360395f4c0b28d7af43 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Wed, 3 Aug 2011 23:12:18 +0400 Subject: CIFS: Fix missing a decrement of inFlight value commit 0193e072268fe62c4b19ad4b05cd0d4b23c43bb9 upstream. if we failed on getting mid entry in cifs_call_async. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/transport.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 147aa22..c1b9c4b 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -362,6 +362,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, mid = AllocMidQEntry(hdr, server); if (mid == NULL) { mutex_unlock(&server->srv_mutex); + atomic_dec(&server->inFlight); + wake_up(&server->request_q); return -ENOMEM; } -- cgit v1.1 From cae28d950cc7319a7e88deccbb0c4492f0830e6e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Aug 2011 09:02:40 -0400 Subject: cifs: cope with negative dentries in cifs_get_root commit 80975d21aae2136ccae1ce914a1602dc1d8b0795 upstream. The loop around lookup_one_len doesn't handle the case where it might return a negative dentry, which can cause an oops on the next pass through the loop. Check for that and break out of the loop with an error of -ENOENT if there is one. Fixes the panic reported here: https://bugzilla.redhat.com/show_bug.cgi?id=727927 Reported-by: TR Bentley Reported-by: Iain Arnell Cc: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index bc4b12c..fc7e57b 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -581,6 +581,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) mutex_unlock(&dir->i_mutex); dput(dentry); dentry = child; + if (!dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + } } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); -- cgit v1.1 From 407529f61453bf0e683aa4584bfe2a5b864f1525 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Aug 2011 10:28:01 -0400 Subject: cifs: convert prefixpath delimiters in cifs_build_path_to_root commit f9e8c45002cacad536b338dfa9e910e341a49c31 upstream. Regression from 2.6.39... The delimiters in the prefixpath are not being converted based on whether posix paths are in effect. 
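For reference, the conversion that the fix below delegates to convert_delimiter() amounts to the following stand-alone sketch (illustration only, not the kernel helper itself): depending on which separator style is in effect, every occurrence of the other separator in the assembled path is rewritten.

	static void sketch_convert_delimiter(char *path, char new_sep)
	{
		char old_sep = (new_sep == '/') ? '\\' : '/';

		/* walk the whole path, including any DFS prefix, and
		 * normalize every separator to the chosen style */
		for (; *path; path++)
			if (*path == old_sep)
				*path = new_sep;
	}

With CIFS_MOUNT_POSIX_PATHS set the separator is '/', otherwise '\\', so after the patch the prefixpath ends up in the same style as the rest of the built path.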
Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=727834 Reported-and-Tested-by: Iain Arnell Reported-by: Patrick Oltmann Cc: Pavel Shilovsky Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b018c8..a7b2dcd 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, if (full_path == NULL) return full_path; - if (dfsplen) { + if (dfsplen) strncpy(full_path, tcon->treeName, dfsplen); - /* switch slash direction in prepath depending on whether - * windows or posix style path names - */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { - int i; - for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; - } - } - } strncpy(full_path + dfsplen, vol->prepath, pplen); + convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); full_path[dfsplen + pplen] = 0; /* add trailing null */ return full_path; } -- cgit v1.1 From ed60157d262ebc0a032362013c58665d490edeee Mon Sep 17 00:00:00 2001 From: John Johansen Date: Fri, 22 Jul 2011 08:14:15 -0700 Subject: Ecryptfs: Add mount option to check uid of device being mounted = expect uid commit 764355487ea220fdc2faf128d577d7f679b91f97 upstream. Close a TOCTOU race for mounts done via ecryptfs-mount-private. The mount source (device) can be raced when the ownership test is done in userspace. Provide Ecryptfs a means to force the uid check at mount time. Signed-off-by: John Johansen Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/main.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 9f1bb74..b4a6bef 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -175,6 +175,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { @@ -191,6 +192,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, + {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; @@ -236,6 +238,7 @@ static void ecryptfs_init_mount_crypt_stat( * ecryptfs_parse_options * @sb: The ecryptfs super block * @options: The options passed to the kernel + * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output @@ -251,7 +254,8 @@ static void ecryptfs_init_mount_crypt_stat( * * Returns zero on success; non-zero on error */ -static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) +static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, + uid_t *check_ruid) { char *p; int rc = 0; @@ -276,6 +280,8 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; + *check_ruid = 0; + if (!options) { rc = -EINVAL; goto out; @@ -380,6 +386,9 @@ static int ecryptfs_parse_options(struct 
ecryptfs_sb_info *sbi, char *options) mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; + case ecryptfs_opt_check_dev_ruid: + *check_ruid = 1; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -475,6 +484,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags const char *err = "Getting sb failed"; struct inode *inode; struct path path; + uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); @@ -483,7 +493,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = ecryptfs_parse_options(sbi, raw_data); + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; @@ -521,6 +531,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags "known incompatibilities\n"); goto out_free; } + + if (check_ruid && path.dentry->d_inode->i_uid != current_uid()) { + rc = -EPERM; + printk(KERN_ERR "Mount of device (uid: %d) not owned by " + "requested user (uid: %d)\n", + path.dentry->d_inode->i_uid, current_uid()); + goto out_free; + } + ecryptfs_set_superblock_lower(s, path.dentry->d_sb); s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; -- cgit v1.1 From 8c9729a2ca6291a80c4d0473ae64f2b7a547cc9e Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 4 Aug 2011 22:58:51 -0500 Subject: eCryptfs: Return error when lower file pointer is NULL commit f61500e000eedc0c7a0201200a7f00ba5529c002 upstream. When an eCryptfs inode's lower file has been closed, and the pointer has been set to NULL, return an error when trying to do a lower read or write rather than calling BUG(). https://bugzilla.kernel.org/show_bug.cgi?id=37292 Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/read_write.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 85d4309..3745f7c 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -39,15 +39,16 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, loff_t offset, size_t size) { - struct ecryptfs_inode_info *inode_info; + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - inode_info = ecryptfs_inode_to_private(ecryptfs_inode); - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_write(inode_info->lower_file, data, size, &offset); + rc = vfs_write(lower_file, data, size, &offset); set_fs(fs_save); mark_inode_dirty_sync(ecryptfs_inode); return rc; @@ -225,15 +226,16 @@ out: int ecryptfs_read_lower(char *data, loff_t offset, size_t size, struct inode *ecryptfs_inode) { - struct ecryptfs_inode_info *inode_info = - ecryptfs_inode_to_private(ecryptfs_inode); + struct file *lower_file; mm_segment_t fs_save; ssize_t rc; - BUG_ON(!inode_info->lower_file); + lower_file = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file; + if (!lower_file) + return -EIO; fs_save = get_fs(); set_fs(get_ds()); - rc = vfs_read(inode_info->lower_file, data, size, &offset); + rc = vfs_read(lower_file, data, size, &offset); set_fs(fs_save); return rc; } -- cgit v1.1 From 1ae2a2c0515870d784f1ea101b079bd0962b6cd5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 11 Aug 2011 09:51:46 -0500 Subject: ext3: Properly count journal credits for long 
symlinks commit d2db60df1e7eb39cf0f378dfc4dd8813666d46ef upstream. Commit ae54870a1dc9 ("ext3: Fix lock inversion in ext3_symlink()") recalculated the number of credits needed for a long symlink, in the process of splitting it into two transactions. However, the first credit calculation under-counted because if selinux is enabled, credits are needed to create the selinux xattr as well. Overrunning the reservation will result in an OOPS in journal_dirty_metadata() due to this assert: J_ASSERT_JH(jh, handle->h_buffer_credits > 0); Fix this by increasing the reservation size. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Acked-by: "Theodore Ts'o" Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ext3/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 34b6d9b..e5a7111 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2210,9 +2210,11 @@ static int ext3_symlink (struct inode * dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT3_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory -- cgit v1.1 From 8b180803c66ccc825d2969c1ea9929fcb7fba98e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 11 Aug 2011 09:54:31 -0500 Subject: ext4: Properly count journal credits for long symlinks commit 8c20871998c082f6fbc963f1449a5ba5140ee39a upstream. Commit df5e6223407e ("ext4: fix deadlock in ext4_symlink() in ENOSPC conditions") recalculated the number of credits needed for a long symlink, in the process of splitting it into two transactions. However, the first credit calculation under-counted because if selinux is enabled, credits are needed to create the selinux xattr as well. Overrunning the reservation will result in an OOPS in jbd2_journal_dirty_metadata() due to this assert: J_ASSERT_JH(jh, handle->h_buffer_credits > 0); Fix this by increasing the reservation size. Signed-off-by: Eric Sandeen Reviewed-by: Jan Kara Acked-by: "Theodore Ts'o" Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b754b77..458a394 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2264,9 +2264,11 @@ static int ext4_symlink(struct inode *dir, /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks. + * group descriptor, sb, inode block, quota blocks, and + * possibly selinux xattr blocks. */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS; } else { /* * Fast symlink. We have to add entry to directory -- cgit v1.1 From b732c7ad1f6e6056dd898de7987665d4c490c78d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Aug 2011 11:50:24 -0400 Subject: cifs: demote cERROR in build_path_from_dentry to cFYI commit fa71f447065f676157ba6a2c121ba419818fc559 upstream. 
Running the cthon tests on a recent kernel caused this message to pop occasionally: CIFS VFS: did not end path lookup where expected namelen is 0 Some added debugging showed that namelen and dfsplen were both 0 when this occurred. That means that the read_seqretry returned true. Assuming that the comment inside the if statement is true, this should be harmless and just means that we raced with a rename. If that is the case, then there's no need for alarm and we can demote this to cFYI. While we're at it, print the dfsplen too so that we can see what happened here if the message pops during debugging. Cc: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d8d26f3..16cdd6d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -110,8 +110,8 @@ cifs_bp_rename_retry: } rcu_read_unlock(); if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) { - cERROR(1, "did not end path lookup where expected namelen is %d", - namelen); + cFYI(1, "did not end path lookup where expected. namelen=%d " + "dfsplen=%d", namelen, dfsplen); /* presumably this is only possible if racing with a rename of one of the parent directories (we can not lock the dentries above us to prevent this, but retrying should be harmless) */ -- cgit v1.1 From f10df4139384fdede3be489dbfea077fc3c26ad9 Mon Sep 17 00:00:00 2001 From: Timo Warns Date: Wed, 17 Aug 2011 17:59:56 +0200 Subject: befs: Validate length of long symbolic links. commit 338d0f0a6fbc82407864606f5b64b75aeb3c70f2 upstream. Signed-off-by: Timo Warns Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/befs/linuxvfs.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 54b8c28..720d885 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); + if (len == 0) { + befs_error(sb, "Long symlink with illegal length"); link = ERR_PTR(-EIO); } else { - link[len - 1] = '\0'; + befs_debug(sb, "Follow long symlink"); + + link = kmalloc(len, GFP_NOFS); + if (!link) { + link = ERR_PTR(-ENOMEM); + } else if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + link = ERR_PTR(-EIO); + } else { + link[len - 1] = '\0'; + } } } else { link = befs_ino->i_data.symlink; -- cgit v1.1 From b3ff2fd377a0b593678af0082b6a2e4ecc3eec84 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 18 Aug 2011 04:41:55 +0000 Subject: possible memory corruption on mount commit 13589c437daf4c8e429b3236c0b923de1c9420d8 upstream. CIFS cleanup_volume_info_contents() looks like having a memory corruption problem. When UNCip is set to "&vol->UNC[2]" in cifs_parse_mount_options(), it should not be kfree()-ed in cleanup_volume_info_contents(). Introduced in commit b946845a9dc523c759cae2b6a0f6827486c3221a Signed-off-by: J.R. 
Okajima Reviewed-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ccc1afa..e0ea721 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2838,7 +2838,8 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); - kfree(volume_info->UNCip); + if (volume_info->UNCip != volume_info->UNC + 2) + kfree(volume_info->UNCip); kfree(volume_info->domainname); kfree(volume_info->iocharset); kfree(volume_info->prepath); -- cgit v1.1 From 3be1216c9e9976074d49414a53abf572ec4b4a24 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 3 Aug 2011 21:52:51 -0700 Subject: pnfs-obj: Fix the comp_index != 0 case commit 9af7db3228acc286c50e3a0f054ec982efdbc6c6 upstream. There were bugs in the case of partial layout where olo_comp_index is not zero. This used to work and was tested but one of the later cleanup SQUASHMEs broke it and was not tested since. Also add a dprint that specify those received layout parameters. Everything else was already printed. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/objlayout/objio_osd.c | 16 +++++++--------- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 3 +++ 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 8ff2ea3..799e8cf 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write) for (i = 0; i < ios->numdevs; i++) { struct osd_sense_info osi; struct osd_request *or = ios->per_dev[i].or; - unsigned dev; int ret; if (!or) @@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write) continue; /* we recovered */ } - dev = ios->per_dev[i].dev; - objlayout_io_set_result(&ios->ol_state, dev, - &ios->layout->comps[dev].oc_object_id, + objlayout_io_set_result(&ios->ol_state, i, + &ios->layout->comps[i].oc_object_id, osd_pri_2_pnfs_err(osi.osd_err_pri), ios->per_dev[i].offset, ios->per_dev[i].length, @@ -650,7 +648,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, int ret = 0; while (length) { - struct _objio_per_comp *per_dev = &ios->per_dev[dev]; + struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev]; unsigned cur_len, page_off = 0; if (!per_dev->length) { @@ -670,8 +668,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length, cur_len = stripe_unit; } - if (max_comp < dev) - max_comp = dev; + if (max_comp < dev - first_dev) + max_comp = dev - first_dev; } else { cur_len = stripe_unit; } @@ -806,7 +804,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp) struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp]; unsigned dev = per_dev->dev; struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; + &ios->layout->comps[cur_comp]; struct osd_obj_id obj = { .partition = cred->oc_object_id.oid_partition_id, .id = cred->oc_object_id.oid_object_id, @@ -904,7 +902,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp) for (; cur_comp < last_comp; ++cur_comp, ++dev) { struct osd_request *or = NULL; struct pnfs_osd_object_cred *cred = - &ios->layout->comps[dev]; + &ios->layout->comps[cur_comp]; struct osd_obj_id obj = { .partition = 
cred->oc_object_id.oid_partition_id, .id = cred->oc_object_id.oid_object_id, diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index 16fc758..b3918f7 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, p = _osd_xdr_decode_data_map(p, &layout->olo_map); layout->olo_comps_index = be32_to_cpup(p++); layout->olo_num_comps = be32_to_cpup(p++); + dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, + layout->olo_comps_index, layout->olo_num_comps); + iter->total_comps = layout->olo_num_comps; return 0; } -- cgit v1.1 From b861a2580da034f6a57517c687ded68e20f99763 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Wed, 3 Aug 2011 21:54:33 -0700 Subject: pnfs-obj: Bug when we are running out of bio commit 20618b21da0796115e81906d24ff1601552701b7 upstream. When we have a situation that the number of pages we want to encode is bigger then the size of the bio. (Which can currently happen only when all IO is going to a single device .e.g group_width==1) then the IO is submitted short and we report back only the amount of bytes we actually wrote/read and all is fine. BUT ... There was a bug that the current length counter was advanced before the fail to add the extra page, and we come to a situation that the CDB length was one-page longer then the actual bio size, which is of course rejected by the osd-target. While here also fix the bio size calculation, in the case that we received more then one group of devices. Signed-off-by: Boaz Harrosh Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/objlayout/objio_osd.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 799e8cf..1d1dc1e 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -587,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset, } static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, - unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len, + unsigned pgbase, struct _objio_per_comp *per_dev, int len, gfp_t gfp_flags) { unsigned pg = *cur_pg; + int cur_len = len; struct request_queue *q = osd_request_queue(_io_od(ios, per_dev->dev)); - per_dev->length += cur_len; - if (per_dev->bio == NULL) { - unsigned stripes = ios->layout->num_comps / - ios->layout->mirrors_p1; - unsigned pages_in_stripe = stripes * + unsigned pages_in_stripe = ios->layout->group_width * (ios->layout->stripe_unit / PAGE_SIZE); unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) / - stripes; + ios->layout->group_width; if (BIO_MAX_PAGES_KMALLOC < bio_size) bio_size = BIO_MAX_PAGES_KMALLOC; @@ -630,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg, } BUG_ON(cur_len); + per_dev->length += len; *cur_pg = pg; return 0; } -- cgit v1.1 From f4bc412bc2f46d644375403b601f42d8487949da Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Aug 2011 14:46:29 -0400 Subject: NFSv4.1: Fix the callback 'highest_used_slotid' behaviour commit 55a673990ec04cf63005318bcf08c2b0046e5778 upstream. Currently, there is no guarantee that we will call nfs4_cb_take_slot() even though nfs4_callback_compound() will consistently call nfs4_cb_free_slot() provided the cb_process_state has set the 'clp' field. 
The result is that we can trigger the BUG_ON() upon the next call to nfs4_cb_take_slot(). This patch fixes the above problem by using the slot id that was taken in the CB_SEQUENCE operation as a flag for whether or not we need to call nfs4_cb_free_slot(). It also fixes an atomicity problem: we need to set tbl->highest_used_slotid atomically with the check for NFS4_SESSION_DRAINING, otherwise we end up racing with the various tests in nfs4_begin_drain_session(). Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback.h | 2 +- fs/nfs/callback_proc.c | 20 ++++++++++++++------ fs/nfs/callback_xdr.c | 24 +++++++----------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b257383..07df5f1 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -38,6 +38,7 @@ enum nfs4_callback_opnum { struct cb_process_state { __be32 drc_status; struct nfs_client *clp; + int slotid; }; struct cb_compound_hdr_arg { @@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall( void *dummy, struct cb_process_state *cps); extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); -extern void nfs4_cb_take_slot(struct nfs_client *clp); struct cb_devicenotifyitem { uint32_t cbd_notify_type; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index d4d1954..31407ec 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -333,7 +333,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Normal */ if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { slot->seq_nr++; - return htonl(NFS4_OK); + goto out_ok; } /* Replay */ @@ -352,11 +352,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) /* Wraparound */ if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) { slot->seq_nr = 1; - return htonl(NFS4_OK); + goto out_ok; } /* Misordered request */ return htonl(NFS4ERR_SEQ_MISORDERED); +out_ok: + tbl->highest_used_slotid = args->csa_slotid; + return htonl(NFS4_OK); } /* @@ -418,26 +421,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_sequenceres *res, struct cb_process_state *cps) { + struct nfs4_slot_table *tbl; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); - cps->clp = NULL; - clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid); if (clp == NULL) goto out; + tbl = &clp->cl_session->bc_slot_table; + + spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { - status = NFS4ERR_DELAY; + spin_unlock(&tbl->slot_tbl_lock); + status = htonl(NFS4ERR_DELAY); goto out; } status = validate_seqid(&clp->cl_session->bc_slot_table, args); + spin_unlock(&tbl->slot_tbl_lock); if (status) goto out; + cps->slotid = args->csa_slotid; + /* * Check for pending referring calls. 
If a match is found, a * related callback was received before the response to the original @@ -454,7 +463,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_slotid = args->csa_slotid; res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - nfs4_cb_take_slot(clp); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c6c86a7..918ad64 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session) * Let the state manager know callback processing done. * A single slot, so highest used slotid is either 0 or -1 */ - tbl->highest_used_slotid--; + tbl->highest_used_slotid = -1; nfs4_check_drain_bc_complete(session); spin_unlock(&tbl->slot_tbl_lock); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { - if (clp && clp->cl_session) - nfs4_callback_free_slot(clp->cl_session); -} - -/* A single slot, so highest used slotid is either 0 or -1 */ -void nfs4_cb_take_slot(struct nfs_client *clp) -{ - struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table; - - spin_lock(&tbl->slot_tbl_lock); - tbl->highest_used_slotid++; - BUG_ON(tbl->highest_used_slotid != 0); - spin_unlock(&tbl->slot_tbl_lock); + if (cps->slotid != -1) + nfs4_callback_free_slot(cps->clp->cl_session); } #else /* CONFIG_NFS_V4_1 */ @@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op) return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } -static void nfs4_cb_free_slot(struct nfs_client *clp) +static void nfs4_cb_free_slot(struct cb_process_state *cps) { } #endif /* CONFIG_NFS_V4_1 */ @@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r struct cb_process_state cps = { .drc_status = 0, .clp = NULL, + .slotid = -1, }; unsigned int nops = 0; @@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.status = status; *hdr_res.nops = htonl(nops); - nfs4_cb_free_slot(cps.clp); + nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; -- cgit v1.1 From e25d2c749d25fc559f374766af66d267c97e0877 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Aug 2011 14:46:52 -0400 Subject: NFSv4.1: Return NFS4ERR_BADSESSION to callbacks during session resets commit 910ac68a2b80c7de95bc8488734067b1bb15d583 upstream. If the client is in the process of resetting the session when it receives a callback, then returning NFS4ERR_DELAY may cause a deadlock with the DESTROY_SESSION call. Basically, if the client returns NFS4ERR_DELAY in response to the CB_SEQUENCE call, then the server is entitled to believe that the client is busy because it is already processing that call. In that case, the server is perfectly entitled to respond with a NFS4ERR_BACK_CHAN_BUSY to any DESTROY_SESSION call. Fix this by having the client reply with a NFS4ERR_BADSESSION in response to the callback if it is resetting the session. 
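Taken together with the DRAINING check introduced in the previous patch, the status selection in nfs4_callback_sequence() now reads roughly as follows (kernel context assumed; client lookup, slot validation and the non-draining unlock path omitted):

	spin_lock(&tbl->slot_tbl_lock);
	if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
		spin_unlock(&tbl->slot_tbl_lock);
		status = htonl(NFS4ERR_DELAY);
		/* Draining because of a session reset: report the session as
		 * already gone, so the server's DESTROY_SESSION is not
		 * refused with NFS4ERR_BACK_CHAN_BUSY. */
		if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
			status = htonl(NFS4ERR_BADSESSION);
		goto out;
	}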
Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/callback_proc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 31407ec..aaa09e9 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -437,6 +437,11 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) { spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); + /* Return NFS4ERR_BADSESSION if we're draining the session + * in order to reset it. + */ + if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) + status = htonl(NFS4ERR_BADSESSION); goto out; } -- cgit v1.1 From a2ea18615b6929ccc884e651cd1c0e04941548bf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 4 Aug 2011 14:52:27 +0000 Subject: Btrfs: detect wether a device supports discard commit d5e2003c2bcda93a8f2e668eb4642d70c9c38301 upstream. We have a problem where if a user specifies discard but doesn't actually support it we will return EOPNOTSUPP from btrfs_discard_extent. This is a problem because this gets called (in a fashion) from the tree log recovery code, which has a nice little BUG_ON(ret) after it, which causes us to fail the tree log replay. So instead detect wether our devices support discard when we're adding them and then don't issue discards if we know that the device doesn't support it. And just for good measure set ret = 0 in btrfs_issue_discard just in case we still get EOPNOTSUPP so we don't screw anybody up like this again. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 12 ++++++++++-- fs/btrfs/volumes.c | 17 +++++++++++++++++ fs/btrfs/volumes.h | 2 ++ 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 71cd456..7e20a65 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1784,6 +1784,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, for (i = 0; i < multi->num_stripes; i++, stripe++) { + if (!stripe->dev->can_discard) + continue; + ret = btrfs_issue_discard(stripe->dev->bdev, stripe->physical, stripe->length); @@ -1791,11 +1794,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, discarded_bytes += stripe->length; else if (ret != -EOPNOTSUPP) break; + + /* + * Just in case we get back EOPNOTSUPP for some reason, + * just ignore the return value so we don't screw up + * people calling discard_extent. 
+ */ + ret = 0; } kfree(multi); } - if (discarded_bytes && ret == -EOPNOTSUPP) - ret = 0; if (actual_bytes) *actual_bytes = discarded_bytes; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 19450bc..43baaf0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -500,6 +500,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->rw_devices--; } + if (device->can_discard) + fs_devices->num_can_discard--; + new_device = kmalloc(sizeof(*new_device), GFP_NOFS); BUG_ON(!new_device); memcpy(new_device, device, sizeof(*new_device)); @@ -508,6 +511,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) new_device->bdev = NULL; new_device->writeable = 0; new_device->in_fs_metadata = 0; + new_device->can_discard = 0; list_replace_rcu(&device->dev_list, &new_device->dev_list); call_rcu(&device->rcu, free_device); @@ -547,6 +551,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder) { + struct request_queue *q; struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; @@ -603,6 +608,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, seeding = 0; } + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + device->can_discard = 1; + fs_devices->num_can_discard++; + } + device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; @@ -1542,6 +1553,7 @@ error: int btrfs_init_new_device(struct btrfs_root *root, char *device_path) { + struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; @@ -1611,6 +1623,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) lock_chunks(root); + q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) + device->can_discard = 1; device->writeable = 1; device->work.func = pending_bios_fn; generate_random_uuid(device->uuid); @@ -1646,6 +1661,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->num_devices++; root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; + if (device->can_discard) + root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; if (!blk_queue_nonrot(bdev_get_queue(bdev))) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7c12d61..6d866db 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -48,6 +48,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; int missing; + int can_discard; spinlock_t io_lock; @@ -104,6 +105,7 @@ struct btrfs_fs_devices { u64 rw_devices; u64 missing_devices; u64 total_rw_bytes; + u64 num_can_discard; struct block_device *latest_bdev; /* all of the devices in the FS, protected by a mutex -- cgit v1.1 From 966ef7daecee021ffded11d584e57160ee0395c2 Mon Sep 17 00:00:00 2001 From: liubo Date: Sat, 6 Aug 2011 08:35:23 +0000 Subject: Btrfs: fix an oops of log replay commit 34f3e4f23ca3d259fe078f62a128d97ca83508ef upstream. When btrfs recovers from a crash, it may hit the oops below: ------------[ cut here ]------------ kernel BUG at fs/btrfs/inode.c:4580! [...] RIP: 0010:[] [] btrfs_add_link+0x161/0x1c0 [btrfs] [...] Call Trace: [] ? 
btrfs_inode_ref_index+0x31/0x80 [btrfs] [] add_inode_ref+0x319/0x3f0 [btrfs] [] replay_one_buffer+0x2c7/0x390 [btrfs] [] walk_down_log_tree+0x32a/0x480 [btrfs] [] walk_log_tree+0xf5/0x240 [btrfs] [] btrfs_recover_log_trees+0x250/0x350 [btrfs] [] ? btrfs_recover_log_trees+0x350/0x350 [btrfs] [] open_ctree+0x1442/0x17d0 [btrfs] [...] This comes from that while replaying an inode ref item, we forget to check those old conflicting DIR_ITEM and DIR_INDEX items in fs/file tree, then we will come to conflict corners which lead to BUG_ON(). Signed-off-by: Liu Bo Tested-by: Andy Lutomirski Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/tree-log.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4ce8a9f..7fa128d 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir; - int ret; struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *dir; struct inode *inode; - char *name; - int namelen; unsigned long ref_ptr; unsigned long ref_end; + char *name; + int namelen; + int ret; int search_done = 0; /* @@ -909,6 +910,25 @@ again: } btrfs_release_path(path); + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + + /* look for a conflicing name */ + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(path); + insert: /* insert our name */ ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, -- cgit v1.1 From 2fb522e963f57a6c0206f67a355b8131b16ef607 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Sat, 13 Aug 2011 11:25:18 -0400 Subject: ext4: Fix ext4_should_writeback_data() for no-journal mode commit 441c850857148935babe000fc2ba1455fe54a6a9 upstream. ext4_should_writeback_data() had an incorrect sequence of tests to determine if it should return 0 or 1: in particular, even in no-journal mode, 0 was being returned for a non-regular-file inode. This meant that, in non-journal mode, we would use ext4_journalled_aops for directories, symlinks, and other non-regular files. However, calling journalled aop callbacks when there is no valid handle, can cause problems. This would cause a kernel crash with Jan Kara's commit 2d859db3e4 ("ext4: fix data corruption in inodes with journalled data"), because we now dereference 'handle' in ext4_journalled_write_end(). I also added BUG_ONs to check for a valid handle in the obviously journal-only aops callbacks. I tested this running xfstests with a scratch device in these modes: - no-journal - data=ordered - data=writeback - data=journal All work fine; the data=journal run has many failures and a crash in xfstests 074, but this is no different from a vanilla kernel. 
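After the fix, the test order in ext4_should_writeback_data() is effectively the following (reproduced here in readable form; the actual hunk follows below):

	static inline int ext4_should_writeback_data(struct inode *inode)
	{
		/* No journal at all: every inode, regular or not, uses the
		 * ordinary writeback path -- this check must come first. */
		if (EXT4_JOURNAL(inode) == NULL)
			return 1;
		/* With a journal present, non-regular files are not treated
		 * as writeback data, as before. */
		if (!S_ISREG(inode->i_mode))
			return 0;
		if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
			return 0;
		if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
			return 1;
		return 0;
	}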
Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/ext4_jbd2.h | 4 ++-- fs/ext4/inode.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index bb85757..5802fa1 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { - if (!S_ISREG(inode->i_mode)) - return 0; if (EXT4_JOURNAL(inode) == NULL) return 1; + if (!S_ISREG(inode->i_mode)) + return 0; if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3126c0..0b60a46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1849,6 +1849,8 @@ static int ext4_journalled_write_end(struct file *file, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + BUG_ON(!ext4_handle_valid(handle)); + if (copied < len) { if (!PageUptodate(page)) copied = 0; @@ -2564,6 +2566,8 @@ static int __ext4_journalled_writepage(struct page *page, goto out; } + BUG_ON(!ext4_handle_valid(handle)); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, do_journal_get_write_access); -- cgit v1.1 From 2526f368949bccda6e8ed1bf74a4e955e3af42af Mon Sep 17 00:00:00 2001 From: Jiaying Zhang Date: Sat, 13 Aug 2011 12:17:13 -0400 Subject: ext4: call ext4_ioend_wait and ext4_flush_completed_IO in ext4_evict_inode commit 2581fdc810889fdea97689cb62481201d579c796 upstream. Flush inode's i_completed_io_list before calling ext4_io_wait to prevent the following deadlock scenario: A page fault happens while some process is writing inode A. During page fault, shrink_icache_memory is called that in turn evicts another inode B. Inode B has some pending io_end work so it calls ext4_ioend_wait() that waits for inode B's i_ioend_count to become zero. However, inode B's ioend work was queued behind some of inode A's ioend work on the same cpu's ext4-dio-unwritten workqueue. As the ext4-dio-unwritten thread on that cpu is processing inode A's ioend work, it tries to grab inode A's i_mutex lock. Since the i_mutex lock of inode A is still hold before the page fault happened, we enter a deadlock. Also moves ext4_flush_completed_IO and ext4_ioend_wait from ext4_destroy_inode() to ext4_evict_inode(). During inode deleteion, ext4_evict_inode() is called before ext4_destroy_inode() and in ext4_evict_inode(), we may call ext4_truncate() without holding i_mutex lock. As a result, there is a race between flush_completed_IO that is called from ext4_ext_truncate() and ext4_end_io_work, which may cause corruption on an io_end structure. This change moves ext4_flush_completed_IO and ext4_ioend_wait from ext4_destroy_inode() to ext4_evict_inode() to resolve the race between ext4_truncate() and ext4_end_io_work during inode deletion. 
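In code, the ordering this patch establishes at the top of ext4_evict_inode() is (simplified; the hunk follows below):

	trace_ext4_evict_inode(inode);

	/* Process this inode's i_completed_io_list synchronously, under
	 * i_mutex, instead of leaving it to the shared ext4-dio-unwritten
	 * workqueue, which may itself be blocked on another inode's
	 * i_mutex. */
	mutex_lock(&inode->i_mutex);
	ext4_flush_completed_IO(inode);
	mutex_unlock(&inode->i_mutex);

	/* Only then wait for the remaining io_end references to drop,
	 * before any truncation work can race with ext4_end_io_work. */
	ext4_ioend_wait(inode);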
Signed-off-by: Jiaying Zhang Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 6 ++++++ fs/ext4/super.c | 1 - 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0b60a46..773de46 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -189,6 +189,12 @@ void ext4_evict_inode(struct inode *inode) int err; trace_ext4_evict_inode(inode); + + mutex_lock(&inode->i_mutex); + ext4_flush_completed_IO(inode); + mutex_unlock(&inode->i_mutex); + ext4_ioend_wait(inode); + if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa..111ed9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -892,7 +892,6 @@ static void ext4_i_callback(struct rcu_head *head) static void ext4_destroy_inode(struct inode *inode) { - ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", -- cgit v1.1 From 4eddd2a50f2548c6c83081fde8fdbc3de07626f6 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 13 Aug 2011 12:30:59 -0400 Subject: ext4: Resolve the hang of direct i/o read in handling EXT4_IO_END_UNWRITTEN. commit 32c80b32c053dc52712dedac5e4d0aa7c93fc353 upstream. EXT4_IO_END_UNWRITTEN flag set and the increase of i_aiodio_unwritten should be done simultaneously since ext4_end_io_nolock always clear the flag and decrease the counter in the same time. We don't increase i_aiodio_unwritten when setting EXT4_IO_END_UNWRITTEN so it will go nagative and causes some process to wait forever. Part of the patch came from Eric in his e-mail, but it doesn't fix the problem met by Michael actually. http://marc.info/?l=linux-ext4&m=131316851417460&w=2 Reported-and-Tested-by: Michael Tokarev Signed-off-by: Eric Sandeen Signed-off-by: Tao Ma Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 9 ++++++++- fs/ext4/page-io.c | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 773de46..5390b4d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3645,8 +3645,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) goto out; } - io_end->flag = EXT4_IO_END_UNWRITTEN; + /* + * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, + * but being more careful is always safe for the future change. 
+ */ inode = io_end->inode; + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76..97e5e98 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -338,8 +338,10 @@ submit_and_retry: if ((io_end->num_io_pages >= MAX_IO_PAGES) && (io_end->pages[io_end->num_io_pages-1] != io_page)) goto submit_and_retry; - if (buffer_uninit(bh)) - io->io_end->flag |= EXT4_IO_END_UNWRITTEN; + if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + io_end->flag |= EXT4_IO_END_UNWRITTEN; + atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); + } io->io_end->size += bh->b_size; io->io_next_block++; ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); -- cgit v1.1 From 45df4b8977852ea12d6ed19f6c87e6765f6c31e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 13 Aug 2011 12:58:21 -0400 Subject: ext4: fix nomblk_io_submit option so it correctly converts uninit blocks commit 9dd75f1f1a02d656a11a7b9b9e6c2759b9c1e946 upstream. Bug discovered by Jan Kara: Finally, commit 1449032be17abb69116dbc393f67ceb8bd034f92 returned back the old IO submission code but apparently it forgot to return the old handling of uninitialized buffers so we unconditionnaly call block_write_full_page() without specifying end_io function. So AFAICS we never convert unwritten extents to written in some cases. For example when I mount the fs as: mount -t ext4 -o nomblk_io_submit,dioread_nolock /dev/ubdb /mnt and do int fd = open(argv[1], O_RDWR | O_CREAT | O_TRUNC, 0600); char buf[1024]; memset(buf, 'a', sizeof(buf)); fallocate(fd, 0, 0, 16384); write(fd, buf, sizeof(buf)); I get a file full of zeros (after remounting the filesystem so that pagecache is dropped) instead of seeing the first KB contain 'a's. Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5390b4d..b864839 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2156,7 +2156,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); - else + else if (buffer_uninit(page_bufs)) { + ext4_set_bh_endio(page_bufs, inode); + err = block_write_full_page_endio(page, + noalloc_get_block_write, + mpd->wbc, ext4_end_io_buffer_write); + } else err = block_write_full_page(page, noalloc_get_block_write, mpd->wbc); -- cgit v1.1 From 4ca4e8168092fcf2c352b25556e786762668a2a4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 24 Aug 2011 10:20:17 +0200 Subject: fuse: check size of FUSE_NOTIFY_INVAL_ENTRY message commit c2183d1e9b3f313dd8ba2b1b0197c8d9fb86a7ae upstream. FUSE_NOTIFY_INVAL_ENTRY didn't check the length of the write so the message processing could overrun and result in a "kernel BUG at fs/fuse/dev.c:629!" 
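The added check, with its surrounding context in fuse_notify_inval_entry(), boils down to the following (kernel context assumed):

	if (outarg.namelen > FUSE_NAME_MAX)
		goto err;

	err = -EINVAL;
	/* The request must carry exactly the fixed-size header plus
	 * namelen name bytes and a trailing NUL; a shorter write would
	 * otherwise let the name copy below run past the message. */
	if (size != sizeof(outarg) + outarg.namelen + 1)
		goto err;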
Reported-by: Han-Wen Nienhuys Signed-off-by: Miklos Szeredi Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 640fc22..168a80f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1358,6 +1358,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, if (outarg.namelen > FUSE_NAME_MAX) goto err; + err = -EINVAL; + if (size != sizeof(outarg) + outarg.namelen + 1) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); -- cgit v1.1 From 97e5a85664252fd3f3f84751c7779c326df9c851 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 10 Sep 2011 17:20:21 +1000 Subject: Avoid dereferencing a 'request_queue' after last close. commit 94007751bb02797ba87bac7aacee2731ac2039a3 upstream. On the last close of an 'md' device which as been stopped, the device is destroyed and in particular the request_queue is freed. The free is done in a separate thread so it might happen a short time later. __blkdev_put calls bdev_inode_switch_bdi *after* ->release has been called. Since commit f758eeabeb96f878c860e8f110f94ec8820822a9 bdev_inode_switch_bdi will dereference the 'old' bdi, which lives inside a request_queue, to get a spin lock. This causes the last close on an md device to sometime take a spin_lock which lives in freed memory - which results in an oops. So move the called to bdev_inode_switch_bdi before the call to ->release. Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Andrew Morton Cc: Wu Fengguang Acked-by: Wu Fengguang Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- fs/block_dev.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 610e8e0..194cf66 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1419,6 +1419,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); + /* ->release can cause the old bdi to disappear, + * so must switch it out first + */ + bdev_inode_switch_bdi(bdev->bd_inode, + &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -1432,8 +1437,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; -- cgit v1.1 From e38b21e76bb89bba3e59da494de586aac23ad430 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 11 Jul 2011 16:40:59 +0000 Subject: fs/9p: When doing inode lookup compare qid details and inode mode bits. commit fd2421f54423f307ecd31bdebdca6bc317e0c492 upstream. This make sure we don't use wrong inode from the inode hash. The inode number of the file deleted is reused by the next file system object created and if we only use inode number for inode hash lookup we could end up with wrong struct inode. Also compare inode generation number. Not all Linux file system provide st_gen in userspace. 
So it could be 0; Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/cache.c | 20 +++++++++----------- fs/9p/cache.h | 9 --------- fs/9p/v9fs.c | 2 +- fs/9p/v9fs.h | 2 +- fs/9p/vfs_inode.c | 36 +++++++++++++++++++++++++++++++++--- fs/9p/vfs_inode_dotl.c | 41 +++++++++++++++++++++++++++++++++++++---- 6 files changed, 81 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 5b335c5..945aa5f 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -108,11 +108,10 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, void *buffer, uint16_t bufmax) { const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->fscache_key->path, - sizeof(v9inode->fscache_key->path)); + memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path)); P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode, - v9inode->fscache_key->path); - return sizeof(v9inode->fscache_key->path); + v9inode->qid.path); + return sizeof(v9inode->qid.path); } static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, @@ -129,11 +128,10 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, void *buffer, uint16_t buflen) { const struct v9fs_inode *v9inode = cookie_netfs_data; - memcpy(buffer, &v9inode->fscache_key->version, - sizeof(v9inode->fscache_key->version)); + memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version)); P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode, - v9inode->fscache_key->version); - return sizeof(v9inode->fscache_key->version); + v9inode->qid.version); + return sizeof(v9inode->qid.version); } static enum @@ -143,11 +141,11 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, { const struct v9fs_inode *v9inode = cookie_netfs_data; - if (buflen != sizeof(v9inode->fscache_key->version)) + if (buflen != sizeof(v9inode->qid.version)) return FSCACHE_CHECKAUX_OBSOLETE; - if (memcmp(buffer, &v9inode->fscache_key->version, - sizeof(v9inode->fscache_key->version))) + if (memcmp(buffer, &v9inode->qid.version, + sizeof(v9inode->qid.version))) return FSCACHE_CHECKAUX_OBSOLETE; return FSCACHE_CHECKAUX_OKAY; diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 049507a..40cc54c 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -93,15 +93,6 @@ static inline void v9fs_uncache_page(struct inode *inode, struct page *page) BUG_ON(PageFsCache(page)); } -static inline void v9fs_fscache_set_key(struct inode *inode, - struct p9_qid *qid) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - spin_lock(&v9inode->fscache_lock); - v9inode->fscache_key = qid; - spin_unlock(&v9inode->fscache_lock); -} - static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) { diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index c82b017..8b7c6be 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -487,8 +487,8 @@ static void v9fs_inode_init_once(void *foo) struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - v9inode->fscache_key = NULL; #endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); inode_init_once(&v9inode->vfs_inode); } diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index e5ebedf..5d7392e 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -125,8 +125,8 @@ struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE spinlock_t fscache_lock; struct fscache_cookie *fscache; - struct p9_qid *fscache_key; #endif + struct p9_qid qid; unsigned int cache_validity; struct p9_fid 
*writeback_fid; struct mutex v_mutex; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7f6c677..88dbf07 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -216,7 +216,6 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) return NULL; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - v9inode->fscache_key = NULL; spin_lock_init(&v9inode->fscache_lock); #endif v9inode->writeback_fid = NULL; @@ -433,6 +432,37 @@ void v9fs_evict_inode(struct inode *inode) } } +static int v9fs_test_inode(struct inode *inode, void *data) +{ + int umode; + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + umode = p9mode2unixmode(v9ses, st->mode); + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_set_inode(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_wstat *st = (struct p9_wstat *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + return 0; +} + static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_qid *qid, struct p9_wstat *st) @@ -443,7 +473,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, struct v9fs_session_info *v9ses = sb->s_fs_info; i_ino = v9fs_qid2ino(qid); - inode = iget_locked(sb, i_ino); + inode = iget5_locked(sb, i_ino, v9fs_test_inode, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -453,6 +483,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, * FIXME!! we may need support for stale inodes * later. 
*/ + inode->i_ino = i_ino; umode = p9mode2unixmode(v9ses, st->mode); retval = v9fs_init_inode(v9ses, inode, umode); if (retval) @@ -460,7 +491,6 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, v9fs_stat2inode(st, inode, sb); #ifdef CONFIG_9P_FSCACHE - v9fs_fscache_set_key(inode, &st->qid); v9fs_cache_inode_get_cookie(inode); #endif unlock_new_inode(inode); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 691c78f..caa63ef 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -86,6 +86,38 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode) return dentry; } +static int v9fs_test_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + /* don't match inode of different type */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + return 0; + + if (inode->i_generation != st->st_gen) + return 0; + + /* compare qid details */ + if (memcmp(&v9inode->qid.version, + &st->qid.version, sizeof(v9inode->qid.version))) + return 0; + + if (v9inode->qid.type != st->qid.type) + return 0; + return 1; +} + +static int v9fs_set_inode_dotl(struct inode *inode, void *data) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; + + memcpy(&v9inode->qid, &st->qid, sizeof(st->qid)); + inode->i_generation = st->st_gen; + return 0; +} + static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct p9_qid *qid, struct p9_fid *fid, @@ -97,7 +129,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct v9fs_session_info *v9ses = sb->s_fs_info; i_ino = v9fs_qid2ino(qid); - inode = iget_locked(sb, i_ino); + inode = iget5_locked(sb, i_ino, v9fs_test_inode_dotl, + v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -107,13 +140,13 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, * FIXME!! we may need support for stale inodes * later. */ + inode->i_ino = i_ino; retval = v9fs_init_inode(v9ses, inode, st->st_mode); if (retval) goto error; v9fs_stat2inode_dotl(st, inode); #ifdef CONFIG_9P_FSCACHE - v9fs_fscache_set_key(inode, &st->qid); v9fs_cache_inode_get_cookie(inode); #endif retval = v9fs_get_acl(inode, fid); @@ -136,7 +169,7 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct p9_stat_dotl *st; struct inode *inode = NULL; - st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN); if (IS_ERR(st)) return ERR_CAST(st); @@ -547,7 +580,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_blocks = stat->st_blocks; } if (stat->st_result_mask & P9_STATS_GEN) - inode->i_generation = stat->st_gen; + inode->i_generation = stat->st_gen; /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION * because the inode structure does not have fields for them. -- cgit v1.1 From 6170eea647404a2e5726ae81cc8ccc45db89812d Mon Sep 17 00:00:00 2001 From: Prem Karat Date: Fri, 6 May 2011 18:24:18 +0530 Subject: fs/9p: Fix invalid mount options/args commit a2dd43bb0d7b9ce28f8a39254c25840c0730498e upstream. Without this fix, if any invalid mount options/args are passed while mouting the 9p fs, no error (-EINVAL) is returned and default arg value is assigned. This fix returns -EINVAL when an invalid arguement is found while parsing mount options. 
Signed-off-by: Prem Karat Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs.c | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 8b7c6be..ef96618 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -78,6 +78,25 @@ static const match_table_t tokens = { {Opt_err, NULL} }; +/* Interpret mount options for cache mode */ +static int get_cache_mode(char *s) +{ + int version = -EINVAL; + + if (!strcmp(s, "loose")) { + version = CACHE_LOOSE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n"); + } else if (!strcmp(s, "fscache")) { + version = CACHE_FSCACHE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n"); + } else if (!strcmp(s, "none")) { + version = CACHE_NONE; + P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n"); + } else + printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s); + return version; +} + /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information @@ -97,7 +116,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) /* setup defaults */ v9ses->afid = ~0; v9ses->debug = 0; - v9ses->cache = 0; + v9ses->cache = CACHE_NONE; #ifdef CONFIG_9P_FSCACHE v9ses->cachetag = NULL; #endif @@ -171,13 +190,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) "problem allocating copy of cache arg\n"); goto free_and_return; } + ret = get_cache_mode(s); + if (ret == -EINVAL) { + kfree(s); + goto free_and_return; + } - if (strcmp(s, "loose") == 0) - v9ses->cache = CACHE_LOOSE; - else if (strcmp(s, "fscache") == 0) - v9ses->cache = CACHE_FSCACHE; - else - v9ses->cache = CACHE_NONE; + v9ses->cache = ret; kfree(s); break; @@ -200,9 +219,15 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) } else { v9ses->flags |= V9FS_ACCESS_SINGLE; v9ses->uid = simple_strtoul(s, &e, 10); - if (*e != '\0') - v9ses->uid = ~0; + if (*e != '\0') { + ret = -EINVAL; + printk(KERN_INFO "9p: Unknown access " + "argument %s.\n", s); + kfree(s); + goto free_and_return; + } } + kfree(s); break; -- cgit v1.1 From 926fa0b4b9cd6537f4750642165647aa20cfeae4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 6 Jul 2011 16:32:31 +0530 Subject: fs/9p: Always ask new inode in create commit ed80fcfac2565fa866d93ba14f0e75de17a8223e upstream. This make sure we don't end up reusing the unlinked inode object. The ideal way is to use inode i_generation. But i_generation is not available in userspace always. 
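For context, a sketch of the mechanism being leaned on (prototype as in fs/inode.c; the callback mirrors the one added below): iget5_locked() runs the supplied test callback against every in-core inode hashed at the same value and only allocates a new inode when none matches, so a test that always fails forces a fresh inode and can never hand back a stale one left over from a racing unlink.

struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
			   int (*test)(struct inode *, void *),
			   int (*set)(struct inode *, void *), void *data);

/* A test callback that never matches: create paths get a brand-new
 * in-core inode even if one with the same qid-derived hash still exists. */
static int v9fs_test_new_inode(struct inode *inode, void *data)
{
	return 0;
}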
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs.h | 27 +++++++++++++++++++++++---- fs/9p/vfs_inode.c | 22 +++++++++++++++++----- fs/9p/vfs_inode_dotl.c | 30 +++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 5d7392e..e78956c 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -153,13 +153,13 @@ extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb); + struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb); + struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 @@ -201,8 +201,27 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) - return v9fs_inode_from_fid_dotl(v9ses, fid, sb); + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else - return v9fs_inode_from_fid(v9ses, fid, sb); + return v9fs_inode_from_fid(v9ses, fid, sb, 0); } + +/** + * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by + * issuing a attribute request + * @v9ses: session information + * @fid: fid to issue attribute request for + * @sb: superblock on which to create inode + * + */ +static inline struct inode * +v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, + struct super_block *sb) +{ + if (v9fs_proto_dotl(v9ses)) + return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); + else + return v9fs_inode_from_fid(v9ses, fid, sb, 1); +} + #endif diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 88dbf07..35d4121 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -454,6 +454,11 @@ static int v9fs_test_inode(struct inode *inode, void *data) return 1; } +static int v9fs_test_new_inode(struct inode *inode, void *data) +{ + return 0; +} + static int v9fs_set_inode(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); @@ -465,15 +470,22 @@ static int v9fs_set_inode(struct inode *inode, void *data) static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_qid *qid, - struct p9_wstat *st) + struct p9_wstat *st, + int new) { int retval, umode; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode; + else + test = v9fs_test_inode; i_ino = v9fs_qid2ino(qid); - inode = iget5_locked(sb, i_ino, v9fs_test_inode, v9fs_set_inode, st); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -504,7 +516,7 @@ error: struct inode * v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) + struct super_block *sb, int new) { struct p9_wstat *st; struct inode *inode = NULL; @@ -513,7 +525,7 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, if (IS_ERR(st)) return ERR_CAST(st); - inode = v9fs_qid_iget(sb, &st->qid, st); + inode = v9fs_qid_iget(sb, &st->qid, st, new); 
p9stat_free(st); kfree(st); return inode; @@ -615,7 +627,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, } /* instantiate inode and assign the unopened fid to the dentry */ - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index caa63ef..bec75f0 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -108,6 +108,12 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) return 1; } +/* Always get a new inode */ +static int v9fs_test_new_inode_dotl(struct inode *inode, void *data) +{ + return 0; +} + static int v9fs_set_inode_dotl(struct inode *inode, void *data) { struct v9fs_inode *v9inode = V9FS_I(inode); @@ -121,16 +127,22 @@ static int v9fs_set_inode_dotl(struct inode *inode, void *data) static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, struct p9_qid *qid, struct p9_fid *fid, - struct p9_stat_dotl *st) + struct p9_stat_dotl *st, + int new) { int retval; unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; + int (*test)(struct inode *, void *); + + if (new) + test = v9fs_test_new_inode_dotl; + else + test = v9fs_test_inode_dotl; i_ino = v9fs_qid2ino(qid); - inode = iget5_locked(sb, i_ino, v9fs_test_inode_dotl, - v9fs_set_inode_dotl, st); + inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) @@ -164,7 +176,7 @@ error: struct inode * v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, - struct super_block *sb) + struct super_block *sb, int new) { struct p9_stat_dotl *st; struct inode *inode = NULL; @@ -173,7 +185,7 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, if (IS_ERR(st)) return ERR_CAST(st); - inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st); + inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new); kfree(st); return inode; } @@ -263,7 +275,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, fid = NULL; goto error; } - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); @@ -383,7 +395,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, goto error; } - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -636,7 +648,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, } /* instantiate inode and assign the unopened fid to dentry */ - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", @@ -789,7 +801,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; } - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { err = PTR_ERR(inode); P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", -- cgit v1.1 From 7c0e1afbe36ec6d067638daafe6e7ee30085202b Mon Sep 17 00:00:00 2001 
From: Al Viro Date: Sat, 23 Jul 2011 02:28:13 -0400 Subject: 9p: close ACL leaks commit 1ec95bf34d976b38897d1977b155a544d77b05e7 upstream. Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/9p/acl.c | 22 +++++++++++++--------- fs/9p/acl.h | 6 +++--- fs/9p/vfs_inode_dotl.c | 9 ++++++--- 3 files changed, 22 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 535ab6e..4a866cd 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -185,12 +185,15 @@ int v9fs_acl_chmod(struct dentry *dentry) } int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, struct posix_acl *pacl) + struct posix_acl **dpacl, struct posix_acl **pacl) { - v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); - v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl); - posix_acl_release(dpacl); - posix_acl_release(pacl); + if (dentry) { + v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, *dpacl); + v9fs_set_acl(dentry, ACL_TYPE_ACCESS, *pacl); + } + posix_acl_release(*dpacl); + posix_acl_release(*pacl); + *dpacl = *pacl = NULL; return 0; } @@ -212,11 +215,11 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep, struct posix_acl *clone; if (S_ISDIR(mode)) - *dpacl = acl; + *dpacl = posix_acl_dup(acl); clone = posix_acl_clone(acl, GFP_NOFS); - retval = -ENOMEM; + posix_acl_release(acl); if (!clone) - goto cleanup; + return -ENOMEM; retval = posix_acl_create_masq(clone, &mode); if (retval < 0) { @@ -225,11 +228,12 @@ int v9fs_acl_mode(struct inode *dir, mode_t *modep, } if (retval > 0) *pacl = clone; + else + posix_acl_release(clone); } *modep = mode; return 0; cleanup: - posix_acl_release(acl); return retval; } diff --git a/fs/9p/acl.h b/fs/9p/acl.h index 7ef3ac9..c47ea9c 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -19,7 +19,7 @@ extern int v9fs_get_acl(struct inode *, struct p9_fid *); extern int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags); extern int v9fs_acl_chmod(struct dentry *); extern int v9fs_set_create_acl(struct dentry *, - struct posix_acl *, struct posix_acl *); + struct posix_acl **, struct posix_acl **); extern int v9fs_acl_mode(struct inode *dir, mode_t *modep, struct posix_acl **dpacl, struct posix_acl **pacl); #else @@ -33,8 +33,8 @@ static inline int v9fs_acl_chmod(struct dentry *dentry) return 0; } static inline int v9fs_set_create_acl(struct dentry *dentry, - struct posix_acl *dpacl, - struct posix_acl *pacl) + struct posix_acl **dpacl, + struct posix_acl **pacl) { return 0; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index bec75f0..185ce37 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -287,7 +287,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, goto error; /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); @@ -328,6 +328,7 @@ error: err_clunk_old_fid: if (ofid) p9_client_clunk(ofid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -421,12 +422,13 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, d_instantiate(dentry, inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); inc_nlink(dir); v9fs_invalidate_inode_attr(dir); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } @@ -826,10 +828,11 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, d_instantiate(dentry, 
inode); } /* Now set the ACL based on the default value */ - v9fs_set_create_acl(dentry, dacl, pacl); + v9fs_set_create_acl(dentry, &dacl, &pacl); error: if (fid) p9_client_clunk(fid); + v9fs_set_create_acl(NULL, &dacl, &pacl); return err; } -- cgit v1.1 From e279cdca3ca67dcd8e24051629163f9d5d838894 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Jul 2011 18:06:32 +0000 Subject: fs/9p: Add fid before dentry instantiation commit 5441ae5eb3614d3c28f77073370738a2820c88e4 upstream. d_instantiate marks the dentry positive. So a parallel lookup and mkdir of the directory can find dentry that doesn't have fid attached. This can result in both the code path doing v9fs_fid_add which results in v9fs_dentry leak. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_inode.c | 4 +--- fs/9p/vfs_inode_dotl.c | 8 ++++---- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 35d4121..ad7aae4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -633,13 +633,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; - + d_instantiate(dentry, inode); return ofid; - error: if (ofid) p9_client_clunk(ofid); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 185ce37..0a17235 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -281,10 +281,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); /* Now set the ACL based on the default value */ v9fs_set_create_acl(dentry, &dacl, &pacl); @@ -403,10 +403,10 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* @@ -657,10 +657,10 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* Not in cached mode. No need to populate inode with stat */ @@ -810,10 +810,10 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, err); goto error; } - d_instantiate(dentry, inode); err = v9fs_fid_add(dentry, fid); if (err < 0) goto error; + d_instantiate(dentry, inode); fid = NULL; } else { /* -- cgit v1.1 From 29a3e8657d2a2640384166e3fe29a086d235fc33 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 25 Jul 2011 18:06:33 +0000 Subject: fs/9p: Don't update file type when updating file attributes commit 45089142b1497dab2327d60f6c71c40766fc3ea4 upstream. We should only update attributes that we can change on stat2inode. Also do file type initialization in v9fs_init_inode. 
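Reduced to a sketch (an illustrative helper, not a function the patch adds): on attribute refresh only the permission bits reported by the server are taken over, while the file-type bits already recorded in the inode are preserved.

#include <sys/types.h>

/* S_IALLUGO is 07777: suid/sgid/sticky plus the rwx bits. */
static mode_t merge_server_mode(mode_t inode_mode, mode_t server_mode)
{
	mode_t mode = server_mode & 07777;	/* permissions from the server */
	mode |= inode_mode & ~(mode_t)07777;	/* S_IFMT (file type) kept     */
	return mode;
}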
Signed-off-by: Aneesh Kumar K.V Signed-off-by: Eric Van Hensbergen Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs_vfs.h | 4 +-- fs/9p/vfs_inode.c | 91 +++++++++++++++++++++++++++----------------------- fs/9p/vfs_inode_dotl.c | 23 ++++++++----- fs/9p/vfs_super.c | 2 +- 4 files changed, 68 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 4014160..bd86d22 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -54,9 +54,9 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_destroy_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, int mode); +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode); + struct inode *inode, int mode, dev_t); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index ad7aae4..1ab561d 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -95,15 +95,18 @@ static int unixmode2p9mode(struct v9fs_session_info *v9ses, int mode) /** * p9mode2unixmode- convert plan9 mode bits to unix mode bits * @v9ses: v9fs session information - * @mode: mode to convert + * @stat: p9_wstat from which mode need to be derived + * @rdev: major number, minor number in case of device files. * */ - -static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) +static int p9mode2unixmode(struct v9fs_session_info *v9ses, + struct p9_wstat *stat, dev_t *rdev) { int res; + int mode = stat->mode; - res = mode & 0777; + res = mode & S_IALLUGO; + *rdev = 0; if ((mode & P9_DMDIR) == P9_DMDIR) res |= S_IFDIR; @@ -116,9 +119,26 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) && (v9ses->nodev == 0)) res |= S_IFIFO; else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) - && (v9ses->nodev == 0)) - res |= S_IFBLK; - else + && (v9ses->nodev == 0)) { + char type = 0, ext[32]; + int major = -1, minor = -1; + + strncpy(ext, stat->extension, sizeof(ext)); + sscanf(ext, "%c %u %u", &type, &major, &minor); + switch (type) { + case 'c': + res |= S_IFCHR; + break; + case 'b': + res |= S_IFBLK; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, + "Unknown special type %c %s\n", type, + stat->extension); + }; + *rdev = MKDEV(major, minor); + } else res |= S_IFREG; if (v9fs_proto_dotu(v9ses)) { @@ -131,7 +151,6 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) if ((mode & P9_DMSETVTX) == P9_DMSETVTX) res |= S_ISVTX; } - return res; } @@ -242,13 +261,13 @@ void v9fs_destroy_inode(struct inode *inode) } int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, int mode) + struct inode *inode, int mode, dev_t rdev) { int err = 0; inode_init_owner(inode, NULL, mode); inode->i_blocks = 0; - inode->i_rdev = 0; + inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &v9fs_addr_operations; @@ -335,7 +354,7 @@ error: * */ -struct inode *v9fs_get_inode(struct super_block *sb, int mode) +struct inode *v9fs_get_inode(struct super_block *sb, int mode, dev_t rdev) { int err; struct inode *inode; @@ -348,7 +367,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } - err = v9fs_init_inode(v9ses, 
inode, mode); + err = v9fs_init_inode(v9ses, inode, mode, rdev); if (err) { iput(inode); return ERR_PTR(err); @@ -435,11 +454,12 @@ void v9fs_evict_inode(struct inode *inode) static int v9fs_test_inode(struct inode *inode, void *data) { int umode; + dev_t rdev; struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_wstat *st = (struct p9_wstat *)data; struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - umode = p9mode2unixmode(v9ses, st->mode); + umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) return 0; @@ -473,6 +493,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, struct p9_wstat *st, int new) { + dev_t rdev; int retval, umode; unsigned long i_ino; struct inode *inode; @@ -496,8 +517,8 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, * later. */ inode->i_ino = i_ino; - umode = p9mode2unixmode(v9ses, st->mode); - retval = v9fs_init_inode(v9ses, inode, umode); + umode = p9mode2unixmode(v9ses, st, &rdev); + retval = v9fs_init_inode(v9ses, inode, umode, rdev); if (retval) goto error; @@ -990,7 +1011,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, return PTR_ERR(st); v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(dentry->d_inode, stat); p9stat_free(st); kfree(st); @@ -1074,6 +1095,7 @@ void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb) { + mode_t mode; char ext[32]; char tag_name[14]; unsigned int i_nlink; @@ -1109,31 +1131,9 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_nlink = i_nlink; } } - inode->i_mode = p9mode2unixmode(v9ses, stat->mode); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) { - char type = 0; - int major = -1; - int minor = -1; - - strncpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %u %u", &type, &major, &minor); - switch (type) { - case 'c': - inode->i_mode &= ~S_IFBLK; - inode->i_mode |= S_IFCHR; - break; - case 'b': - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "Unknown special type %c %s\n", type, - stat->extension); - }; - inode->i_rdev = MKDEV(major, minor); - init_special_inode(inode, inode->i_mode, inode->i_rdev); - } else - inode->i_rdev = 0; - + mode = stat->mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->length); /* not real number of blocks, but 512 byte ones ... */ @@ -1399,6 +1399,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) { + int umode; + dev_t rdev; loff_t i_size; struct p9_wstat *st; struct v9fs_session_info *v9ses; @@ -1407,6 +1409,12 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) st = p9_client_stat(fid); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + umode = p9mode2unixmode(v9ses, st, &rdev); + if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -1418,6 +1426,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: p9stat_free(st); kfree(st); return 0; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 0a17235..818bf6f 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -153,7 +153,8 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, * later. 
*/ inode->i_ino = i_ino; - retval = v9fs_init_inode(v9ses, inode, st->st_mode); + retval = v9fs_init_inode(v9ses, inode, + st->st_mode, new_decode_dev(st->st_rdev)); if (retval) goto error; @@ -414,7 +415,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, * inode with stat. We need to get an inode * so that we can set the acl with dentry */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -540,6 +541,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) { + mode_t mode; struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { @@ -552,11 +554,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; inode->i_nlink = stat->st_nlink; - inode->i_mode = stat->st_mode; - inode->i_rdev = new_decode_dev(stat->st_rdev); - if ((S_ISBLK(inode->i_mode)) || (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; i_size_write(inode, stat->st_size); inode->i_blocks = stat->st_blocks; @@ -664,7 +665,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, fid = NULL; } else { /* Not in cached mode. No need to populate inode with stat */ - inode = v9fs_get_inode(dir->i_sb, S_IFLNK); + inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -820,7 +821,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode, * Not in cached mode. No need to populate inode with stat. * socket syscall returns a fd, so we need instantiate */ - inode = v9fs_get_inode(dir->i_sb, mode); + inode = v9fs_get_inode(dir->i_sb, mode, rdev); if (IS_ERR(inode)) { err = PTR_ERR(inode); goto error; @@ -886,6 +887,11 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) st = p9_client_getattr_dotl(fid, P9_STATS_ALL); if (IS_ERR(st)) return PTR_ERR(st); + /* + * Don't update inode if the file type is different + */ + if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + goto out; spin_lock(&inode->i_lock); /* @@ -897,6 +903,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) if (v9ses->cache) inode->i_size = i_size; spin_unlock(&inode->i_lock); +out: kfree(st); return 0; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index feef6cd..c70251d 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -149,7 +149,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, else sb->s_d_op = &v9fs_dentry_operations; - inode = v9fs_get_inode(sb, S_IFDIR | mode); + inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; -- cgit v1.1 From a111278ea956e42e2d1ac4b4f04a2c71d322235b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 3 Aug 2011 19:55:32 +0530 Subject: fs/9p: Add OS dependent open flags in 9p protocol commit f88657ce3f9713a0c62101dffb0e972a979e77b9 upstream. 
Some of the flags are OS/arch dependent we add a 9p protocol value which maps to asm-generic/fcntl.h values in Linux Based on the original patch from Venkateswararao Jujjuri [extra comments from author as to why this needs to go to stable: Earlier for different operation such as open we used the values of open flag as defined by the OS. But some of these flags such as O_DIRECT are arch dependent. So if we have the 9p client and server running on different architectures, we end up with client sending client architecture value of these open flag and server will try to map these values to what its architecture states. For ex: O_DIRECT on a x86 client maps to #define O_DIRECT 00040000 Where as on sparc server it will maps to #define O_DIRECT 0x100000 Hence we need to map these open flags to OS/arch independent flag values. Getting these changes to an early version of kernel ensures us that we work with different combination of client and server. We should ideally backport this patch to all possible kernel version.] Signed-off-by: Aneesh Kumar K.V Signed-off-by: Harsh Prateek Bora Signed-off-by: Greg Kroah-Hartman --- fs/9p/v9fs_vfs.h | 2 ++ fs/9p/vfs_file.c | 2 +- fs/9p/vfs_inode_dotl.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index bd86d22..f9a28ea 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -82,4 +82,6 @@ static inline void v9fs_invalidate_inode_attr(struct inode *inode) v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; return; } + +int v9fs_open_to_dotl_flags(int flags); #endif diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index ffed558..56907e4 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -65,7 +65,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) v9inode = V9FS_I(inode); v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) - omode = file->f_flags; + omode = v9fs_open_to_dotl_flags(file->f_flags); else omode = v9fs_uflags2omode(file->f_flags, v9fs_proto_dotu(v9ses)); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 818bf6f..c873172 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -191,6 +191,58 @@ v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, return inode; } +struct dotl_openflag_map { + int open_flag; + int dotl_flag; +}; + +static int v9fs_mapped_dotl_flags(int flags) +{ + int i; + int rflags = 0; + struct dotl_openflag_map dotl_oflag_map[] = { + { O_CREAT, P9_DOTL_CREATE }, + { O_EXCL, P9_DOTL_EXCL }, + { O_NOCTTY, P9_DOTL_NOCTTY }, + { O_TRUNC, P9_DOTL_TRUNC }, + { O_APPEND, P9_DOTL_APPEND }, + { O_NONBLOCK, P9_DOTL_NONBLOCK }, + { O_DSYNC, P9_DOTL_DSYNC }, + { FASYNC, P9_DOTL_FASYNC }, + { O_DIRECT, P9_DOTL_DIRECT }, + { O_LARGEFILE, P9_DOTL_LARGEFILE }, + { O_DIRECTORY, P9_DOTL_DIRECTORY }, + { O_NOFOLLOW, P9_DOTL_NOFOLLOW }, + { O_NOATIME, P9_DOTL_NOATIME }, + { O_CLOEXEC, P9_DOTL_CLOEXEC }, + { O_SYNC, P9_DOTL_SYNC}, + }; + for (i = 0; i < ARRAY_SIZE(dotl_oflag_map); i++) { + if (flags & dotl_oflag_map[i].open_flag) + rflags |= dotl_oflag_map[i].dotl_flag; + } + return rflags; +} + +/** + * v9fs_open_to_dotl_flags- convert Linux specific open flags to + * plan 9 open flag. 
+ * @flags: flags to convert + */ +int v9fs_open_to_dotl_flags(int flags) +{ + int rflags = 0; + + /* + * We have same bits for P9_DOTL_READONLY, P9_DOTL_WRONLY + * and P9_DOTL_NOACCESS + */ + rflags |= flags & O_ACCMODE; + rflags |= v9fs_mapped_dotl_flags(flags); + + return rflags; +} + /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created @@ -259,7 +311,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, "Failed to get acl values in creat %d\n", err); goto error; } - err = p9_client_create_dotl(ofid, name, flags, mode, gid, &qid); + err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), + mode, gid, &qid); if (err < 0) { P9_DPRINTK(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", -- cgit v1.1 From 8bdb14f9c33bb0ad8b37a9465c968ccce4ecda3b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 16 Aug 2011 22:19:28 +0530 Subject: fs/9p: Always ask new inode in lookup for cache mode disabled commit 73f507171cfa407b19f254aef95cbb058c8180cf upstream. This make sure we don't end up reusing the unlinked inode object. The ideal way is to use inode i_generation. But i_generation is not available in userspace always. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_inode.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 1ab561d..c72e20c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -799,6 +799,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nameidata) { + struct dentry *res; struct super_block *sb; struct v9fs_session_info *v9ses; struct p9_fid *dfid, *fid; @@ -830,22 +831,35 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(result); } - - inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + /* + * Make sure we don't use a wrong inode due to parallel + * unlink. For cached mode create calls request for new + * inode. But with cache disabled, lookup should do this. + */ + if (v9ses->cache) + inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb); + else + inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); if (IS_ERR(inode)) { result = PTR_ERR(inode); inode = NULL; goto error; } - result = v9fs_fid_add(dentry, fid); if (result < 0) goto error_iput; - inst_out: - d_add(dentry, inode); - return NULL; - + /* + * If we had a rename on the server and a parallel lookup + * for the new name, then make sure we instantiate with + * the new name. ie look up for a/b, while on server somebody + * moved b under k and client parallely did a lookup for + * k/b. + */ + res = d_materialise_unique(dentry, inode); + if (!IS_ERR(res)) + return res; + result = PTR_ERR(res); error_iput: iput(inode); error: -- cgit v1.1 From 101e357617f790d36b999dada08c437fd2431c3c Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Sun, 21 Aug 2011 00:21:18 +0530 Subject: fs/9p: Use protocol-defined value for lock/getlock 'type' field. commit 51b8b4fb32271d39fbdd760397406177b2b0fd36 upstream. 
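The reason for the mapping, with a sketch of the translation (the switch below mirrors the hunk that follows): F_RDLCK/F_WRLCK/F_UNLCK are OS-defined numeric values, while 9P2000.L fixes its own on-the-wire P9_LOCK_TYPE_* values, so the client must translate explicitly rather than pass the VFS constant through and hope the server's OS agrees.

/* Kernel-side sketch; F_* come from <linux/fcntl.h>, P9_LOCK_TYPE_* from
 * <net/9p/9p.h>. */
static int vfs_to_p9_lock_type(int fl_type)
{
	switch (fl_type) {
	case F_RDLCK:
		return P9_LOCK_TYPE_RDLCK;
	case F_WRLCK:
		return P9_LOCK_TYPE_WRLCK;
	default:
		return P9_LOCK_TYPE_UNLCK;
	}
}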
Signed-off-by: Jim Garlick Signed-off-by: Aneesh Kumar K.V Signed-off-by: Harsh Prateek Bora Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_file.c | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 56907e4..9d6e168 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -169,7 +169,18 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) /* convert posix lock to p9 tlock args */ memset(&flock, 0, sizeof(flock)); - flock.type = fl->fl_type; + /* map the lock type */ + switch (fl->fl_type) { + case F_RDLCK: + flock.type = P9_LOCK_TYPE_RDLCK; + break; + case F_WRLCK: + flock.type = P9_LOCK_TYPE_WRLCK; + break; + case F_UNLCK: + flock.type = P9_LOCK_TYPE_UNLCK; + break; + } flock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) flock.length = 0; @@ -245,7 +256,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) /* convert posix lock to p9 tgetlock args */ memset(&glock, 0, sizeof(glock)); - glock.type = fl->fl_type; + glock.type = P9_LOCK_TYPE_UNLCK; glock.start = fl->fl_start; if (fl->fl_end == OFFSET_MAX) glock.length = 0; @@ -257,17 +268,26 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, &glock); if (res < 0) return res; - if (glock.type != F_UNLCK) { - fl->fl_type = glock.type; + /* map 9p lock type to os lock type */ + switch (glock.type) { + case P9_LOCK_TYPE_RDLCK: + fl->fl_type = F_RDLCK; + break; + case P9_LOCK_TYPE_WRLCK: + fl->fl_type = F_WRLCK; + break; + case P9_LOCK_TYPE_UNLCK: + fl->fl_type = F_UNLCK; + break; + } + if (glock.type != P9_LOCK_TYPE_UNLCK) { fl->fl_start = glock.start; if (glock.length == 0) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = glock.proc_id; - } else - fl->fl_type = F_UNLCK; - + } return res; } -- cgit v1.1 From 97abc52eb26f2ae8d033100c4c024e6138bdfd60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Sep 2011 18:55:41 +0100 Subject: restore pinning the victim dentry in vfs_rmdir()/vfs_rename_dir() commit 1d2ef5901483004d74947bbf78d5146c24038fe7 upstream. We used to get the victim pinned by dentry_unhash() prior to commit 64252c75a219 ("vfs: remove dget() from dentry_unhash()") and ->rmdir() and ->rename() instances relied on that; most of them don't care, but ones that used d_delete() themselves do. As the result, we are getting rmdir() oopses on NFS now. Just grab the reference before locking the victim and drop it explicitly after unlocking, same as vfs_rename_other() does. 
Signed-off-by: Al Viro Tested-by: Simon Kirby Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/namei.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 14ab8d3..b456c7a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2582,6 +2582,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (!dir->i_op->rmdir) return -EPERM; + dget(dentry); mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; @@ -2602,6 +2603,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) out: mutex_unlock(&dentry->d_inode->i_mutex); + dput(dentry); if (!error) d_delete(dentry); return error; @@ -3005,6 +3007,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; + dget(new_dentry); if (target) mutex_lock(&target->i_mutex); @@ -3025,6 +3028,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, out: if (target) mutex_unlock(&target->i_mutex); + dput(new_dentry); if (!error) if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) d_move(old_dentry,new_dentry); -- cgit v1.1 From 862bee39ef540bd3a7772b899b4fa7f3036abf0d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 23 Aug 2011 07:21:28 -0400 Subject: cifs: fix possible memory corruption in CIFSFindNext commit 9438fabb73eb48055b58b89fc51e0bc4db22fabd upstream. The name_len variable in CIFSFindNext is a signed int that gets set to the resume_name_len in the cifs_search_info. The resume_name_len however is unsigned and for some infolevels is populated directly from a 32 bit value sent by the server. If the server sends a very large value for this, then that value could look negative when converted to a signed int. That would make that value pass the PATH_MAX check later in CIFSFindNext. The name_len would then be used as a length value for a memcpy. It would then be treated as unsigned again, and the memcpy scribbles over a ton of memory. Fix this by making the name_len an unsigned value in CIFSFindNext. Reported-by: Darren Lavender Signed-off-by: Jeff Layton Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifssmb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1a9fe7f..07132c4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4079,7 +4079,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon, T2_FNEXT_RSP_PARMS *parms; char *response_data; int rc = 0; - int bytes_returned, name_len; + int bytes_returned; + unsigned int name_len; __u16 params, byte_count; cFYI(1, "In FindNext"); -- cgit v1.1 From 562960c731b9ffba8cd200be8d38f5f793b23d74 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 29 Aug 2011 18:54:12 +0000 Subject: Fix the conflict between rwpidforward and rw mount options commit c9c7fa0064f4afe1d040e72f24c2256dd8ac402d upstream. Both these options are started with "rw" - that's why the first one isn't switched on even if it is specified. Fix this by adding a length check for "rw" option check. 
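A minimal userspace reduction of the bug, not part of the patch (strncasecmp() stands in for the kernel's strnicmp()): a two-character prefix comparison makes every option that merely begins with "rw" — "rwpidforward" included — look like the "rw" option, which is what the added strlen() check rules out:

#include <stdio.h>
#include <string.h>
#include <strings.h>

static int is_rw_old(const char *data)
{
	return strncasecmp(data, "rw", 2) == 0;			/* too permissive */
}

static int is_rw_new(const char *data)
{
	return strncasecmp(data, "rw", 2) == 0 && strlen(data) == 2;
}

int main(void)
{
	printf("old: rwpidforward -> %d\n", is_rw_old("rwpidforward"));	/* 1 */
	printf("new: rwpidforward -> %d\n", is_rw_new("rwpidforward"));	/* 0 */
	printf("new: rw           -> %d\n", is_rw_new("rw"));		/* 1 */
	return 0;
}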
Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e0ea721..2451627 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1258,7 +1258,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* ignore */ } else if (strnicmp(data, "guest", 5) == 0) { /* ignore */ - } else if (strnicmp(data, "rw", 2) == 0) { + } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) { /* ignore */ } else if (strnicmp(data, "ro", 2) == 0) { /* ignore */ @@ -1361,7 +1361,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->server_ino = 1; } else if (strnicmp(data, "noserverino", 9) == 0) { vol->server_ino = 0; - } else if (strnicmp(data, "rwpidforward", 4) == 0) { + } else if (strnicmp(data, "rwpidforward", 12) == 0) { vol->rwpidforward = 1; } else if (strnicmp(data, "cifsacl", 7) == 0) { vol->cifs_acl = 1; -- cgit v1.1 From a19dcc747653657eeea06b417d8e4593f8f44536 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 20 Sep 2011 15:19:38 -0700 Subject: make /proc/$pid/numa_maps gather_stats() take variable page size commit eb4866d0066ffd5446751c102d64feb3318d8bd1 upstream. We need to teach the numa_maps code about transparent huge pages. The first step is to teach gather_stats() that the pte it is dealing with might represent more than one page. Note that will we use this in a moment for transparent huge pages since they have use a single pmd_t which _acts_ as a "surrogate" for a bunch of smaller pte_t's. I'm a _bit_ unhappy that this interface counts in hugetlbfs page sizes for hugetlbfs pages and PAGE_SIZE for normal pages. That means that to figure out how many _bytes_ "dirty=1" means, you must first know the hugetlbfs page size. That's easier said than done especially if you don't have visibility in to the mount. But, that's probably a discussion for another day especially since it would change behavior to fix it. But, just in case anyone wonders why this patch only passes a '1' in the hugetlb case... 
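To make that interpretation point concrete — a hypothetical userspace helper, not something the patch or the kernel provides: converting a /proc/$pid/numa_maps counter such as dirty=N into bytes requires knowing which page size it was counted in, the hugetlbfs page size for hugetlbfs mappings and PAGE_SIZE for everything else.

/* Hypothetical helper: page_size is sysconf(_SC_PAGESIZE); hugepage_size
 * must be discovered separately, e.g. from the hugetlbfs mount in use. */
static unsigned long long numa_maps_count_to_bytes(unsigned long long count,
						   int is_hugetlb,
						   unsigned long long hugepage_size,
						   unsigned long long page_size)
{
	return count * (is_hugetlb ? hugepage_size : page_size);
}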
Signed-off-by: Dave Hansen Acked-by: Hugh Dickins Acked-by: David Rientjes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 25b6a88..61342a4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -877,30 +877,31 @@ struct numa_maps_private { struct numa_maps md; }; -static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, + unsigned long nr_pages) { int count = page_mapcount(page); - md->pages++; + md->pages += nr_pages; if (pte_dirty || PageDirty(page)) - md->dirty++; + md->dirty += nr_pages; if (PageSwapCache(page)) - md->swapcache++; + md->swapcache += nr_pages; if (PageActive(page) || PageUnevictable(page)) - md->active++; + md->active += nr_pages; if (PageWriteback(page)) - md->writeback++; + md->writeback += nr_pages; if (PageAnon(page)) - md->anon++; + md->anon += nr_pages; if (count > md->mapcount_max) md->mapcount_max = count; - md->node[page_to_nid(page)]++; + md->node[page_to_nid(page)] += nr_pages; } static int gather_pte_stats(pmd_t *pmd, unsigned long addr, @@ -931,7 +932,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (!node_isset(nid, node_states[N_HIGH_MEMORY])) continue; - gather_stats(page, md, pte_dirty(*pte)); + gather_stats(page, md, pte_dirty(*pte), 1); } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); @@ -952,7 +953,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, return 0; md = walk->private; - gather_stats(page, md, pte_dirty(*pte)); + gather_stats(page, md, pte_dirty(*pte), 1); return 0; } -- cgit v1.1 From b879781180172696903fac604ba44c3204e0c1e4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 20 Sep 2011 15:19:39 -0700 Subject: break out numa_maps gather_pte_stats() checks commit 3200a8aaab0c9ccdc0f59b0dac2d4a47029137fa upstream. gather_pte_stats() does a number of checks on a target page to see whether it should even be considered for statistics. This breaks that code out in to a separate function so that we can use it in the transparent hugepage case in the next patch. 
Signed-off-by: Dave Hansen Acked-by: Hugh Dickins Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 61342a4..9dca07e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -904,6 +904,29 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, md->node[page_to_nid(page)] += nr_pages; } +static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pte_present(pte)) + return NULL; + + page = vm_normal_page(vma, addr, pte); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + return NULL; + + return page; +} + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -915,23 +938,9 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { - struct page *page; - int nid; - - if (!pte_present(*pte)) - continue; - - page = vm_normal_page(md->vma, addr, *pte); + struct page *page = can_gather_numa_stats(*pte, md->vma, addr); if (!page) continue; - - if (PageReserved(page)) - continue; - - nid = page_to_nid(page); - if (!node_isset(nid, node_states[N_HIGH_MEMORY])) - continue; - gather_stats(page, md, pte_dirty(*pte), 1); } while (pte++, addr += PAGE_SIZE, addr != end); -- cgit v1.1 From e2d598ab82b589fc40c4656a10004e328a90b6dd Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 20 Sep 2011 15:19:41 -0700 Subject: teach /proc/$pid/numa_maps about transparent hugepages commit 32ef43848f283e0ef945d3c67e851c143fea3970 upstream. This is modeled after the smaps code. It detects transparent hugepages and then does a single gather_stats() for the page as a whole. This has two benifits: 1. It is more efficient since it does many pages in a single shot. 2. It does not have to break down the huge page. 
Signed-off-by: Dave Hansen Acked-by: Hugh Dickins Acked-by: David Rientjes Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9dca07e..5afaa58 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -936,6 +936,26 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *pte; md = walk->private; + spin_lock(&walk->mm->page_table_lock); + if (pmd_trans_huge(*pmd)) { + if (pmd_trans_splitting(*pmd)) { + spin_unlock(&walk->mm->page_table_lock); + wait_split_huge_page(md->vma->anon_vma, pmd); + } else { + pte_t huge_pte = *(pte_t *)pmd; + struct page *page; + + page = can_gather_numa_stats(huge_pte, md->vma, addr); + if (page) + gather_stats(page, md, pte_dirty(huge_pte), + HPAGE_PMD_SIZE/PAGE_SIZE); + spin_unlock(&walk->mm->page_table_lock); + return 0; + } + } else { + spin_unlock(&walk->mm->page_table_lock); + } + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, md->vma, addr); -- cgit v1.1 From ac693061b11c33d5a5c5ec1925de7abd3fcb0971 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Sun, 6 Jun 2010 10:38:15 -0600 Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync stage commit 6e6938b6d3130305a5960c86b1a9b21e58cf6144 upstream. sync(2) is performed in two stages: the WB_SYNC_NONE sync and the WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and do livelock prevention for it, too. Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance using page tagging") is a partial fix in that it only fixed the WB_SYNC_ALL phase livelock. Although ext4 is tested to no longer livelock with commit f446daaea9, it may due to some "redirty_tail() after pages_skipped" effect which is by no means a guarantee for _all_ the file systems. Note that writeback_inodes_sb() is called by not only sync(), they are treated the same because the other callers also need livelock prevention. Impact: It changes the order in which pages/inodes are synced to disk. Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode until finished with the current inode. 
Acked-by: Jan Kara CC: Dave Chinner Signed-off-by: Wu Fengguang Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 4 ++-- fs/fs-writeback.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b864839..c94774c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2756,7 +2756,7 @@ static int write_cache_pages_da(struct address_space *mapping, index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; @@ -2988,7 +2988,7 @@ static int ext4_da_writepages(struct address_space *mapping, } retry: - if (wbc->sync_mode == WB_SYNC_ALL) + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); while (!ret && wbc->nr_to_write > 0) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0f015a0..5ed2ce9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -36,6 +36,7 @@ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; @@ -650,6 +651,7 @@ static long wb_writeback(struct bdi_writeback *wb, { struct writeback_control wbc = { .sync_mode = work->sync_mode, + .tagged_writepages = work->tagged_writepages, .older_than_this = NULL, .for_kupdate = work->for_kupdate, .for_background = work->for_background, @@ -657,7 +659,7 @@ static long wb_writeback(struct bdi_writeback *wb, }; unsigned long oldest_jif; long wrote = 0; - long write_chunk; + long write_chunk = MAX_WRITEBACK_PAGES; struct inode *inode; if (wbc.for_kupdate) { @@ -683,9 +685,7 @@ static long wb_writeback(struct bdi_writeback *wb, * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ - if (wbc.sync_mode == WB_SYNC_NONE) - write_chunk = MAX_WRITEBACK_PAGES; - else + if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages) write_chunk = LONG_MAX; wbc.wb_start = jiffies; /* livelock avoidance */ @@ -1188,10 +1188,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .done = &done, - .nr_pages = nr, + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); -- cgit v1.1 From cfdf7986b6398049c35f8eb6e236d2387ee7ae14 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 27 Apr 2011 19:05:21 -0600 Subject: writeback: update dirtied_when for synced inode to prevent livelock commit 94c3dcbb0b0cdfd82cedd21705424d8044edc42c upstream. Explicitly update .dirtied_when on synced inodes, so that they are no longer considered for writeback in the next round. It can prevent both of the following livelock schemes: - while true; do echo data >> f; done - while true; do touch f; done (in theory) The exact livelock condition is, during sync(1): (1) no new inodes are dirtied (2) an inode being actively dirtied On (2), the inode will be tagged and synced with .nr_to_write=LONG_MAX. When finished, it will be redirty_tail()ed because it's still dirty and (.nr_to_write > 0). redirty_tail() won't update its ->dirtied_when on condition (1). 
The sync work will then revisit it on the next queue_io() and find it eligible again because its old ->dirtied_when predates the sync work start time. We'll do more aggressive "keep writeback as long as we wrote something" logic in wb_writeback(). The "use LONG_MAX .nr_to_write" trick in commit b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") will no longer be enough to stop sync livelock. Reviewed-by: Jan Kara Signed-off-by: Wu Fengguang Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5ed2ce9..fe190a8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -419,6 +419,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { + /* + * Sync livelock prevention. Each inode is tagged and synced in + * one shot. If still dirty, it will be redirty_tail()'ed below. + * Update the dirty time to prevent enqueue and sync it again. + */ + if ((inode->i_state & I_DIRTY) && + (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) + inode->dirtied_when = jiffies; + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() -- cgit v1.1 From 890ecd3d30e0ba8b1b676eaf6c925f65483f5f0d Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Sun, 18 Sep 2011 10:20:46 -0400 Subject: btrfs: fix d_off in the first dirent commit 3765fefaee2da83f10829fa64a74e6b7360350cb upstream. Since the d_off in the first dirent for "." (that originates from the 4th argument "offset" of filldir() for the 2nd dirent for "..") is wrongly assigned in btrfs_real_readdir(), telldir returns same offset for different locations. | # mkfs.btrfs /dev/sdb1 | # mount /dev/sdb1 fs0 | # cd fs0 | # touch file0 file1 | # ../test | telldir: 0 | readdir: d_off = 2, d_name = "." | telldir: 2 | readdir: d_off = 2, d_name = ".." | telldir: 2 | readdir: d_off = 3, d_name = "file0" | telldir: 3 | readdir: d_off = 2147483647, d_name = "file1" | telldir: 2147483647 To fix this problem, pass filp->f_pos (which is loff_t) instead. | # ../test | telldir: 0 | readdir: d_off = 1, d_name = "." | telldir: 1 | readdir: d_off = 2, d_name = ".." | telldir: 2 | readdir: d_off = 3, d_name = "file0" : At the moment the "offset" for "." is unused because there is no preceding dirent, however it is better to pass filp->f_pos to follow grammatical usage. Signed-off-by: Hidetoshi Seto Signed-off-by: Chris Mason Cc: Grazvydas Ignotas Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3601f0a..d42e6bf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4124,7 +4124,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, /* special case for "." 
*/ if (filp->f_pos == 0) { - over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR); + over = filldir(dirent, ".", 1, + filp->f_pos, btrfs_ino(inode), DT_DIR); if (over) return 0; filp->f_pos = 1; @@ -4133,7 +4134,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, if (filp->f_pos == 1) { u64 pino = parent_ino(filp->f_path.dentry); over = filldir(dirent, "..", 2, - 2, pino, DT_DIR); + filp->f_pos, pino, DT_DIR); if (over) return 0; filp->f_pos = 2; -- cgit v1.1 From 2b7eea63de50d738ae12a1bf84b76ef91c007a0e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 26 Jul 2011 16:08:41 -0700 Subject: exec: do not call request_module() twice from search_binary_handler() commit 912193521b719fbfc2f16776febf5232fe8ba261 upstream. Currently, search_binary_handler() tries to load binary loader module using request_module() if a loader for the requested program is not yet loaded. But second attempt of request_module() does not affect the result of search_binary_handler(). If request_module() triggered recursion, calling request_module() twice causes 2 to the power of MAX_KMOD_CONCURRENT (= 50) repetitions. It is not an infinite loop but is sufficient for users to consider as a hang up. Therefore, this patch changes not to call request_module() twice, making 1 to the power of MAX_KMOD_CONCURRENT repetitions in case of recursion. Signed-off-by: Tetsuo Handa Reported-by: Richard Weinberger Tested-by: Richard Weinberger Cc: Al Viro Signed-off-by: Andrew Morton Cc: Maxim Uvarov Signed-off-by: Linus Torvalds --- fs/exec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6075a1e..044c13f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1411,6 +1411,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) printable(bprm->buf[2]) && printable(bprm->buf[3])) break; /* -ENOEXEC */ + if (try) + break; /* -ENOEXEC */ request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); #endif } -- cgit v1.1 From 37b2b419970e9b380a07c15d60a26987ce09ad15 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 12 Sep 2011 09:38:03 +0200 Subject: fuse: fix memory leak commit 5dfcc87fd79dfb96ed155b524337dbd0da4f5993 upstream. kmemleak is reporting that 32 bytes are being leaked by FUSE: unreferenced object 0xe373b270 (size 32): comm "fusermount", pid 1207, jiffies 4294707026 (age 2675.187s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x27/0x50 [] kmem_cache_alloc+0xc5/0x180 [] fuse_alloc_forget+0x1e/0x20 [] fuse_alloc_inode+0xb0/0xd0 [] alloc_inode+0x1c/0x80 [] iget5_locked+0x8f/0x1a0 [] fuse_iget+0x72/0x1a0 [] fuse_get_root_inode+0x8a/0x90 [] fuse_fill_super+0x3ef/0x590 [] mount_nodev+0x3f/0x90 [] fuse_mount+0x15/0x20 [] mount_fs+0x1c/0xc0 [] vfs_kern_mount+0x41/0x90 [] do_kern_mount+0x39/0xd0 [] do_mount+0x2e5/0x660 [] sys_mount+0x66/0xa0 This leak report is consistent and happens once per boot on 3.1.0-rc5-dirty. This happens if a FORGET request is queued after the fuse device was released. 
Reported-by: Sitsofe Wheeler Signed-off-by: Miklos Szeredi Tested-by: Sitsofe Wheeler Signed-off-by: Linus Torvalds Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- fs/fuse/dev.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 168a80f..5cb8614 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -258,10 +258,14 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nlookup = nlookup; spin_lock(&fc->lock); - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + if (fc->connected) { + fc->forget_list_tail->next = forget; + fc->forget_list_tail = forget; + wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); + } else { + kfree(forget); + } spin_unlock(&fc->lock); } -- cgit v1.1 From c53c89aba3ebdfc3e9acdb18bb5ee9d2f8a328d0 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Mon, 18 Jul 2011 08:06:23 -0700 Subject: hfsplus: ensure bio requests are not smaller than the hardware sectors commit 6596528e391ad978a6a120142cba97a1d7324cb6 upstream. Currently all bio requests are 512 bytes, which may fail for media whose physical sector size is larger than this. Ensure these requests are not smaller than the block device logical block size. BugLink: http://bugs.launchpad.net/bugs/734883 Signed-off-by: Seth Forshee Signed-off-by: Christoph Hellwig Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/hfsplus_fs.h | 16 ++++++++-- fs/hfsplus/part_tbl.c | 32 ++++++++++--------- fs/hfsplus/super.c | 12 +++---- fs/hfsplus/wrapper.c | 83 +++++++++++++++++++++++++++++++++++++------------ 4 files changed, 101 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d685752..4e7f64b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "hfsplus_raw.h" #define DBG_BNODE_REFS 0x00000001 @@ -110,7 +111,9 @@ struct hfsplus_vh; struct hfs_btree; struct hfsplus_sb_info { + void *s_vhdr_buf; struct hfsplus_vh *s_vhdr; + void *s_backup_vhdr_buf; struct hfsplus_vh *s_backup_vhdr; struct hfs_btree *ext_tree; struct hfs_btree *cat_tree; @@ -258,6 +261,15 @@ struct hfsplus_readdir_data { struct hfsplus_cat_key key; }; +/* + * Find minimum acceptible I/O size for an hfsplus sb. 
+ */ +static inline unsigned short hfsplus_min_io_size(struct super_block *sb) +{ + return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev), + HFSPLUS_SECTOR_SIZE); +} + #define hfs_btree_open hfsplus_btree_open #define hfs_btree_close hfsplus_btree_close #define hfs_btree_write hfsplus_btree_write @@ -436,8 +448,8 @@ int hfsplus_compare_dentry(const struct dentry *parent, /* wrapper.c */ int hfsplus_read_wrapper(struct super_block *); int hfs_part_find(struct super_block *, sector_t *, sector_t *); -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw); +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw); /* time macros */ #define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U) diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c index 40ad88c..eb355d8 100644 --- a/fs/hfsplus/part_tbl.c +++ b/fs/hfsplus/part_tbl.c @@ -88,11 +88,12 @@ static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, return -ENOENT; } -static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, - sector_t *part_start, sector_t *part_size) +static int hfs_parse_new_pmap(struct super_block *sb, void *buf, + struct new_pmap *pm, sector_t *part_start, sector_t *part_size) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int size = be32_to_cpu(pm->pmMapBlkCnt); + int buf_size = hfsplus_min_io_size(sb); int res; int i = 0; @@ -107,11 +108,14 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, if (++i >= size) return -ENOENT; - res = hfsplus_submit_bio(sb->s_bdev, - *part_start + HFS_PMAP_BLK + i, - pm, READ); - if (res) - return res; + pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); + if ((u8 *)pm - (u8 *)buf >= buf_size) { + res = hfsplus_submit_bio(sb, + *part_start + HFS_PMAP_BLK + i, + buf, (void **)&pm, READ); + if (res) + return res; + } } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); return -ENOENT; @@ -124,15 +128,15 @@ static int hfs_parse_new_pmap(struct super_block *sb, struct new_pmap *pm, int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { - void *data; + void *buf, *data; int res; - data = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!data) + buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!buf) return -ENOMEM; - res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, - data, READ); + res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, + buf, &data, READ); if (res) goto out; @@ -141,13 +145,13 @@ int hfs_part_find(struct super_block *sb, res = hfs_parse_old_pmap(sb, data, part_start, part_size); break; case HFS_NEW_PMAP_MAGIC: - res = hfs_parse_new_pmap(sb, data, part_start, part_size); + res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); break; default: res = -ENOENT; break; } out: - kfree(data); + kfree(buf); return res; } diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 84a47b7..ab4857b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -197,17 +197,17 @@ int hfsplus_sync_fs(struct super_block *sb, int wait) write_backup = 1; } - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, WRITE_SYNC); + sbi->s_vhdr_buf, NULL, WRITE_SYNC); if (!error) error = error2; if (!write_backup) goto out; - error2 = hfsplus_submit_bio(sb->s_bdev, + error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, - sbi->s_backup_vhdr, WRITE_SYNC); + sbi->s_backup_vhdr_buf, NULL, 
WRITE_SYNC); if (!error) error2 = error; out: @@ -251,8 +251,8 @@ static void hfsplus_put_super(struct super_block *sb) hfs_btree_close(sbi->ext_tree); iput(sbi->alloc_file); iput(sbi->hidden_dir); - kfree(sbi->s_vhdr); - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); unload_nls(sbi->nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 4ac88ff..e3881a1 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -31,25 +31,67 @@ static void hfsplus_end_io_sync(struct bio *bio, int err) complete(bio->bi_private); } -int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, - void *data, int rw) +/* + * hfsplus_submit_bio - Perfrom block I/O + * @sb: super block of volume for I/O + * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes + * @buf: buffer for I/O + * @data: output pointer for location of requested data + * @rw: direction of I/O + * + * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than + * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads + * @data will return a pointer to the start of the requested sector, + * which may not be the same location as @buf. + * + * If @sector is not aligned to the bdev logical block size it will + * be rounded down. For writes this means that @buf should contain data + * that starts at the rounded-down address. As long as the data was + * read using hfsplus_submit_bio() and the same buffer is used things + * will work correctly. + */ +int hfsplus_submit_bio(struct super_block *sb, sector_t sector, + void *buf, void **data, int rw) { DECLARE_COMPLETION_ONSTACK(wait); struct bio *bio; int ret = 0; + unsigned int io_size; + loff_t start; + int offset; + + /* + * Align sector to hardware sector size and find offset. We + * assume that io_size is a power of two, which _should_ + * be true. + */ + io_size = hfsplus_min_io_size(sb); + start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + offset = start & (io_size - 1); + sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); bio = bio_alloc(GFP_NOIO, 1); bio->bi_sector = sector; - bio->bi_bdev = bdev; + bio->bi_bdev = sb->s_bdev; bio->bi_end_io = hfsplus_end_io_sync; bio->bi_private = &wait; - /* - * We always submit one sector at a time, so bio_add_page must not fail. - */ - if (bio_add_page(bio, virt_to_page(data), HFSPLUS_SECTOR_SIZE, - offset_in_page(data)) != HFSPLUS_SECTOR_SIZE) - BUG(); + if (!(rw & WRITE) && data) + *data = (u8 *)buf + offset; + + while (io_size > 0) { + unsigned int page_offset = offset_in_page(buf); + unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, + io_size); + + ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); + if (ret != len) { + ret = -EIO; + goto out; + } + io_size -= len; + buf = (u8 *)buf + len; + } submit_bio(rw, bio); wait_for_completion(&wait); @@ -57,8 +99,9 @@ int hfsplus_submit_bio(struct block_device *bdev, sector_t sector, if (!bio_flagged(bio, BIO_UPTODATE)) ret = -EIO; +out: bio_put(bio); - return ret; + return ret < 0 ? 
ret : 0; } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) @@ -147,17 +190,17 @@ int hfsplus_read_wrapper(struct super_block *sb) } error = -ENOMEM; - sbi->s_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_vhdr) + sbi->s_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_vhdr_buf) goto out; - sbi->s_backup_vhdr = kmalloc(HFSPLUS_SECTOR_SIZE, GFP_KERNEL); - if (!sbi->s_backup_vhdr) + sbi->s_backup_vhdr_buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); + if (!sbi->s_backup_vhdr_buf) goto out_free_vhdr; reread: - error = hfsplus_submit_bio(sb->s_bdev, - part_start + HFSPLUS_VOLHEAD_SECTOR, - sbi->s_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, + sbi->s_vhdr_buf, (void **)&sbi->s_vhdr, + READ); if (error) goto out_free_backup_vhdr; @@ -186,9 +229,9 @@ reread: goto reread; } - error = hfsplus_submit_bio(sb->s_bdev, - part_start + part_size - 2, - sbi->s_backup_vhdr, READ); + error = hfsplus_submit_bio(sb, part_start + part_size - 2, + sbi->s_backup_vhdr_buf, + (void **)&sbi->s_backup_vhdr, READ); if (error) goto out_free_backup_vhdr; -- cgit v1.1 From f24d5457bd706667951021530dd0590a1f3e9464 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sun, 21 Aug 2011 19:30:15 +0400 Subject: CIFS: Fix ERR_PTR dereference in cifs_get_root commit 5b980b01212199833ee8023770fa4cbf1b85e9f4 upstream. Move the negative dentry check to the beginning of the loop. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French Cc: Josh Boyer Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index fc7e57b..53e7d72 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -566,6 +566,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) struct inode *dir = dentry->d_inode; struct dentry *child; + if (!dir) { + dput(dentry); + dentry = ERR_PTR(-ENOENT); + break; + } + /* skip separators */ while (*s == sep) s++; @@ -581,10 +587,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) mutex_unlock(&dir->i_mutex); dput(dentry); dentry = child; - if (!dentry->d_inode) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - } } while (!IS_ERR(dentry)); _FreeXid(xid); kfree(full_path); -- cgit v1.1 From eedc6389bb43769e82f2779dabd45865d2b892e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:15 -0400 Subject: xfs: start periodic workers later commit 2bcf6e970f5a88fa05dced5eeb0326e13d93c4a1 upstream Start the periodic sync workers only after we have finished xfs_mountfs and thus fully set up the filesystem structures. Without this we can call into xfs_qm_sync before the quotainfo structure is set up if the mount takes unusually long, and probably hit other incomplete states as well. Also clean up the xfs_fs_fill_super error path by using consistent label names, and removing an impossible to reach case.
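The label cleanup mentioned above is the usual goto-unwind idiom: the error labels run in exact reverse order of acquisition, and each label names the next resource that still has to be released, so adding a setup step only adds one label. A generic, self-contained sketch of that style, with made-up resources rather than the XFS ones:

#include <stdio.h>
#include <stdlib.h>

static int setup_everything(void)
{
	char *bufa, *bufb;
	FILE *log;
	int error;

	bufa = malloc(64);
	if (!bufa) {
		error = -1;
		goto out;
	}

	bufb = malloc(64);
	if (!bufb) {
		error = -2;
		goto out_free_bufa;
	}

	log = fopen("/tmp/example.log", "w");
	if (!log) {
		error = -3;
		goto out_free_bufb;
	}

	/* Success. (A real setup function would hand these resources on to
	 * its caller; the sketch just releases them again.) */
	fclose(log);
	free(bufb);
	free(bufa);
	return 0;

	/* Unwind in reverse order of acquisition; each label names the first
	 * thing that still needs undoing at that point. */
out_free_bufb:
	free(bufb);
out_free_bufa:
	free(bufa);
out:
	return error;
}

int main(void)
{
	return setup_everything() ? EXIT_FAILURE : EXIT_SUCCESS;
}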
Signed-off-by: Christoph Hellwig Reported-by: Arkadiusz Miskiewicz Reviewed-by: Alex Elder Signed-off-by: Greg Kroah-Hartman --- fs/xfs/linux-2.6/xfs_super.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a1a881e..3ebb458 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1412,37 +1412,35 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); - error = xfs_syncd_init(mp); - if (error) - goto out_filestream_unmount; - xfs_inode_shrinker_register(mp); error = xfs_mountfs(mp); if (error) - goto out_syncd_stop; + goto out_filestream_unmount; + + error = xfs_syncd_init(mp); + if (error) + goto out_unmount; root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; - goto fail_unmount; + goto out_syncd_stop; } if (is_bad_inode(root)) { error = EINVAL; - goto fail_vnrele; + goto out_syncd_stop; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { error = ENOMEM; - goto fail_vnrele; + goto out_iput; } return 0; - out_syncd_stop: - xfs_inode_shrinker_unregister(mp); - xfs_syncd_stop(mp); out_filestream_unmount: + xfs_inode_shrinker_unregister(mp); xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); @@ -1456,17 +1454,12 @@ xfs_fs_fill_super( out: return -error; - fail_vnrele: - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } else { - iput(root); - } - - fail_unmount: - xfs_inode_shrinker_unregister(mp); + out_iput: + iput(root); + out_syncd_stop: xfs_syncd_stop(mp); + out_unmount: + xfs_inode_shrinker_unregister(mp); /* * Blow away any referenced inode in the filestreams cache. -- cgit v1.1 From f5d5ee3686ab928549370e4e584a2c32ef708a37 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 18 Oct 2011 10:23:16 -0400 Subject: xfs: use a cursor for bulk AIL insertion commit 1d8c95a363bf8cd4d4182dd19c01693b635311c2 upstream xfs: use a cursor for bulk AIL insertion Delayed logging can insert tens of thousands of log items into the AIL at the same LSN. When the committing of log commit records occur, we can get insertions occurring at an LSN that is not at the end of the AIL. If there are thousands of items in the AIL on the tail LSN, each insertion has to walk the AIL to find the correct place to insert the new item into the AIL. This can consume large amounts of CPU time and block other operations from occurring while the traversals are in progress. To avoid this repeated walk, use a AIL cursor to record where we should be inserting the new items into the AIL without having to repeat the walk. The cursor infrastructure already provides this functionality for push walks, so is a simple extension of existing code. While this will not avoid the initial walk, it will avoid repeating it tens of thousands of times during a single checkpoint commit. This version includes logic improvements from Christoph Hellwig. 
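Put differently, the cursor simply remembers where the previous insertion landed, so a burst of insertions at one LSN does not re-walk the whole sorted list each time. A much simplified userspace sketch of a cursor-assisted insert into an LSN-sorted list (names are made up; the real xfs_trans_ail.c version also has to handle locking and cursor invalidation):

#include <stdio.h>
#include <stdlib.h>

struct item {
	long lsn;
	struct item *prev, *next;
};

struct ail {
	struct item head;     /* circular sentinel; list sorted by ascending lsn */
	struct item *cursor;  /* most recently inserted item, or NULL */
};

static void ail_init(struct ail *a)
{
	a->head.prev = a->head.next = &a->head;
	a->cursor = NULL;
}

/* Insert after the last item whose lsn is <= the new item's lsn. */
static void ail_insert(struct ail *a, struct item *n)
{
	struct item *pos = a->cursor;

	/* Only trust the cursor for repeated inserts at the same lsn;
	 * otherwise walk backwards from the tail to find the spot. */
	if (!pos || pos->lsn != n->lsn)
		for (pos = a->head.prev;
		     pos != &a->head && pos->lsn > n->lsn;
		     pos = pos->prev)
			;

	n->next = pos->next;
	n->prev = pos;
	pos->next->prev = n;
	pos->next = n;
	a->cursor = n;          /* the next equal-lsn insert starts here */
}

int main(void)
{
	struct ail a;
	long lsns[] = { 10, 20, 20, 20, 30 };
	struct item *it;
	size_t i;

	ail_init(&a);
	for (i = 0; i < sizeof(lsns) / sizeof(lsns[0]); i++) {
		it = calloc(1, sizeof(*it));
		it->lsn = lsns[i];
		ail_insert(&a, it);
	}
	for (it = a.head.next; it != &a.head; it = it->next)
		printf("%ld\n", it->lsn);   /* prints 10 20 20 20 30 */
	return 0;
}

The first insert at a new LSN still pays for one walk from the tail; every further insert at that LSN is constant time, which is the saving the commit describes for checkpoints that insert thousands of items at the same LSN.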
Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans.c | 27 ++++++++++-- fs/xfs/xfs_trans_ail.c | 109 ++++++++++++++++++++++++++++++++++++++---------- fs/xfs/xfs_trans_priv.h | 10 +++-- 3 files changed, 118 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index c83f63b..efc147f 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1426,6 +1426,7 @@ xfs_trans_committed( static inline void xfs_log_item_batch_insert( struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t commit_lsn) @@ -1434,7 +1435,7 @@ xfs_log_item_batch_insert( spin_lock(&ailp->xa_lock); /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ - xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn); + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); for (i = 0; i < nr_items; i++) IOP_UNPIN(log_items[i], 0); @@ -1452,6 +1453,13 @@ xfs_log_item_batch_insert( * as an iclog write error even though we haven't started any IO yet. Hence in * this case all we need to do is IOP_COMMITTED processing, followed by an * IOP_UNPIN(aborted) call. + * + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. */ void xfs_trans_committed_bulk( @@ -1463,8 +1471,13 @@ xfs_trans_committed_bulk( #define LOG_ITEM_BATCH_SIZE 32 struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; int i = 0; + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + /* unpin all the log items */ for (lv = log_vector; lv; lv = lv->lv_next ) { struct xfs_log_item *lip = lv->lv_item; @@ -1493,7 +1506,9 @@ xfs_trans_committed_bulk( /* * Not a bulk update option due to unusual item_lsn. * Push into AIL immediately, rechecking the lsn once - * we have the ail lock. Then unpin the item. + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. */ spin_lock(&ailp->xa_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) @@ -1507,7 +1522,7 @@ xfs_trans_committed_bulk( /* Item is a candidate for bulk AIL insert. */ log_items[i++] = lv->lv_item; if (i >= LOG_ITEM_BATCH_SIZE) { - xfs_log_item_batch_insert(ailp, log_items, + xfs_log_item_batch_insert(ailp, &cur, log_items, LOG_ITEM_BATCH_SIZE, commit_lsn); i = 0; } @@ -1515,7 +1530,11 @@ xfs_trans_committed_bulk( /* make sure we insert the remainder! */ if (i) - xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn); + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(ailp, &cur); + spin_unlock(&ailp->xa_lock); } /* diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 5fc2380..9a69dc0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -272,9 +272,9 @@ xfs_trans_ail_cursor_clear( } /* - * Return the item in the AIL with the current lsn. - * Return the current tree generation number for use - * in calls to xfs_trans_next_ail(). 
+ * Initialise the cursor to the first item in the AIL with the given @lsn. + * This searches the list from lowest LSN to highest. Pass a @lsn of zero + * to initialise the cursor to the first item in the AIL. */ xfs_log_item_t * xfs_trans_ail_cursor_first( @@ -300,31 +300,97 @@ out: } /* - * splice the log item list into the AIL at the given LSN. + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. If there is no item with + * the value of @lsn, then it sets the cursor to the last item with an LSN lower + * than @lsn. + */ +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} + +/* + * Initialise the cursor to the last item in the AIL with the given @lsn. + * This searches the list from highest LSN to lowest. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. */ static void xfs_ail_splice( - struct xfs_ail *ailp, - struct list_head *list, - xfs_lsn_t lsn) + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) { - xfs_log_item_t *next_lip; + struct xfs_log_item *lip = cur ? cur->item : NULL; + struct xfs_log_item *next_lip; - /* If the list is empty, just insert the item. */ - if (list_empty(&ailp->xa_ail)) { - list_splice(list, &ailp->xa_ail); - return; + /* + * Get a new cursor if we don't have a placeholder or the existing one + * has been invalidated. + */ + if (!lip || (__psint_t)lip & 1) { + lip = __xfs_trans_ail_cursor_last(ailp, lsn); + + if (!lip) { + /* The list is empty, so just splice and return. */ + if (cur) + cur->item = NULL; + list_splice(list, &ailp->xa_ail); + return; + } } - list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) { - if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0) - break; + /* + * Our cursor points to the item we want to insert _after_, so we have + * to update the cursor to point to the end of the list we are splicing + * in so that it points to the correct location for the next splice. + * i.e. before the splice + * + * lsn -> lsn -> lsn + x -> lsn + x ... + * ^ + * | cursor points here + * + * After the splice we have: + * + * lsn -> lsn -> lsn -> lsn -> .... -> lsn -> lsn + x -> lsn + x ... + * ^ ^ + * | cursor points here | needs to move here + * + * So we set the cursor to the last item in the list to be spliced + * before we execute the splice, resulting in the cursor pointing to + * the correct item after the splice occurs. 
+ */ + if (cur) { + next_lip = list_entry(list->prev, struct xfs_log_item, li_ail); + cur->item = next_lip; } - - ASSERT(&next_lip->li_ail == &ailp->xa_ail || - XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0); - - list_splice_init(list, &next_lip->li_ail); + list_splice(list, &lip->li_ail); } /* @@ -645,6 +711,7 @@ xfs_trans_unlocked_item( void xfs_trans_ail_update_bulk( struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock) @@ -674,7 +741,7 @@ xfs_trans_ail_update_bulk( list_add(&lip->li_ail, &tmp); } - xfs_ail_splice(ailp, &tmp, lsn); + xfs_ail_splice(ailp, cur, &tmp, lsn); if (!mlip_changed) { spin_unlock(&ailp->xa_lock); diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 6b164e9..c0cb408 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -82,6 +82,7 @@ struct xfs_ail { extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t lsn) __releases(ailp->xa_lock); static inline void @@ -90,7 +91,7 @@ xfs_trans_ail_update( struct xfs_log_item *lip, xfs_lsn_t lsn) __releases(ailp->xa_lock) { - xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn); + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, @@ -111,10 +112,13 @@ xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); void xfs_trans_unlocked_item(struct xfs_ail *, xfs_log_item_t *); -struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, xfs_lsn_t lsn); -struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp, +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, struct xfs_ail_cursor *cur); -- cgit v1.1 From 838599d118dd286b831ba45ae380c5870ff82fe9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:17 -0400 Subject: xfs: do not update xa_last_pushed_lsn for locked items commit bc6e588a8971aa74c02e42db4d6e0248679f3738 upstream If an item was locked we should not update xa_last_pushed_lsn and thus skip it when restarting the AIL scan as we need to be able to lock and write it out as soon as possible. Otherwise heavy lock contention might starve AIL pushing too easily, especially given the larger backoff once we moved xa_last_pushed_lsn all the way to the target lsn. 
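The bookkeeping rule, in isolation: the saved resume point may only move past items that were actually handled, so a skipped (locked) item stays in range and is retried on the next scan. A toy sketch of that rule with invented item states (not the xfs_ail_worker code):

#include <stdio.h>

enum push_result { ITEM_SUCCESS, ITEM_LOCKED };

struct work_item {
	long lsn;
	enum push_result result;
};

/*
 * Scan items in LSN order and return the new resume point. A locked item is
 * counted as stuck and deliberately does not advance the resume point, so a
 * restarted scan will retry it instead of skipping ahead.
 */
static long scan(const struct work_item *items, int nr, long resume, int *stuck)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (items[i].lsn <= resume)
			continue;
		if (items[i].result == ITEM_SUCCESS)
			resume = items[i].lsn;
		else                    /* ITEM_LOCKED */
			(*stuck)++;     /* resume is left where it was */
	}
	return resume;
}

int main(void)
{
	const struct work_item items[] = {
		{ 1, ITEM_SUCCESS }, { 2, ITEM_SUCCESS }, { 3, ITEM_LOCKED },
	};
	int stuck = 0;
	long resume = scan(items, 3, 0, &stuck);

	/* prints "resume=2 stuck=1": the locked item at lsn 3 will be retried */
	printf("resume=%ld stuck=%d\n", resume, stuck);
	return 0;
}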
Signed-off-by: Christoph Hellwig Reported-by: Stefan Priebe Tested-by: Stefan Priebe Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_trans_ail.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9a69dc0..4b74b88 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -491,7 +491,6 @@ xfs_ail_worker( case XFS_ITEM_LOCKED: XFS_STATS_INC(xs_push_ail_locked); - ailp->xa_last_pushed_lsn = lsn; stuck++; break; -- cgit v1.1 From e7bde7c73957cfda82963127421069d13a44e921 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:18 -0400 Subject: xfs: force the log if we encounter pinned buffers in .iop_pushbuf commit 17b38471c3c07a49f0bbc2ecc2e92050c164e226 upstream We need to check for pinned buffers even in .iop_pushbuf given that inode items flush into the same buffers that may be pinned directly due operations on the unlinked inode list operating directly on buffers. To do this add a return value to .iop_pushbuf that tells the AIL push about this and use the existing log force mechanisms to unpin it. Signed-off-by: Christoph Hellwig Reported-by: Stefan Priebe Tested-by: Stefan Priebe Signed-off-by: Greg Kroah-Hartman --- fs/xfs/quota/xfs_dquot_item.c | 10 +++++++--- fs/xfs/xfs_buf_item.c | 3 ++- fs/xfs/xfs_inode_item.c | 10 +++++++--- fs/xfs/xfs_trans.h | 2 +- fs/xfs/xfs_trans_ail.c | 9 +++++++-- 5 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index 9e0e2fa..8126fc2 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait( * search the buffer cache can be a time consuming thing, and AIL lock is a * spinlock. */ -STATIC void +STATIC bool xfs_qm_dquot_logitem_pushbuf( struct xfs_log_item *lip) { struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); struct xfs_dquot *dqp = qlip->qli_dquot; struct xfs_buf *bp; + bool ret = true; ASSERT(XFS_DQ_IS_LOCKED(dqp)); @@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf( if (completion_done(&dqp->q_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_dqunlock(dqp); - return; + return true; } bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); xfs_dqunlock(dqp); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 7b7e005..a7342e8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -632,7 +632,7 @@ xfs_buf_item_push( * the xfsbufd to get this buffer written. We have to unlock the buffer * to allow the xfsbufd to write it, too. */ -STATIC void +STATIC bool xfs_buf_item_pushbuf( struct xfs_log_item *lip) { @@ -646,6 +646,7 @@ xfs_buf_item_pushbuf( xfs_buf_delwri_promote(bp); xfs_buf_relse(bp); + return true; } STATIC void diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b1e88d5..391044c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -713,13 +713,14 @@ xfs_inode_item_committed( * marked delayed write. If that's the case, we'll promote it and that will * allow the caller to write the buffer by triggering the xfsbufd to run. 
*/ -STATIC void +STATIC bool xfs_inode_item_pushbuf( struct xfs_log_item *lip) { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; struct xfs_buf *bp; + bool ret = true; ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); @@ -730,7 +731,7 @@ xfs_inode_item_pushbuf( if (completion_done(&ip->i_flush) || !(lip->li_flags & XFS_LI_IN_AIL)) { xfs_iunlock(ip, XFS_ILOCK_SHARED); - return; + return true; } bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, @@ -738,10 +739,13 @@ xfs_inode_item_pushbuf( xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!bp) - return; + return true; if (XFS_BUF_ISDELAYWRITE(bp)) xfs_buf_delwri_promote(bp); + if (XFS_BUF_ISPINNED(bp)) + ret = false; xfs_buf_relse(bp); + return ret; } /* diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 06a9759..53597f4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -350,7 +350,7 @@ typedef struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_push)(xfs_log_item_t *); - void (*iop_pushbuf)(xfs_log_item_t *); + bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); } xfs_item_ops_t; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 4b74b88..afc4aa0 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -478,8 +478,13 @@ xfs_ail_worker( case XFS_ITEM_PUSHBUF: XFS_STATS_INC(xs_push_ail_pushbuf); - IOP_PUSHBUF(lip); - ailp->xa_last_pushed_lsn = lsn; + + if (!IOP_PUSHBUF(lip)) { + stuck++; + flush_log = 1; + } else { + ailp->xa_last_pushed_lsn = lsn; + } push_xfsbufd = 1; break; -- cgit v1.1 From c7eead1e118fb7e34ee8f5063c3c090c054c3820 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Oct 2011 10:23:19 -0400 Subject: xfs: revert to using a kthread for AIL pushing commit 0030807c66f058230bcb20d2573bcaf28852e804 upstream Currently we have a few issues with the way the workqueue code is used to implement AIL pushing: - it accidentally uses the same workqueue as the syncer action, and thus can be prevented from running if there are enough sync actions active in the system. - it doesn't use the HIGHPRI flag to queue at the head of the queue of work items At this point I'm not confident enough in getting all the workqueue flags and tweaks right to provide a perfectly reliable execution context for AIL pushing, which is the most important piece in XFS to make forward progress when the log fills. Revert back to use a kthread per filesystem which fixes all the above issues at the cost of having a task struct and stack around for each mounted filesystem. In addition this also gives us much better ways to diagnose any issues involving hung AIL pushing and removes a small amount of code. 
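Stripped of the XFS specifics, the patch below is the standard kthread pattern: kthread_run() to start the thread, a loop that sleeps until there is work or kthread_should_stop() becomes true, and kthread_stop() at teardown. A minimal, generic module sketch of that pattern, with made-up names and an arbitrary sleep interval (it builds only as a kernel module and is not the xfsaild code):

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/err.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
	/* Sleep, wake, do a unit of work, until someone calls kthread_stop(). */
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(msecs_to_jiffies(50));
		/* ...do one pass of background work here... */
	}
	return 0;
}

static int __init worker_init(void)
{
	worker = kthread_run(worker_fn, NULL, "example_worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);
	return 0;
}

static void __exit worker_exit(void)
{
	kthread_stop(worker);   /* wakes the thread and waits for it to exit */
}

module_init(worker_init);
module_exit(worker_exit);
MODULE_LICENSE("GPL");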
Signed-off-by: Christoph Hellwig Reported-by: Stefan Priebe Tested-by: Stefan Priebe Signed-off-by: Greg Kroah-Hartman --- fs/xfs/linux-2.6/xfs_linux.h | 2 ++ fs/xfs/linux-2.6/xfs_super.c | 13 +------- fs/xfs/xfs_trans_ail.c | 73 +++++++++++++++++++++++++------------------- fs/xfs/xfs_trans_priv.h | 8 +---- 4 files changed, 45 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 8633521..8731516 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -70,6 +70,8 @@ #include #include #include +#include +#include #include #include diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 3ebb458..347cae9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1660,24 +1660,13 @@ xfs_init_workqueues(void) */ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); if (!xfs_syncd_wq) - goto out; - - xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); - if (!xfs_ail_wq) - goto out_destroy_syncd; - + return -ENOMEM; return 0; - -out_destroy_syncd: - destroy_workqueue(xfs_syncd_wq); -out: - return -ENOMEM; } STATIC void xfs_destroy_workqueues(void) { - destroy_workqueue(xfs_ail_wq); destroy_workqueue(xfs_syncd_wq); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index afc4aa0..a4c281b 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -28,8 +28,6 @@ #include "xfs_trans_priv.h" #include "xfs_error.h" -struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - #ifdef DEBUG /* * Check that the list is sorted as it should be. @@ -406,16 +404,10 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } -/* - * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself - * to run at a later time if there is more work to do to complete the push. - */ -STATIC void -xfs_ail_worker( - struct work_struct *work) +static long +xfsaild_push( + struct xfs_ail *ailp) { - struct xfs_ail *ailp = container_of(to_delayed_work(work), - struct xfs_ail, xa_work); xfs_mount_t *mp = ailp->xa_mount; struct xfs_ail_cursor *cur = &ailp->xa_cursors; xfs_log_item_t *lip; @@ -556,20 +548,6 @@ out_done: /* We're past our target or empty, so idle */ ailp->xa_last_pushed_lsn = 0; - /* - * We clear the XFS_AIL_PUSHING_BIT first before checking - * whether the target has changed. If the target has changed, - * this pushes the requeue race directly onto the result of the - * atomic test/set bit, so we are guaranteed that either the - * the pusher that changed the target or ourselves will requeue - * the work (but not both). - */ - clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags); - smp_rmb(); - if (XFS_LSN_CMP(ailp->xa_target, target) == 0 || - test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - return; - tout = 50; } else if (XFS_LSN_CMP(lsn, target) >= 0) { /* @@ -592,9 +570,30 @@ out_done: tout = 20; } - /* There is more to do, requeue us. */ - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, - msecs_to_jiffies(tout)); + return tout; +} + +static int +xfsaild( + void *data) +{ + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(tout ? 
+ msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); + + try_to_freeze(); + + tout = xfsaild_push(ailp); + } + + return 0; } /* @@ -629,8 +628,9 @@ xfs_ail_push( */ smp_wmb(); xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); - if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) - queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); + smp_wmb(); + + wake_up_process(ailp->xa_task); } /* @@ -865,9 +865,18 @@ xfs_trans_ail_init( ailp->xa_mount = mp; INIT_LIST_HEAD(&ailp->xa_ail); spin_lock_init(&ailp->xa_lock); - INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + mp->m_ail = ailp; return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; } void @@ -876,6 +885,6 @@ xfs_trans_ail_destroy( { struct xfs_ail *ailp = mp->m_ail; - cancel_delayed_work_sync(&ailp->xa_work); + kthread_stop(ailp->xa_task); kmem_free(ailp); } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index c0cb408..fe2e3cb 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -64,23 +64,17 @@ struct xfs_ail_cursor { */ struct xfs_ail { struct xfs_mount *xa_mount; + struct task_struct *xa_task; struct list_head xa_ail; xfs_lsn_t xa_target; struct xfs_ail_cursor xa_cursors; spinlock_t xa_lock; - struct delayed_work xa_work; xfs_lsn_t xa_last_pushed_lsn; - unsigned long xa_flags; }; -#define XFS_AIL_PUSHING_BIT 0 - /* * From xfs_trans_ail.c */ - -extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */ - void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, -- cgit v1.1 From d2a0110b7fb182c2c81144b834ddc7b3d645fe1e Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 15 Sep 2011 10:48:27 -0400 Subject: hfsplus: Fix kfree of wrong pointers in hfsplus_fill_super() error path commit f588c960fcaa6fa8bf82930bb819c9aca4eb9347 upstream. Commit 6596528e391a ("hfsplus: ensure bio requests are not smaller than the hardware sectors") changed the pointers used for volume header allocations but failed to free the correct pointers in the error path of hfsplus_fill_super() and hfsplus_read_wrapper. The second hunk came from a separate patch by Pavel Ivanov. Reported-by: Pavel Ivanov Signed-off-by: Seth Forshee Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/hfsplus/super.c | 4 ++-- fs/hfsplus/wrapper.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index ab4857b..c3a76fd 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -508,8 +508,8 @@ out_close_cat_tree: out_close_ext_tree: hfs_btree_close(sbi->ext_tree); out_free_vhdr: - kfree(sbi->s_vhdr); - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_vhdr_buf); + kfree(sbi->s_backup_vhdr_buf); out_unload_nls: unload_nls(sbi->nls); unload_nls(nls); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e3881a1..7b8112d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -275,9 +275,9 @@ reread: return 0; out_free_backup_vhdr: - kfree(sbi->s_backup_vhdr); + kfree(sbi->s_backup_vhdr_buf); out_free_vhdr: - kfree(sbi->s_vhdr); + kfree(sbi->s_vhdr_buf); out: return error; } -- cgit v1.1
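The rule this fix restores follows from the earlier 6596528e391a change: s_vhdr and s_backup_vhdr may now point into the middle of the larger s_vhdr_buf and s_backup_vhdr_buf allocations, so only the *_buf pointers are ever valid arguments to kfree(). A userspace sketch of that buffer-versus-data split, with made-up names and sizes (not the hfsplus code):

#include <stdio.h>
#include <stdlib.h>

#define SECTOR_SIZE   512u
#define MIN_IO_SIZE  4096u  /* pretend logical block size of the device */

struct volume {
	void *vhdr_buf;  /* what was allocated; the only pointer free() may see */
	void *vhdr;      /* where the 512-byte header starts inside vhdr_buf */
};

static int read_header(struct volume *vol, unsigned long long sector)
{
	unsigned long long start = sector * SECTOR_SIZE;
	size_t offset = (size_t)(start & (MIN_IO_SIZE - 1)); /* distance from the aligned I/O */

	vol->vhdr_buf = malloc(MIN_IO_SIZE);
	if (!vol->vhdr_buf)
		return -1;
	/* a real implementation would read MIN_IO_SIZE bytes from the aligned position here */
	vol->vhdr = (char *)vol->vhdr_buf + offset;
	return 0;
}

int main(void)
{
	struct volume vol;

	if (read_header(&vol, 2))    /* sector 2: header sits 1024 bytes into the 4 KiB buffer */
		return EXIT_FAILURE;
	printf("buf=%p hdr=%p\n", vol.vhdr_buf, vol.vhdr);
	free(vol.vhdr_buf);          /* freeing vol.vhdr instead would corrupt the heap */
	return 0;
}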