From 1788ea6e3b2a58cf4fb00206e362d9caff8d86a7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Nov 2011 13:31:21 -0400 Subject: nfs: when attempting to open a directory, fall back on normal lookup (try #5) commit d953126 changed how nfs_atomic_lookup handles an -EISDIR return from an OPEN call. Prior to that patch, that caused the client to fall back to doing a normal lookup. When that patch went in, the code began returning that error to userspace. The d_revalidate codepath however never had the corresponding change, so it was still possible to end up with a NULL ctx->state pointer after that. That patch caused a regression. When we attempt to open a directory that does not have a cached dentry, that open now errors out with EISDIR. If you attempt the same open with a cached dentry, it will succeed. Fix this by reverting the change in nfs_atomic_lookup and allowing attempts to open directories to fall back to a normal lookup Also, add a NFSv4-specific f_ops->open routine that just returns -ENOTDIR. This should never be called if things are working properly, but if it ever is, then the dprintk may help in debugging. To facilitate this, a new file_operations field is also added to the nfs_rpc_ops struct. Cc: stable@kernel.org Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/file.c | 32 ++++++++++++++++++++++++++++++++ fs/nfs/inode.c | 2 +- fs/nfs/nfs3proc.c | 1 + fs/nfs/nfs4proc.c | 1 + fs/nfs/proc.c | 1 + 6 files changed, 37 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b238d95..ac28990 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1468,12 +1468,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0a1f831..6d93e07 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -886,3 +886,35 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) file->f_path.dentry->d_name.name, arg); return -EINVAL; } + +#ifdef CONFIG_NFS_V4 +static int +nfs4_file_open(struct inode *inode, struct file *filp) +{ + /* + * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to + * this point, then something is very wrong + */ + dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp); + return -ENOTDIR; +} + +const struct file_operations nfs4_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs4_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; +#endif /* CONFIG_NFS_V4 */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index c07a55a..50a15fa 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -291,7 +291,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; + inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 85f1690..d4bc9ed9 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -853,6 +853,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs3_proc_get_root, .getattr = nfs3_proc_getattr, .setattr = nfs3_proc_setattr, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b60fddf..069cb80 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6253,6 +6253,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { .dentry_ops = &nfs4_dentry_operations, .dir_inode_ops = &nfs4_dir_inode_operations, .file_inode_ops = &nfs4_file_inode_operations, + .file_ops = &nfs4_file_operations, .getroot = nfs4_proc_get_root, .getattr = nfs4_proc_getattr, .setattr = nfs4_proc_setattr, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index ac40b85..f48125d 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -710,6 +710,7 @@ const struct nfs_rpc_ops nfs_v2_clientops = { .dentry_ops = &nfs_dentry_operations, .dir_inode_ops = &nfs_dir_inode_operations, .file_inode_ops = &nfs_file_inode_operations, + .file_ops = &nfs_file_operations, .getroot = nfs_proc_get_root, .getattr = nfs_proc_getattr, .setattr = nfs_proc_setattr, -- cgit v1.1 From 0486958f57a496212e3c1e3d9194deebba3dc3d4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 4 Nov 2011 13:31:22 -0400 Subject: nfs: move nfs_file_operations declaration to bottom of file.c (try #2) ...a remove a set of forward declarations. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 59 +++++++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6d93e07..eca56d4 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -40,48 +40,8 @@ #define NFSDBG_FACILITY NFSDBG_FILE -static int nfs_file_open(struct inode *, struct file *); -static int nfs_file_release(struct inode *, struct file *); -static loff_t nfs_file_llseek(struct file *file, loff_t offset, int origin); -static int nfs_file_mmap(struct file *, struct vm_area_struct *); -static ssize_t nfs_file_splice_read(struct file *filp, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, unsigned int flags); -static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, - struct file *filp, loff_t *ppos, - size_t count, unsigned int flags); -static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, - unsigned long nr_segs, loff_t pos); -static int nfs_file_flush(struct file *, fl_owner_t id); -static int nfs_file_fsync(struct file *, loff_t, loff_t, int datasync); -static int nfs_check_flags(int flags); -static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); -static int nfs_setlease(struct file *file, long arg, struct file_lock **fl); - static const struct vm_operations_struct nfs_file_vm_ops; -const struct file_operations nfs_file_operations = { - .llseek = nfs_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = nfs_file_read, - .aio_write = nfs_file_write, - .mmap = nfs_file_mmap, - .open = nfs_file_open, - .flush = nfs_file_flush, - .release = nfs_file_release, - .fsync = nfs_file_fsync, - .lock = nfs_lock, - .flock = nfs_flock, - .splice_read = nfs_file_splice_read, - .splice_write = nfs_file_splice_write, - .check_flags = nfs_check_flags, - .setlease = nfs_setlease, -}; - const struct inode_operations nfs_file_inode_operations = { .permission = nfs_permission, .getattr = nfs_getattr, @@ -887,6 +847,25 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) return -EINVAL; } +const struct file_operations nfs_file_operations = { + .llseek = nfs_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = nfs_file_read, + .aio_write = nfs_file_write, + .mmap = nfs_file_mmap, + .open = nfs_file_open, + .flush = nfs_file_flush, + .release = nfs_file_release, + .fsync = nfs_file_fsync, + .lock = nfs_lock, + .flock = nfs_flock, + .splice_read = nfs_file_splice_read, + .splice_write = nfs_file_splice_write, + .check_flags = nfs_check_flags, + .setlease = nfs_setlease, +}; + #ifdef CONFIG_NFS_V4 static int nfs4_file_open(struct inode *inode, struct file *filp) -- cgit v1.1 From 15a2015fbc692e1c97d7ce12d96e077f5ae7ea6d Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sat, 5 Nov 2011 22:06:31 -0700 Subject: ceph: fix iput race when queueing inode work If we queue a work item that calls iput(), make sure we ihold() before attempting to queue work. Otherwise our queued work might miraculously run before we notice the queue_work() succeeded and call ihold(), allowing the inode to be destroyed. That is, instead of if (queue_work(...)) ihold(); we need to do ihold(); if (!queue_work(...)) iput(); Reported-by: Amon Ott Signed-off-by: Sage Weil --- fs/ceph/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e392bfc..116f365 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1328,12 +1328,13 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) */ void ceph_queue_writeback(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->wb_wq, &ceph_inode(inode)->i_wb_work)) { dout("ceph_queue_writeback %p\n", inode); - ihold(inode); } else { dout("ceph_queue_writeback %p failed\n", inode); + iput(inode); } } @@ -1353,12 +1354,13 @@ static void ceph_writeback_work(struct work_struct *work) */ void ceph_queue_invalidate(struct inode *inode) { + ihold(inode); if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq, &ceph_inode(inode)->i_pg_inv_work)) { dout("ceph_queue_invalidate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_invalidate %p failed\n", inode); + iput(inode); } } @@ -1434,13 +1436,14 @@ void ceph_queue_vmtruncate(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + ihold(inode); if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq, &ci->i_vmtruncate_work)) { dout("ceph_queue_vmtruncate %p\n", inode); - ihold(inode); } else { dout("ceph_queue_vmtruncate %p failed, pending=%d\n", inode, ci->i_truncate_pending); + iput(inode); } } -- cgit v1.1 From eb513689c97e3e73bb9b4459d490a8e894b4a546 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 7 Nov 2011 10:47:42 -0500 Subject: ext4: ignore journalled data options on remount if fs has no journal This avoids a confusing failure in the init scripts when the /etc/fstab has data=writeback or data=journal but the file system does not have a journal. So check for this case explicitly, and warn the user that we are ignoring the (pointless, since they have no journal) data=* mount option. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9953d80..0435013 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1683,7 +1683,9 @@ static int parse_options(char *options, struct super_block *sb, data_opt = EXT4_MOUNT_WRITEBACK_DATA; datacheck: if (is_remount) { - if (test_opt(sb, DATA_FLAGS) != data_opt) { + if (!sbi->s_journal) + ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option"); + else if (test_opt(sb, DATA_FLAGS) != data_opt) { ext4_msg(sb, KERN_ERR, "Cannot change data mode on remount"); return 0; -- cgit v1.1 From 2397256d6218e7bf5147a3b01dcc6aec20fd3916 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Mon, 7 Nov 2011 10:50:09 -0500 Subject: ext4: Remove kernel_lock annotations The BKL is gone, these annotations are useless. Signed-off-by: Richard Weinberger Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0435013..3858767 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3101,8 +3101,6 @@ static void ext4_destroy_lazyinit_thread(void) } static int ext4_fill_super(struct super_block *sb, void *data, int silent) - __releases(kernel_lock) - __acquires(kernel_lock) { char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; -- cgit v1.1 From 3c1fcb2c24519febd7ca9cf292fa5bdf513b601f Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Mon, 7 Nov 2011 11:01:13 -0500 Subject: ext4: add blk_finish_plug in error case of writepages. blk_finish_plug is needed in error case of writepages. Signed-off-by: Namjae Jeon Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cc5a6da..18b262b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2270,6 +2270,7 @@ retry: ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); + blk_finish_plug(&plug); goto out_writepages; } -- cgit v1.1 From 45ea6095c8f0d6caad5658306416a5d254f1205e Mon Sep 17 00:00:00 2001 From: "slyich@gmail.com" Date: Mon, 7 Nov 2011 16:08:01 -0500 Subject: btrfs: fix double-free 'tree_root' in 'btrfs_mount()' On error path 'tree_root' is treed in 'free_fs_info()'. No need to free it explicitely. Noticed by SLUB in debug mode: Complete reproducer under usermode linux (discovered on real machine): bdev=/dev/ubda btr_root=/btr /mkfs.btrfs $bdev mount $bdev $btr_root mkdir $btr_root/subvols/ cd $btr_root/subvols/ /btrfs su cr foo /btrfs su cr bar mount $bdev -osubvol=subvols/foo $btr_root/subvols/bar umount $btr_root/subvols/bar which gives device fsid 4d55aa28-45b1-474b-b4ec-da912322195e devid 1 transid 7 /dev/ubda ============================================================================= BUG kmalloc-2048: Object already free ----------------------------------------------------------------------------- INFO: Allocated in btrfs_mount+0x389/0x7f0 age=0 cpu=0 pid=277 INFO: Freed in btrfs_mount+0x51c/0x7f0 age=0 cpu=0 pid=277 INFO: Slab 0x0000000062886200 objects=15 used=9 fp=0x0000000070b4d2d0 flags=0x4081 INFO: Object 0x0000000070b4d2d0 @offset=21200 fp=0x0000000070b4a968 ... Call Trace: 70b31948: [<6008c522>] print_trailer+0xe2/0x130 70b31978: [<6008c5aa>] object_err+0x3a/0x50 70b319a8: [<6008e242>] free_debug_processing+0x142/0x2a0 70b319e0: [<600ebf6f>] btrfs_mount+0x55f/0x7f0 70b319f8: [<6008e5c1>] __slab_free+0x221/0x2d0 Signed-off-by: Sergei Trofimovich Cc: Arne Jansen Cc: Chris Mason Cc: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 57080df..dcd5aef 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -933,8 +933,12 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + if (!fs_info) { + error = -ENOMEM; + goto error_close_devices; + } tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!fs_info || !tree_root) { + if (!tree_root) { error = -ENOMEM; goto error_close_devices; } @@ -964,7 +968,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); } else { char b[BDEVNAME_SIZE]; @@ -992,7 +995,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); free_fs_info(fs_info); - kfree(tree_root); return ERR_PTR(error); } -- cgit v1.1 From a6f498a891c730327645a7afa10c5ae977de6fd8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 7 Nov 2011 15:50:32 -0500 Subject: NFS: Fix a regression in the referral code Fix a regression that was introduced by commit 0c2e53f11a6dae9e3af5f50f5ad0382e7c3e0cfa (NFS: Remove the unused "lookupfh()" version of nfs4_proc_lookup()). In the case where the lookup gets an NFS4ERR_MOVED, we want to return the result of nfs4_get_referral(). Instead, that value is getting clobbered by the call to nfs4_handle_exception()... Reported-by: Tigran Mkrtchyan Tested-by: Tigran Mkrtchyan Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 069cb80..be2bbac 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2464,8 +2464,7 @@ static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qst case -NFS4ERR_BADNAME: return -ENOENT; case -NFS4ERR_MOVED: - err = nfs4_get_referral(dir, name, fattr, fhandle); - break; + return nfs4_get_referral(dir, name, fattr, fhandle); case -NFS4ERR_WRONGSEC: nfs_fixup_secinfo_attributes(fattr, fhandle); } -- cgit v1.1 From b52a360b2aa1c59ba9970fb0f52bbb093fcc7a24 Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Mon, 7 Nov 2011 16:10:24 +0000 Subject: xfs: Fix possible memory corruption in xfs_readlink Fixes a possible memory corruption when the link is larger than MAXPATHLEN and XFS_DEBUG is not enabled. This also remove the S_ISLNK assert, since the inode mode is checked previously in xfs_readlink_by_handle() and via VFS. Updated to address concerns raised by Ben Hutchings about the loose attention paid to 32- vs 64-bit values, and the lack of handling a potentially negative pathlen value: - Changed type of "pathlen" to be xfs_fsize_t, to match that of ip->i_d.di_size - Added checking for a negative pathlen to the too-long pathlen test, and generalized the message that gets reported in that case to reflect the change As a result, if a negative pathlen were encountered, this function would return EFSCORRUPTED (and would fail an assertion for a debug build)--just as would a too-long pathlen. Signed-off-by: Alex Elder Signed-off-by: Carlos Maiolino Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 4ecf2a5..ce9268a 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -112,7 +112,7 @@ xfs_readlink( char *link) { xfs_mount_t *mp = ip->i_mount; - int pathlen; + xfs_fsize_t pathlen; int error = 0; trace_xfs_readlink(ip); @@ -122,13 +122,19 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT(S_ISLNK(ip->i_d.di_mode)); - ASSERT(ip->i_d.di_size <= MAXPATHLEN); - pathlen = ip->i_d.di_size; if (!pathlen) goto out; + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { memcpy(link, ip->i_df.if_u1.if_data, pathlen); link[pathlen] = '\0'; -- cgit v1.1 From 272e42b215c52d32e06bf035c1f6b70baa6716bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 28 Oct 2011 09:54:24 +0000 Subject: xfs: constify xfs_item_ops The log item ops aren't nessecarily the biggest exploit vector, but marking them const is easy enough. Also remove the unused xfs_item_ops_t typedef while we're at it. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Reviewed-by: Alex Elder --- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_dquot_item.c | 6 +++--- fs/xfs/xfs_extfree_item.c | 4 ++-- fs/xfs/xfs_inode_item.c | 2 +- fs/xfs/xfs_log.c | 2 +- fs/xfs/xfs_log.h | 2 +- fs/xfs/xfs_trans.h | 6 +++--- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 1a35138..eac97ef 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -656,7 +656,7 @@ xfs_buf_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_buf_item_ops = { +static const struct xfs_item_ops xfs_buf_item_ops = { .iop_size = xfs_buf_item_size, .iop_format = xfs_buf_item_format, .iop_pin = xfs_buf_item_pin, diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index bb3f71d..0dee0b7 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -295,7 +295,7 @@ xfs_qm_dquot_logitem_committing( /* * This is the ops vector for dquots */ -static struct xfs_item_ops xfs_dquot_item_ops = { +static const struct xfs_item_ops xfs_dquot_item_ops = { .iop_size = xfs_qm_dquot_logitem_size, .iop_format = xfs_qm_dquot_logitem_format, .iop_pin = xfs_qm_dquot_logitem_pin, @@ -483,7 +483,7 @@ xfs_qm_qoff_logitem_committing( { } -static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, @@ -498,7 +498,7 @@ static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { /* * This is the ops vector shared by all quotaoff-start log items. */ -static struct xfs_item_ops xfs_qm_qoff_logitem_ops = { +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { .iop_size = xfs_qm_qoff_logitem_size, .iop_format = xfs_qm_qoff_logitem_format, .iop_pin = xfs_qm_qoff_logitem_pin, diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d22e626..35c2aff 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -217,7 +217,7 @@ xfs_efi_item_committing( /* * This is the ops vector shared by all efi log items. */ -static struct xfs_item_ops xfs_efi_item_ops = { +static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, .iop_pin = xfs_efi_item_pin, @@ -477,7 +477,7 @@ xfs_efd_item_committing( /* * This is the ops vector shared by all efd log items. */ -static struct xfs_item_ops xfs_efd_item_ops = { +static const struct xfs_item_ops xfs_efd_item_ops = { .iop_size = xfs_efd_item_size, .iop_format = xfs_efd_item_format, .iop_pin = xfs_efd_item_pin, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index b7cf21b..abaafdb 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -795,7 +795,7 @@ xfs_inode_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_inode_item_ops = { +static const struct xfs_item_ops xfs_inode_item_ops = { .iop_size = xfs_inode_item_size, .iop_format = xfs_inode_item_format, .iop_pin = xfs_inode_item_pin, diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2758a62..a14cd89 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -626,7 +626,7 @@ xfs_log_item_init( struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops) + const struct xfs_item_ops *ops) { item->li_mountp = mp; item->li_ailp = mp->m_ail; diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 78c9039..3f7bf45 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -137,7 +137,7 @@ struct xfs_trans; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, int type, - struct xfs_item_ops *ops); + const struct xfs_item_ops *ops); xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 603f3eb..3ae713c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -326,7 +326,7 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ /* delayed logging */ struct list_head li_cil; /* CIL pointers */ @@ -341,7 +341,7 @@ typedef struct xfs_log_item { { XFS_LI_IN_AIL, "IN_AIL" }, \ { XFS_LI_ABORTED, "ABORTED" } -typedef struct xfs_item_ops { +struct xfs_item_ops { uint (*iop_size)(xfs_log_item_t *); void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); void (*iop_pin)(xfs_log_item_t *); @@ -352,7 +352,7 @@ typedef struct xfs_item_ops { void (*iop_push)(xfs_log_item_t *); bool (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; +}; #define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) #define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -- cgit v1.1 From 810627d9a6d0e8820c798001875bc4e1b7754ebf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Nov 2011 08:56:15 +0000 Subject: xfs: fix force shutdown handling in xfs_end_io Ensure ioend->io_error gets propagated back to e.g. AIO completions. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 33b1331..574d4ee 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -189,7 +189,7 @@ xfs_end_io( int error = 0; if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - error = -EIO; + ioend->io_error = -EIO; goto done; } if (ioend->io_error) -- cgit v1.1 From 917c16b2b69fc2eeb432eabca73258f08c58361e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 8 Nov 2011 14:49:59 -0500 Subject: Btrfs: fix oops on NULL trans handle in btrfs_truncate If we fail to reserve space in the transaction during truncate, we can error out with a NULL trans handle. The cleanup code needs an extra check to make sure we aren't trying to use the bad handle. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9d0eaa5..f60e249 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6529,14 +6529,16 @@ end_trans: ret = btrfs_orphan_del(NULL, inode); } - trans->block_rsv = &root->fs_info->trans_block_rsv; - ret = btrfs_update_inode(trans, root, inode); - if (ret && !err) - err = ret; + if (trans) { + trans->block_rsv = &root->fs_info->trans_block_rsv; + ret = btrfs_update_inode(trans, root, inode); + if (ret && !err) + err = ret; - nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_btree_balance_dirty(root, nr); + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + btrfs_btree_balance_dirty(root, nr); + } out: btrfs_free_block_rsv(root, rsv); -- cgit v1.1 From 7fd2ae21a42d178982679b86086661292b4afe4a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 8 Nov 2011 15:47:34 -0500 Subject: Btrfs: fix our reservations for updating an inode when completing io People have been reporting ENOSPC crashes in finish_ordered_io. This is because we try to steal from the delalloc block rsv to satisfy a reservation to update the inode. The problem with this is we don't explicitly save space for updating the inode when doing delalloc. This is kind of a problem and we've gotten away with this because way back when we just stole from the delalloc reserve without any questions, and this worked out fine because generally speaking the leaf had been modified either by the mtime update when we did the original write or because we just updated the leaf when we inserted the file extent item, only on rare occasions had the leaf not actually been modified, and that was still ok because we'd just use a block or two out of the over-reservation that is delalloc. Then came the delayed inode stuff. This is amazing, except it wants a full reservation for updating the inode since it may do it at some point down the road after we've written the blocks and we have to recow everything again. This worked out because the delayed inode stuff just stole from the global reserve, that is until recently when I changed that because it caused other problems. So here we are, we're doing everything right and being screwed for it. So take an extra reservation for the inode at delalloc reservation time and carry it through the life of the delalloc reservation. If we need it we can steal it in the delayed inode stuff. If we have already stolen it try and do a normal metadata reservation. If that fails try to steal from the delalloc reservation. If _that_ fails we'll get a WARN_ON() so I can start thinking of a better way to solve this and in the meantime we'll steal from the global reserve. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't see any problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 4 +-- fs/btrfs/delayed-inode.c | 65 +++++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-tree.c | 22 +++++++++++++--- fs/btrfs/inode.c | 1 + 4 files changed, 83 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5a5d325..634608d2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -147,14 +147,12 @@ struct btrfs_inode { * the btrfs file release call will add this inode to the * ordered operations list so that we make sure to flush out any * new data the application may have written before commit. - * - * yes, its silly to have a single bitflag, but we might grow more - * of these. */ unsigned ordered_data_close:1; unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; unsigned in_defrag:1; + unsigned delalloc_meta_reserved:1; /* * always compress this one file diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bbe8496..313ee14 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -617,12 +617,14 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct inode *inode, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; int ret; + int release = false; src_rsv = trans->block_rsv; dst_rsv = &root->fs_info->delayed_block_rsv; @@ -652,11 +654,67 @@ static int btrfs_delayed_inode_reserve_metadata( if (!ret) node->bytes_reserved = num_bytes; return ret; + } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { + spin_lock(&BTRFS_I(inode)->lock); + if (BTRFS_I(inode)->delalloc_meta_reserved) { + BTRFS_I(inode)->delalloc_meta_reserved = 0; + spin_unlock(&BTRFS_I(inode)->lock); + release = true; + goto migrate; + } + spin_unlock(&BTRFS_I(inode)->lock); + + /* Ok we didn't have space pre-reserved. This shouldn't happen + * too often but it can happen if we do delalloc to an existing + * inode which gets dirtied because of the time update, and then + * isn't touched again until after the transaction commits and + * then we try to write out the data. First try to be nice and + * reserve something strictly for us. If not be a pain and try + * to steal from the delalloc block rsv. + */ + ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes); + if (!ret) + goto out; + + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); + if (!ret) + goto out; + + /* + * Ok this is a problem, let's just steal from the global rsv + * since this really shouldn't happen that often. + */ + WARN_ON(1); + ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv, + dst_rsv, num_bytes); + goto out; } +migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (!ret) - node->bytes_reserved = num_bytes; + if (unlikely(ret)) { + /* This shouldn't happen */ + BUG_ON(release); + return ret; + } + +out: + /* + * Migrate only takes a reservation, it doesn't touch the size of the + * block_rsv. This is to simplify people who don't normally have things + * migrated from their block rsv. If they go to release their + * reservation, that will decrease the size as well, so if migrate + * reduced size we'd end up with a negative size. But for the + * delalloc_meta_reserved stuff we will only know to drop 1 reservation, + * but we could in fact do this reserve/migrate dance several times + * between the time we did the original reservation and we'd clean it + * up. So to take care of this, release the space for the meta + * reservation here. I think it may be time for a documentation page on + * how block rsvs. work. + */ + if (release) + btrfs_block_rsv_release(root, src_rsv, num_bytes); + node->bytes_reserved = num_bytes; return ret; } @@ -1708,7 +1766,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, + delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 18ea90c..0b044e5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4063,23 +4063,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, */ static unsigned drop_outstanding_extent(struct inode *inode) { + unsigned drop_inode_space = 0; unsigned dropped_extents = 0; BUG_ON(!BTRFS_I(inode)->outstanding_extents); BTRFS_I(inode)->outstanding_extents--; + if (BTRFS_I(inode)->outstanding_extents == 0 && + BTRFS_I(inode)->delalloc_meta_reserved) { + drop_inode_space = 1; + BTRFS_I(inode)->delalloc_meta_reserved = 0; + } + /* * If we have more or the same amount of outsanding extents than we have * reserved then we need to leave the reserved extents count alone. */ if (BTRFS_I(inode)->outstanding_extents >= BTRFS_I(inode)->reserved_extents) - return 0; + return drop_inode_space; dropped_extents = BTRFS_I(inode)->reserved_extents - BTRFS_I(inode)->outstanding_extents; BTRFS_I(inode)->reserved_extents -= dropped_extents; - return dropped_extents; + return dropped_extents + drop_inode_space; } /** @@ -4165,9 +4172,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) nr_extents = BTRFS_I(inode)->outstanding_extents - BTRFS_I(inode)->reserved_extents; BTRFS_I(inode)->reserved_extents += nr_extents; + } - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); + /* + * Add an item to reserve for updating the inode when we complete the + * delalloc io. + */ + if (!BTRFS_I(inode)->delalloc_meta_reserved) { + nr_extents++; + BTRFS_I(inode)->delalloc_meta_reserved = 1; } + + to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); spin_unlock(&BTRFS_I(inode)->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f60e249..2b92059 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6607,6 +6607,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->orphan_meta_reserved = 0; ei->dummy_inode = 0; ei->in_defrag = 0; + ei->delalloc_meta_reserved = 0; ei->force_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; -- cgit v1.1 From a90e8b6fb80db43b029e1e76205452afa8bdc77a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2011 16:47:55 +0200 Subject: Btrfs: fix memory leak in btrfs_parse_early_options() Don't leak subvol_name string in case multiple subvol= options are given. "The lastest option is effective" behavior (consistent with subvolid= and subvolrootid= options) is preserved. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dcd5aef..6befcaf 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -448,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, token = match_token(p, tokens, args); switch (token) { case Opt_subvol: + kfree(*subvol_name); *subvol_name = match_strdup(&args[0]); break; case Opt_subvolid: -- cgit v1.1 From f23c8af8ca2789eeb0ab9ea90c214f9694d96cc5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 8 Nov 2011 19:15:05 +0200 Subject: Btrfs: fix subvol_name leak on error in btrfs_mount() btrfs_parse_early_options() can fail due to error while scanning devices (-o device= option), but still strdup() subvol_name string: mount -o subvol=SUBV,device=BAD_DEVICE So free subvol_name string on error. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6befcaf..58e9492 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -905,8 +905,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_parse_early_options(data, mode, fs_type, &subvol_name, &subvol_objectid, &subvol_rootid, &fs_devices); - if (error) + if (error) { + kfree(subvol_name); return ERR_PTR(error); + } if (subvol_name) { root = mount_subvol(subvol_name, flags, device_name, data); -- cgit v1.1 From 4d34b2789538befa45a68a191dc12e0886a69f7d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 00:08:15 +0200 Subject: Btrfs: avoid null dereference and leaks when bailing from open_ctree() Fix bugs introduced by 6c41761f. Firstly, after failing to allocate any of the tree roots (first 'goto fail' in open_ctree()) we would dereference a NULL fs_info pointer in free_fs_info(). Secondly, after failures from init_srcu_struct(), setup_bdi() and new_inode() we would leak all earlier allocated roots: fs_info fields haven't been initialized yet so free_fs_info() is rendered useless. Fix this by initializing fs_info pointer and fs_info fields before any allocations happen. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e53a5bb..91db90b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1890,31 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb, u64 features; struct btrfs_key location; struct buffer_head *bh; - struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_super_block *disk_super; struct btrfs_root *tree_root = btrfs_sb(sb); - struct btrfs_fs_info *fs_info = NULL; - struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_root *extent_root; + struct btrfs_root *csum_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; struct btrfs_root *log_tree_root; - int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; - struct btrfs_super_block *disk_super; + extent_root = fs_info->extent_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + csum_root = fs_info->csum_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + chunk_root = fs_info->chunk_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + dev_root = fs_info->dev_root = + kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!extent_root || !tree_root || !tree_root->fs_info || - !chunk_root || !dev_root || !csum_root) { + if (!extent_root || !csum_root || !chunk_root || !dev_root) { err = -ENOMEM; goto fail; } - fs_info = tree_root->fs_info; ret = init_srcu_struct(&fs_info->subvol_srcu); if (ret) { @@ -1954,12 +1955,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->reloc_mutex); init_completion(&fs_info->kobj_unregister); - fs_info->tree_root = tree_root; - fs_info->extent_root = extent_root; - fs_info->csum_root = csum_root; - fs_info->chunk_root = chunk_root; - fs_info->dev_root = dev_root; - fs_info->fs_devices = fs_devices; INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); -- cgit v1.1 From 586e46e2813c589d26258a599580421fb6fb576b Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 13:26:37 +0200 Subject: Btrfs: close devices on all error paths in open_ctree() Fix a bug introduced by 7e662854 where we would leave devices busy on certain error paths in open_ctree(). fs_info is guaranteed to be non-NULL now so it's safe to dereference it on all error paths. Signed-off-by: Ilya Dryomov --- fs/btrfs/disk-io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91db90b..b6a5c0d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2460,21 +2460,20 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->caching_workers); fail_alloc: fail_iput: + btrfs_mapping_tree_free(&fs_info->mapping_tree); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); - - btrfs_close_devices(fs_info->fs_devices); - btrfs_mapping_tree_free(&fs_info->mapping_tree); fail_bdi: bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: + btrfs_close_devices(fs_info->fs_devices); free_fs_info(fs_info); return ERR_PTR(err); recovery_tree_root: - if (!btrfs_test_opt(tree_root, RECOVERY)) goto fail_tree_roots; -- cgit v1.1 From 04d21a244fdf79d0ac892eaaa9a46b682467277c Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 9 Nov 2011 14:41:22 +0200 Subject: Btrfs: rework error handling in btrfs_mount() Commits 6c41761f and 45ea6095 introduced the possibility of NULL pointer dereference on error paths, also we would leave all devices busy and leak fs_info with all sub-structures on error when trying to mount an already mounted fs to a different directory. Fix this by doing all allocations before trying to open any of the devices, adjust error path for mount-already-mounted-fs case. Signed-off-by: Ilya Dryomov --- fs/btrfs/super.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 58e9492..629281c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -891,7 +891,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_root *tree_root = NULL; struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; @@ -920,15 +919,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) return ERR_PTR(error); - error = btrfs_open_devices(fs_devices, mode, fs_type); - if (error) - return ERR_PTR(error); - - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - /* * Setup a dummy root and fs_info for test/set super. This is because * we don't actually fill this stuff out until open_ctree, but we need @@ -936,28 +926,36 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) { - error = -ENOMEM; - goto error_close_devices; - } - tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - if (!tree_root) { + if (!fs_info) + return ERR_PTR(-ENOMEM); + + fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info->tree_root) { error = -ENOMEM; - goto error_close_devices; + goto error_fs_info; } - fs_info->tree_root = tree_root; + fs_info->tree_root->fs_info = fs_info; fs_info->fs_devices = fs_devices; - tree_root->fs_info = fs_info; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, + fs_info->tree_root); if (IS_ERR(s)) { error = PTR_ERR(s); goto error_close_devices; @@ -966,7 +964,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + error = -EBUSY; + goto error_close_devices; } btrfs_close_devices(fs_devices); @@ -997,6 +996,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error_close_devices: btrfs_close_devices(fs_devices); +error_fs_info: free_fs_info(fs_info); return ERR_PTR(error); } -- cgit v1.1 From 5e442a493fc59fa536c76db1fff5b49ca36a88c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 9 Nov 2011 18:16:00 -0500 Subject: Revert "proc: fix races against execve() of /proc/PID/fd**" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit aa6afca5bcaba8101f3ea09d5c3e4100b2b9f0e5. It escalates of some of the google-chrome SELinux problems with ptrace ("Check failed: pid_ > 0. Did not find zygote process"), and Andrew says that it is also causing mystery lockdep reports. Reported-by: Alex VillacĂ­s Lasso Requested-by: James Morris Requested-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 146 +++++++++++++++++---------------------------------------- 1 file changed, 43 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 2db1bd3..851ba3d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1652,46 +1652,12 @@ out: return error; } -static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - int rc; - - if (task == NULL) - return -ESRCH; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - generic_fillattr(inode, stat); - unlock_trace(task); - rc = 0; -out_task: - put_task_struct(task); - return rc; -} - static const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, .follow_link = proc_pid_follow_link, .setattr = proc_setattr, }; -static const struct inode_operations proc_fdinfo_link_inode_operations = { - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - -static const struct inode_operations proc_fd_link_inode_operations = { - .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, - .setattr = proc_setattr, - .getattr = proc_pid_fd_link_getattr, -}; - /* building an inode */ @@ -1923,61 +1889,49 @@ out: static int proc_fd_info(struct inode *inode, struct path *path, char *info) { - struct task_struct *task; - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; int fd = proc_fd(inode); - int rc; - - task = get_proc_task(inode); - if (!task) - return -ENOENT; - - rc = -EACCES; - if (lock_trace(task)) - goto out_task; - - rc = -ENOENT; - files = get_files_struct(task); - if (files == NULL) - goto out_unlock; - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - unsigned int f_flags; - struct fdtable *fdt; - - fdt = files_fdtable(files); - f_flags = file->f_flags & ~O_CLOEXEC; - if (FD_ISSET(fd, fdt->close_on_exec)) - f_flags |= O_CLOEXEC; - - if (path) { - *path = file->f_path; - path_get(&file->f_path); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + unsigned int f_flags; + struct fdtable *fdt; + + fdt = files_fdtable(files); + f_flags = file->f_flags & ~O_CLOEXEC; + if (FD_ISSET(fd, fdt->close_on_exec)) + f_flags |= O_CLOEXEC; + + if (path) { + *path = file->f_path; + path_get(&file->f_path); + } + if (info) + snprintf(info, PROC_FDINFO_MAX, + "pos:\t%lli\n" + "flags:\t0%o\n", + (long long) file->f_pos, + f_flags); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; } - if (info) - snprintf(info, PROC_FDINFO_MAX, - "pos:\t%lli\n" - "flags:\t0%o\n", - (long long) file->f_pos, - f_flags); - rc = 0; - } else - rc = -ENOENT; - spin_unlock(&files->file_lock); - put_files_struct(files); - -out_unlock: - unlock_trace(task); -out_task: - put_task_struct(task); - return rc; + spin_unlock(&files->file_lock); + put_files_struct(files); + } + return -ENOENT; } static int proc_fd_link(struct inode *inode, struct path *path) @@ -2072,7 +2026,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir, spin_unlock(&files->file_lock); put_files_struct(files); - inode->i_op = &proc_fd_link_inode_operations; + inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; d_set_d_op(dentry, &tid_fd_dentry_operations); @@ -2104,12 +2058,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, if (fd == ~0U) goto out; - result = ERR_PTR(-EACCES); - if (lock_trace(task)) - goto out; - result = instantiate(dir, dentry, task, &fd); - unlock_trace(task); out: put_task_struct(task); out_no_task: @@ -2129,28 +2078,23 @@ static int proc_readfd_common(struct file * filp, void * dirent, retval = -ENOENT; if (!p) goto out_no_task; - - retval = -EACCES; - if (lock_trace(p)) - goto out; - retval = 0; fd = filp->f_pos; switch (fd) { case 0: if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; case 1: ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out_unlock; + goto out; filp->f_pos++; default: files = get_files_struct(p); if (!files) - goto out_unlock; + goto out; rcu_read_lock(); for (fd = filp->f_pos-2; fd < files_fdtable(files)->max_fds; @@ -2174,9 +2118,6 @@ static int proc_readfd_common(struct file * filp, void * dirent, rcu_read_unlock(); put_files_struct(files); } - -out_unlock: - unlock_trace(p); out: put_task_struct(p); out_no_task: @@ -2254,7 +2195,6 @@ static struct dentry *proc_fdinfo_instantiate(struct inode *dir, ei->fd = fd; inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; - inode->i_op = &proc_fdinfo_link_inode_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ -- cgit v1.1 From 62e4a76987eab2b7fa952546614bc83e5bfc9d3e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 10 Nov 2011 14:30:37 -0500 Subject: NFS: Revert pnfs ugliness from the generic NFS read code path pNFS-specific code belongs in the pnfs layer. It should not be hijacking generic NFS read or write code paths. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 ++ fs/nfs/pnfs.c | 26 +++++++++++++++++++++----- fs/nfs/read.c | 14 ++------------ 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c1a1bd8..3f4d957 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -299,6 +299,8 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc, struct list_head *head); +extern void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, + struct inode *inode); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_readdata_release(struct nfs_read_data *rdata); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a2478bc..e40a3ca 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1259,6 +1259,25 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) } EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); +static void pnfs_ld_handle_read_error(struct nfs_read_data *data) +{ + struct nfs_pageio_descriptor pgio; + + put_lseg(data->lseg); + data->lseg = NULL; + dprintk("pnfs write error = %d\n", data->pnfs_error); + + nfs_pageio_init_read_mds(&pgio, data->inode); + + while (!list_empty(&data->pages)) { + struct nfs_page *req = nfs_list_entry(data->pages.next); + + nfs_list_remove_request(req); + nfs_pageio_add_request(&pgio, req); + } + nfs_pageio_complete(&pgio); +} + /* * Called by non rpc-based layout drivers */ @@ -1267,11 +1286,8 @@ void pnfs_ld_read_done(struct nfs_read_data *data) if (likely(!data->pnfs_error)) { __nfs4_read_done_cb(data); data->mds_ops->rpc_call_done(&data->task, data); - } else { - put_lseg(data->lseg); - data->lseg = NULL; - dprintk("pnfs write error = %d\n", data->pnfs_error); - } + } else + pnfs_ld_handle_read_error(data); data->mds_ops->rpc_release(data); } EXPORT_SYMBOL_GPL(pnfs_ld_read_done); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8b48ec6..cfa175c 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -109,7 +109,7 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) } } -static void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, +void nfs_pageio_init_read_mds(struct nfs_pageio_descriptor *pgio, struct inode *inode) { nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, @@ -534,23 +534,13 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) static void nfs_readpage_release_full(void *calldata) { struct nfs_read_data *data = calldata; - struct nfs_pageio_descriptor pgio; - if (data->pnfs_error) { - nfs_pageio_init_read_mds(&pgio, data->inode); - pgio.pg_recoalesce = 1; - } while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (!data->pnfs_error) - nfs_readpage_release(req); - else - nfs_pageio_add_request(&pgio, req); + nfs_readpage_release(req); } - if (data->pnfs_error) - nfs_pageio_complete(&pgio); nfs_readdata_release(calldata); } -- cgit v1.1 From 2115133f8b9a8dbdb217d14080814df07ce90479 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 10 Nov 2011 20:39:08 -0500 Subject: Btrfs: tweak the delayed inode reservations again Josef sent along an incremental to the inode reservation code to make sure we try and fall back to directly updating the inode item if things go horribly wrong. This reworks that patch slightly, adding a fallback function that will always try to update the inode item directly without going through the delayed_inode code. Signed-off-by: Chris Mason --- fs/btrfs/delayed-inode.c | 9 +++---- fs/btrfs/inode.c | 64 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 313ee14..6a1a680 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -692,11 +692,6 @@ static int btrfs_delayed_inode_reserve_metadata( migrate: ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); - if (unlikely(ret)) { - /* This shouldn't happen */ - BUG_ON(release); - return ret; - } out: /* @@ -712,9 +707,11 @@ out: * reservation here. I think it may be time for a documentation page on * how block rsvs. work. */ + if (!ret) + node->bytes_reserved = num_bytes; + if (release) btrfs_block_rsv_release(root, src_rsv, num_bytes); - node->bytes_reserved = num_bytes; return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2b92059..7d39433 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, @@ -1741,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) trans = btrfs_join_transaction(root); BUG_ON(IS_ERR(trans)); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } goto out; @@ -1791,7 +1793,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, root, inode); BUG_ON(ret); } ret = 0; @@ -2426,7 +2428,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, +static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode) { struct btrfs_inode_item *inode_item; @@ -2434,21 +2436,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int ret; - /* - * If the inode is a free space inode, we can deadlock during commit - * if we put it into the delayed code. - * - * The data relocation inode should also be directly updated - * without delay - */ - if (!btrfs_is_free_space_inode(root, inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { - ret = btrfs_delayed_update_inode(trans, root, inode); - if (!ret) - btrfs_set_inode_last_trans(trans, inode); - return ret; - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2477,6 +2464,43 @@ failed: } /* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + /* + * If the inode is a free space inode, we can deadlock during commit + * if we put it into the delayed code. + * + * The data relocation inode should also be directly updated + * without delay + */ + if (!btrfs_is_free_space_inode(root, inode) + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_delayed_update_inode(trans, root, inode); + if (!ret) + btrfs_set_inode_last_trans(trans, inode); + return ret; + } + + return btrfs_update_inode_item(trans, root, inode); +} + +static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + int ret; + + ret = btrfs_update_inode(trans, root, inode); + if (ret == -ENOSPC) + return btrfs_update_inode_item(trans, root, inode); + return ret; +} + +/* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory @@ -5632,7 +5656,7 @@ again: if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret) - err = btrfs_update_inode(trans, root, inode); + err = btrfs_update_inode_fallback(trans, root, inode); goto out; } @@ -5670,7 +5694,7 @@ again: add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); ret = btrfs_ordered_update_i_size(inode, 0, ordered); if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) - btrfs_update_inode(trans, root, inode); + btrfs_update_inode_fallback(trans, root, inode); ret = 0; out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, -- cgit v1.1 From 924cd8fbe41851eda2b68bf2ed501b2777fd77b4 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:04 -0500 Subject: Btrfs: fix nocow when deleting the item btrfs_previous_item() just search the b+ tree, do not COW the nodes or leaves, if we modify the result of it, the meta-data will be broken. fix it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f8e29431..c37433d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -999,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, key.objectid = device->devid; key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; - +again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, @@ -1012,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_dev_extent); BUG_ON(found_key.offset > start || found_key.offset + btrfs_dev_extent_length(leaf, extent) < start); + key = found_key; + btrfs_release_path(path); + goto again; } else if (ret == 0) { leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], -- cgit v1.1 From ba38eb4de354d228f2792f93cde2c748a3a3f3b2 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:04 -0500 Subject: Btrfs: fix no reserved space for writing out inode cache I-node cache forgets to reserve the space when writing out it. And when we do some stress test, such as synctest, it will trigger WARN_ON() in use_block_rsv(). WARNING: at fs/btrfs/extent-tree.c:5718 btrfs_alloc_free_block+0xbf/0x281 [btrfs]() ... Call Trace: [] warn_slowpath_common+0x80/0x98 [] warn_slowpath_null+0x15/0x17 [] btrfs_alloc_free_block+0xbf/0x281 [btrfs] [] ? __set_page_dirty_nobuffers+0xfe/0x108 [] __btrfs_cow_block+0x118/0x3b5 [btrfs] [] btrfs_cow_block+0x103/0x14e [btrfs] [] btrfs_search_slot+0x249/0x6a4 [btrfs] [] btrfs_lookup_inode+0x2a/0x8a [btrfs] [] btrfs_update_inode+0xaa/0x141 [btrfs] [] btrfs_save_ino_cache+0xea/0x202 [btrfs] [] ? btrfs_update_reloc_root+0x17e/0x197 [btrfs] [] commit_fs_roots+0xaa/0x158 [btrfs] [] btrfs_commit_transaction+0x405/0x731 [btrfs] [] ? wake_up_bit+0x25/0x25 [] ? btrfs_log_dentry_safe+0x43/0x51 [btrfs] [] btrfs_sync_file+0x16a/0x198 [btrfs] [] ? mntput+0x21/0x23 [] vfs_fsync_range+0x18/0x21 [] vfs_fsync+0x17/0x19 [] do_fsync+0x29/0x3e [] sys_fsync+0xb/0xf [] system_call_fastpath+0x16/0x1b Sometimes it causes BUG_ON() in the reservation code of the delayed inode is triggered. So we must reserve enough space for inode cache. Note: If we can not reserve the enough space for inode cache, we will give up writing out it. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 53dcbdf..f8962a9 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; + struct btrfs_block_rsv *rsv; + u64 num_bytes; u64 alloc_hint = 0; int ret; int prealloc; @@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (!path) return -ENOMEM; + rsv = trans->block_rsv; + trans->block_rsv = &root->fs_info->trans_block_rsv; + + num_bytes = trans->bytes_reserved; + /* + * 1 item for inode item insertion if need + * 3 items for inode item update (in the worst case) + * 1 item for free space object + * 3 items for pre-allocation + */ + trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8); + ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv, + trans->bytes_reserved); + if (ret) + goto out; again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ret = PTR_ERR(inode); - goto out; + goto out_release; } if (IS_ERR(inode)) { @@ -434,7 +451,7 @@ again: ret = create_free_ino_inode(root, trans, path); if (ret) - goto out; + goto out_release; goto again; } @@ -477,11 +494,14 @@ again: } btrfs_free_reserved_data_space(inode, prealloc); + ret = btrfs_write_out_ino_cache(root, trans, path); out_put: iput(inode); +out_release: + btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); out: - if (ret == 0) - ret = btrfs_write_out_ino_cache(root, trans, path); + trans->block_rsv = rsv; + trans->bytes_reserved = num_bytes; btrfs_free_path(path); return ret; -- cgit v1.1 From 3254c87618354e58fa2a7b375c6664f567480c33 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix unreleased path in btrfs_orphan_cleanup() When we did stress test for the space relocation, the deadlock happened. By debugging, We found it was caused by the carelessness that we forgot to unlock the read lock of the extent buffers in btrfs_orphan_cleanup() before we end the transaction handle, so the transaction commit task waited the task, which called btrfs_orphan_cleanup(), to unlock the extent buffer, but that task waited the commit task to end the transaction commit, and the deadlock happened. Fix it. Signed-ff-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7d39433..e16215f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2201,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret) goto out; } + /* release the path since we're done with it */ + btrfs_release_path(path); + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; if (root->orphan_block_rsv) -- cgit v1.1 From 61b520a9d0083b9b361638e456af45fd75150c87 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: Abstract similar code for btrfs_block_rsv_add{, _noflush} btrfs_block_rsv_add{, _noflush}() have similar code, so abstract that code. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b044e5..0f47b3e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3796,16 +3796,16 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } -int btrfs_block_rsv_add(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes) +static inline int __block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int flush) { int ret; if (num_bytes == 0) return 0; - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1); + ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; @@ -3814,22 +3814,18 @@ int btrfs_block_rsv_add(struct btrfs_root *root, return ret; } +int btrfs_block_rsv_add(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + return __block_rsv_add(root, block_rsv, num_bytes, 1); +} + int btrfs_block_rsv_add_noflush(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes) { - int ret; - - if (num_bytes == 0) - return 0; - - ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0); - if (!ret) { - block_rsv_add_bytes(block_rsv, num_bytes, 1); - return 0; - } - - return ret; + return __block_rsv_add(root, block_rsv, num_bytes, 0); } int btrfs_block_rsv_check(struct btrfs_root *root, -- cgit v1.1 From 76b9e23d25d5c99f994bee3172de39492e452e93 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix orphan backref nodes If the root node of a fs/file tree is in the block group that is being relocated, but the others are not in the other block groups. when we create a snapshot for this tree between the relocation tree creation ends and ->create_reloc_tree is set to 0, Btrfs will create some backref nodes that are the lowest nodes of the backrefs cache. But we forget to add them into ->leaves list of the backref cache and deal with them, and at last, they will triggered BUG_ON(). kernel BUG at fs/btrfs/relocation.c:239! This patch fixes it by adding them into ->leaves list of backref cache. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 24d654c..dff29d5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans, list_add_tail(&new_edge->list[UPPER], &new_node->lower); } + } else { + list_add_tail(&new_node->lower, &cache->leaves); } rb_node = tree_insert(&cache->rb_root, new_node->bytenr, -- cgit v1.1 From 2f120c05e67ae34c93786b1050c6828904314429 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: only map pages if we know we need them when reading the space cache People have been running into a warning when loading space cache because the page is already mapped when trying to read in a bitmap. The way we read in entries and pages is kind of convoluted, so fix it so that io_ctl_read_entry maps the entries if it needs to, and if it hits the end of the page it simply unmaps the page. That way we can unconditionally unmap the io_ctl before reading in the bitmap and we should stop hitting these warnings. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 7a15fcf..181760f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -537,6 +537,13 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; + int ret; + + if (!io_ctl->cur) { + ret = io_ctl_check_crc(io_ctl, io_ctl->index); + if (ret) + return ret; + } e = io_ctl->cur; entry->offset = le64_to_cpu(e->offset); @@ -550,10 +557,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, io_ctl_unmap_page(io_ctl); - if (io_ctl->index >= io_ctl->num_pages) - return 0; - - return io_ctl_check_crc(io_ctl, io_ctl->index); + return 0; } static int io_ctl_read_bitmap(struct io_ctl *io_ctl, @@ -561,9 +565,6 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl, { int ret; - if (io_ctl->cur && io_ctl->cur != io_ctl->orig) - io_ctl_unmap_page(io_ctl); - ret = io_ctl_check_crc(io_ctl, io_ctl->index); if (ret) return ret; @@ -699,6 +700,8 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, num_entries--; } + io_ctl_unmap_page(&io_ctl); + /* * We add the bitmaps at the end of the entries in order that * the bitmap entries are added to the cache. -- cgit v1.1 From 62f30c5462374b991e7e3f42d49ce2265c1b82f1 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 10 Nov 2011 20:45:05 -0500 Subject: Btrfs: fix deadlock caused by the race between relocation We can not do flushable reservation for the relocation when we create snapshot, because it may make the transaction commit task and the flush task wait for each other and the deadlock happens. Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 960835e..6a0574e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -882,8 +882,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { - ret = btrfs_block_rsv_add(root, &pending->block_rsv, - to_reserve); + ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv, + to_reserve); if (ret) { pending->error = ret; goto fail; -- cgit v1.1 From 69f4cb526bd02ae5af35846f9a710c099eec3347 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Fri, 11 Nov 2011 08:17:10 -0500 Subject: Btrfs: handle bio_add_page failure gracefully in scrub Currently scrub fails with ENOMEM when bio_add_page fails. Unfortunately dm based targets accept only one page per bio, thus making scrub always fails. This patch just submits the current bio when an error is encountered and starts a new one. Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/scrub.c | 64 +++++++++++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed11d38..f4190f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -944,50 +944,18 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer) static int scrub_submit(struct scrub_dev *sdev) { struct scrub_bio *sbio; - struct bio *bio; - int i; if (sdev->curr == -1) return 0; sbio = sdev->bios[sdev->curr]; - - bio = bio_alloc(GFP_NOFS, sbio->count); - if (!bio) - goto nomem; - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sdev->dev->bdev; - bio->bi_sector = sbio->physical >> 9; - - for (i = 0; i < sbio->count; ++i) { - struct page *page; - int ret; - - page = alloc_page(GFP_NOFS); - if (!page) - goto nomem; - - ret = bio_add_page(bio, page, PAGE_SIZE, 0); - if (!ret) { - __free_page(page); - goto nomem; - } - } - sbio->err = 0; sdev->curr = -1; atomic_inc(&sdev->in_flight); - submit_bio(READ, bio); + submit_bio(READ, sbio->bio); return 0; - -nomem: - scrub_free_bio(bio); - - return -ENOMEM; } static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, @@ -995,6 +963,8 @@ static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, u8 *csum, int force) { struct scrub_bio *sbio; + struct page *page; + int ret; again: /* @@ -1015,12 +985,22 @@ again: } sbio = sdev->bios[sdev->curr]; if (sbio->count == 0) { + struct bio *bio; + sbio->physical = physical; sbio->logical = logical; + bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO); + if (!bio) + return -ENOMEM; + + bio->bi_private = sbio; + bio->bi_end_io = scrub_bio_end_io; + bio->bi_bdev = sdev->dev->bdev; + bio->bi_sector = sbio->physical >> 9; + sbio->err = 0; + sbio->bio = bio; } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || sbio->logical + sbio->count * PAGE_SIZE != logical) { - int ret; - ret = scrub_submit(sdev); if (ret) return ret; @@ -1030,6 +1010,20 @@ again: sbio->spag[sbio->count].generation = gen; sbio->spag[sbio->count].have_csum = 0; sbio->spag[sbio->count].mirror_num = mirror_num; + + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + + ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0); + if (!ret) { + __free_page(page); + ret = scrub_submit(sdev); + if (ret) + return ret; + goto again; + } + if (csum) { sbio->spag[sbio->count].have_csum = 1; memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); -- cgit v1.1 From 8965593e41dd2d0e2a2f1e6f245336005ea94a2c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 11 Nov 2011 10:14:57 -0500 Subject: btrfs: rename the option to nospace_cache Rename no_space_cache option to nospace_cache to be more consistent with the rest, where the simple prefix 'no' is used to negate an option. The option has been introduced during the -rc1 cycle and there are has not been widely used, so it's safe. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 629281c..8bd9d6d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -197,7 +197,7 @@ static match_table_t tokens = { {Opt_subvolrootid, "subvolrootid=%d"}, {Opt_defrag, "autodefrag"}, {Opt_inode_cache, "inode_cache"}, - {Opt_no_space_cache, "no_space_cache"}, + {Opt_no_space_cache, "nospace_cache"}, {Opt_recovery, "recovery"}, {Opt_err, NULL}, }; @@ -711,7 +711,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); else - seq_puts(seq, ",no_space_cache"); + seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, CLEAR_CACHE)) seq_puts(seq, ",clear_cache"); if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) -- cgit v1.1 From 774ac21da76f5c3018428725074e27a3fd40b128 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Nov 2011 09:48:08 -0800 Subject: ceph: initialize root dentry Set up d_fsdata on the root dentry. This fixes a NULL pointer dereference in ceph_d_prune on umount. It also means we can eventually strip out all of the conditional checks on d_fsdata because it is now set unconditionally (prior to setting up the d_ops). Fix the ceph_d_prune debug print while we're here. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 2 +- fs/ceph/super.c | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 2abd0df..bca3948 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1143,7 +1143,7 @@ static void ceph_d_prune(struct dentry *dentry) { struct ceph_dentry_info *di; - dout("d_release %p\n", dentry); + dout("ceph_d_prune %p\n", dentry); /* do we have a valid parent? */ if (!dentry->d_parent || IS_ROOT(dentry)) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a90846f..8dc73a5 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -638,10 +638,12 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (err == 0) { dout("open_root_inode success\n"); if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && - fsc->sb->s_root == NULL) + fsc->sb->s_root == NULL) { root = d_alloc_root(req->r_target_inode); - else + ceph_init_dentry(root); + } else { root = d_obtain_alias(req->r_target_inode); + } req->r_target_inode = NULL; dout("open_root_inode success, root dentry is %p\n", root); } else { -- cgit v1.1 From f1ebcc74d5b2159f44c96b479b6eb8afc7829095 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Nov 2011 20:48:06 -0500 Subject: Btrfs: fix tree corruption after multi-thread snapshots and inode_cache flush The btrfs snapshotting code requires that once a root has been snapshotted, we don't change it during a commit. But there are two cases to lead to tree corruptions: 1) multi-thread snapshots can commit serveral snapshots in a transaction, and this may change the src root when processing the following pending snapshots, which lead to the former snapshots corruptions; 2) the free inode cache was changing the roots when it root the cache, which lead to corruptions. This fixes things by making sure we force COW the block after we create a snapshot during commiting a transaction, then any changes to the roots will result in COW, and we get all the fs roots and snapshot roots to be consistent. Signed-off-by: Liu Bo Signed-off-by: Miao Xie Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 17 ++++++++++++++++- fs/btrfs/ctree.h | 2 ++ fs/btrfs/transaction.c | 8 ++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0fe615e..dede441 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { + /* ensure we can see the force_cow */ + smp_rmb(); + + /* + * We do not need to cow a block if + * 1) this block is not created or changed in this transaction; + * 2) this block does not belong to TREE_RELOC tree; + * 3) the root is not forced COW. + * + * What is forced COW: + * when we create snapshot during commiting the transaction, + * after we've finished coping src root, we must COW the shared + * block to ensure the metadata consistency. + */ if (btrfs_header_generation(buf) == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) && + !root->force_cow) return 0; return 1; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9ba59f..b1cb3c0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1271,6 +1271,8 @@ struct btrfs_root { * for stat. It may be used for more later */ dev_t anon_dev; + + int force_cow; }; struct btrfs_ioctl_defrag_range_args { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6a0574e..81376d9 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -785,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_save_ino_cache(root, trans); + /* see comments in should_cow_block() */ + root->force_cow = 0; + smp_wmb(); + if (root->commit_root != root->node) { mutex_lock(&root->fs_commit_mutex); switch_commit_root(root); @@ -947,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_tree_unlock(old); free_extent_buffer(old); + /* see comments in should_cow_block() */ + root->force_cow = 1; + smp_wmb(); + btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; -- cgit v1.1 From bc5b8a9003132ae44559edd63a1623b7b99dfb68 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 14 Nov 2011 17:52:08 +0300 Subject: hfs: add sanity check for file name length On a corrupted file system the ->len field could be wrong leading to a buffer overflow. Reported-and-acked-by: Clement LECIGNE Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/hfs/trans.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c index e673a88..b1ce4c7 100644 --- a/fs/hfs/trans.c +++ b/fs/hfs/trans.c @@ -40,6 +40,8 @@ int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in) src = in->name; srclen = in->len; + if (srclen > HFS_NAMELEN) + srclen = HFS_NAMELEN; dst = out; dstlen = HFS_MAX_NAMELEN; if (nls_io) { -- cgit v1.1 From db3e74b582915d66e10b0c73a62763418f54c340 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Thu, 10 Nov 2011 01:33:10 +0000 Subject: xfs: use doalloc flag in xfs_qm_dqattach_one() The doalloc arg in xfs_qm_dqattach_one() is a flag that indicates whether a new area to handle quota information will be allocated if needed. Originally, it was passed to xfs_qm_dqget(), but has been removed by the following commit (probably by mistake): commit 8e9b6e7fa4544ea8a0e030c8987b918509c8ff47 Author: Christoph Hellwig Date: Sun Feb 8 21:51:42 2009 +0100 xfs: remove the unused XFS_QMOPT_DQLOCK flag As the result, xfs_qm_dqget() called from xfs_qm_dqattach_one() never allocates the new area even if it is needed. This patch gives the doalloc arg to xfs_qm_dqget() in xfs_qm_dqattach_one() to fix this problem. Signed-off-by: Mitsuo Hayasaka Cc: Alex Elder Cc: Christoph Hellwig Reviewed-by: Christoph Hellwig Signed-off-by: Ben Myers --- fs/xfs/xfs_qm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5cff443..0bbb1a4 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -674,7 +674,8 @@ xfs_qm_dqattach_one( * disk and we didn't ask it to allocate; * ESRCH if quotas got turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp); + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); if (error) return error; -- cgit v1.1 From 121f099412bd6576dfb3d94222e89d9341362177 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Nov 2011 09:21:50 +0100 Subject: bio: change some signed vars to unsigned This is just a cleanup patch to silence a static checker warning. The problem is that we cap "nr_iovecs" so it can't be larger than "UIO_MAXIOV" but we don't check for negative values. It turns out this is prevented at other layers, but logically it doesn't make sense to have negative nr_iovecs so making it unsigned is nicer. Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 41c93c7..b1fe82c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -337,7 +337,7 @@ static void bio_fs_destructor(struct bio *bio) * RETURNS: * Pointer to new bio on success, NULL on failure. */ -struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set); @@ -365,7 +365,7 @@ static void bio_kmalloc_destructor(struct bio *bio) * %__GFP_WAIT, the allocation is guaranteed to succeed. * **/ -struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +struct bio *bio_kmalloc(gfp_t gfp_mask, unsigned int nr_iovecs) { struct bio *bio; @@ -696,7 +696,8 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, +static struct bio_map_data *bio_alloc_map_data(int nr_segs, + unsigned int iov_count, gfp_t gfp_mask) { struct bio_map_data *bmd; -- cgit v1.1 From 8d514bbf37eecf0a3e309284728637816a36764b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Nov 2011 16:06:09 -0500 Subject: btrfs: fix double mntput() in mount_subvol() Signed-off-by: Al Viro --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8bd9d6d..969a774 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -861,7 +861,6 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (!is_subvolume_inode(path.dentry->d_inode)) { path_put(&path); - mntput(mnt); error = -EINVAL; printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", subvol_name); -- cgit v1.1 From c13344958780b4046305ee6235d686c846535529 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Nov 2011 16:12:14 -0500 Subject: switch create_mnt_ns() to saner calling conventions, fix double mntput() in nfs Life is much saner if create_mnt_ns(mnt) drops mnt in case of error... Switch it to such calling conventions, switch callers, fix double mntput() in fs/nfs/super.c one. Signed-off-by: Al Viro --- fs/btrfs/super.c | 4 +--- fs/namespace.c | 2 ++ fs/nfs/super.c | 23 ++++++++--------------- 3 files changed, 11 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 969a774..cfbedd7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -843,10 +843,8 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return ERR_CAST(mnt); ns_private = create_mnt_ns(mnt); - if (IS_ERR(ns_private)) { - mntput(mnt); + if (IS_ERR(ns_private)) return ERR_CAST(ns_private); - } /* * This will trigger the automount of the subvol so we can just diff --git a/fs/namespace.c b/fs/namespace.c index e5e1c7d..aea4b76 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2483,6 +2483,8 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) __mnt_make_longterm(mnt); new_ns->root = mnt; list_add(&new_ns->list, &new_ns->root->mnt_list); + } else { + mntput(mnt); } return new_ns; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 480b3b6..46d69f38 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2794,22 +2794,21 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, int ret; ns_private = create_mnt_ns(root_mnt); - ret = PTR_ERR(ns_private); if (IS_ERR(ns_private)) - goto out_mntput; + return ERR_CAST(ns_private); ret = nfs_referral_loop_protect(); - if (ret != 0) - goto out_put_mnt_ns; - - ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + if (ret == 0) { + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, + export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, + &path); + nfs_referral_loop_unprotect(); + } - nfs_referral_loop_unprotect(); put_mnt_ns(ns_private); if (ret != 0) - goto out_err; + return ERR_PTR(ret); s = path.mnt->mnt_sb; atomic_inc(&s->s_active); @@ -2818,12 +2817,6 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, path_put(&path); down_write(&s->s_umount); return dentry; -out_put_mnt_ns: - put_mnt_ns(ns_private); -out_mntput: - mntput(root_mnt); -out_err: - return ERR_PTR(ret); } static struct dentry *nfs4_try_mount(int flags, const char *dev_name, -- cgit v1.1 From ea441d1104cf1efb471fa81bc91e9fd1e6ae29fd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 16 Nov 2011 21:43:59 -0500 Subject: new helper: mount_subtree() takes vfsmount and relative path, does lookup within that vfsmount (possibly triggering automounts) and returns the result as root of subtree suitable for return by ->mount() (i.e. a reference to dentry and an active reference to its superblock grabbed, superblock locked exclusive). btrfs and nfs switched to it instead of open-coding the sucker. Signed-off-by: Al Viro --- fs/btrfs/super.c | 35 ++++++----------------------------- fs/namespace.c | 28 ++++++++++++++++++++++++++++ fs/nfs/super.c | 30 ++++++------------------------ 3 files changed, 40 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index cfbedd7..17ee7fc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -825,13 +825,9 @@ static char *setup_root_args(char *args) static struct dentry *mount_subvol(const char *subvol_name, int flags, const char *device_name, char *data) { - struct super_block *s; struct dentry *root; struct vfsmount *mnt; - struct mnt_namespace *ns_private; char *newargs; - struct path path; - int error; newargs = setup_root_args(data); if (!newargs) @@ -842,36 +838,17 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, if (IS_ERR(mnt)) return ERR_CAST(mnt); - ns_private = create_mnt_ns(mnt); - if (IS_ERR(ns_private)) - return ERR_CAST(ns_private); - - /* - * This will trigger the automount of the subvol so we can just - * drop the mnt we have here and return the dentry that we - * found. - */ - error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name, - LOOKUP_FOLLOW, &path); - put_mnt_ns(ns_private); - if (error) - return ERR_PTR(error); + root = mount_subtree(mnt, subvol_name); - if (!is_subvolume_inode(path.dentry->d_inode)) { - path_put(&path); - error = -EINVAL; + if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + struct super_block *s = root->d_sb; + dput(root); + root = ERR_PTR(-EINVAL); + deactivate_locked_super(s); printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n", subvol_name); - return ERR_PTR(-EINVAL); } - /* Get a ref to the sb and the dentry we found and return it */ - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - root = dget(path.dentry); - path_put(&path); - down_write(&s->s_umount); - return root; } diff --git a/fs/namespace.c b/fs/namespace.c index aea4b76..50ee303 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2490,6 +2490,34 @@ struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) } EXPORT_SYMBOL(create_mnt_ns); +struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) +{ + struct mnt_namespace *ns; + struct path path; + int err; + + ns = create_mnt_ns(mnt); + if (IS_ERR(ns)) + return ERR_CAST(ns); + + err = vfs_path_lookup(mnt->mnt_root, mnt, + name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); + + put_mnt_ns(ns); + + if (err) + return ERR_PTR(err); + + /* trade a vfsmount reference for active sb one */ + atomic_inc(&path.mnt->mnt_sb->s_active); + mntput(path.mnt); + /* lock the sucker */ + down_write(&path.mnt->mnt_sb->s_umount); + /* ... and return the root of (sub)tree on it */ + return path.dentry; +} +EXPORT_SYMBOL(mount_subtree); + SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 46d69f38..1347774 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2787,35 +2787,17 @@ static void nfs_referral_loop_unprotect(void) static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path) { - struct mnt_namespace *ns_private; - struct super_block *s; struct dentry *dentry; - struct path path; - int ret; + int ret = nfs_referral_loop_protect(); - ns_private = create_mnt_ns(root_mnt); - if (IS_ERR(ns_private)) - return ERR_CAST(ns_private); - - ret = nfs_referral_loop_protect(); - if (ret == 0) { - ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, - export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, - &path); - nfs_referral_loop_unprotect(); - } - - put_mnt_ns(ns_private); - - if (ret != 0) + if (ret) { + mntput(root_mnt); return ERR_PTR(ret); + } - s = path.mnt->mnt_sb; - atomic_inc(&s->s_active); - dentry = dget(path.dentry); + dentry = mount_subtree(root_mnt, export_path); + nfs_referral_loop_unprotect(); - path_put(&path); - down_write(&s->s_umount); return dentry; } -- cgit v1.1 From 016e8d44bc06dd3322f26712bdd3f3a6973592d0 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Fri, 19 Aug 2011 14:50:26 -0400 Subject: fs/minix: Verify bitmap block counts before mounting Newer versions of MINIX can create filesystems that allocate an extra bitmap block. Mounting of this succeeds, but doing a statfs call will result in an oops in count_free because of a negative number being used for the bh index. Avoid this by verifying the number of allocated blocks at mount time, erroring out if there are not enough and make statfs ignore the extras if there are too many. This fixes https://bugzilla.kernel.org/show_bug.cgi?id=18792 Signed-off-by: Josh Boyer Signed-off-by: Al Viro --- fs/minix/bitmap.c | 18 ++++++++++++------ fs/minix/inode.c | 25 +++++++++++++++++++++++-- fs/minix/minix.h | 9 +++++++-- 3 files changed, 42 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3f32bcb..7c82c29 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -20,10 +20,11 @@ static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; static DEFINE_SPINLOCK(bitmap_lock); -static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) +static unsigned long count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) { unsigned i, j, sum = 0; struct buffer_head *bh; + unsigned numblocks = minix_blocks_needed(numbits, blocksize); for (i=0; is_zmap, sbi->s_zmap_blocks, - sbi->s_nzones - sbi->s_firstdatazone + 1) + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); + + return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); } @@ -273,7 +276,10 @@ struct inode *minix_new_inode(const struct inode *dir, int mode, int *error) return inode; } -unsigned long minix_count_free_inodes(struct minix_sb_info *sbi) +unsigned long minix_count_free_inodes(struct super_block *sb) { - return count_free(sbi->s_imap, sbi->s_imap_blocks, sbi->s_ninodes + 1); + struct minix_sb_info *sbi = minix_sb(sb); + u32 bits = sbi->s_ninodes + 1; + + return count_free(sbi->s_imap, sb->s_blocksize, bits); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 64cdcd6..1d9e339 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -279,6 +279,27 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) else if (sbi->s_mount_state & MINIX_ERROR_FS) printk("MINIX-fs: mounting file system with errors, " "running fsck is recommended\n"); + + /* Apparently minix can create filesystems that allocate more blocks for + * the bitmaps than needed. We simply ignore that, but verify it didn't + * create one with not enough blocks and bail out if so. + */ + block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); + if (sbi->s_imap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "imap blocks allocated. Refusing to mount\n"); + goto out_iput; + } + + block = minix_blocks_needed( + (sbi->s_nzones - (sbi->s_firstdatazone + 1)), + s->s_blocksize); + if (sbi->s_zmap_blocks < block) { + printk("MINIX-fs: file system does not have enough " + "zmap blocks allocated. Refusing to mount.\n"); + goto out_iput; + } + return 0; out_iput: @@ -339,10 +360,10 @@ static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; - buf->f_bfree = minix_count_free_blocks(sbi); + buf->f_bfree = minix_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_files = sbi->s_ninodes; - buf->f_ffree = minix_count_free_inodes(sbi); + buf->f_ffree = minix_count_free_inodes(sb); buf->f_namelen = sbi->s_namelen; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 341e212..6415fe0 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -48,10 +48,10 @@ extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, stru extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); extern struct inode * minix_new_inode(const struct inode *, int, int *); extern void minix_free_inode(struct inode * inode); -extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); -extern unsigned long minix_count_free_blocks(struct minix_sb_info *sbi); +extern unsigned long minix_count_free_blocks(struct super_block *sb); extern int minix_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); @@ -88,6 +88,11 @@ static inline struct minix_inode_info *minix_i(struct inode *inode) return list_entry(inode, struct minix_inode_info, vfs_inode); } +static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) +{ + return DIV_ROUND_UP(bits, blocksize * 8); +} + #if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \ defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED) -- cgit v1.1 From f1fd306a91f875e65af0e04855b23adda6831ac9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 26 Aug 2011 22:38:50 -0400 Subject: minixfs: kill manual hweight(), simplify Signed-off-by: Al Viro --- fs/minix/bitmap.c | 41 ++++++++++++++--------------------------- fs/minix/minix.h | 2 +- 2 files changed, 15 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 7c82c29..ef175cb 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -16,39 +16,26 @@ #include #include -static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; - static DEFINE_SPINLOCK(bitmap_lock); -static unsigned long count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) +/* + * bitmap consists of blocks filled with 16bit words + * bit set == busy, bit clear == free + * endianness is a mess, but for counting zero bits it really doesn't matter... + */ +static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) { - unsigned i, j, sum = 0; - struct buffer_head *bh; - unsigned numblocks = minix_blocks_needed(numbits, blocksize); - - for (i=0; ib_size; j++) - sum += nibblemap[bh->b_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; - } + __u32 sum = 0; + unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); - if (numblocks==0 || !(bh=map[numblocks-1])) - return(0); - i = ((numbits - (numblocks-1) * bh->b_size * 8) / 16) * 2; - for (j=0; jb_data[j] & 0xf] - + nibblemap[(bh->b_data[j]>>4) & 0xf]; + while (blocks--) { + unsigned words = blocksize / 2; + __u16 *p = (__u16 *)(*map++)->b_data; + while (words--) + sum += 16 - hweight16(*p++); } - i = numbits%16; - if (i!=0) { - i = *(__u16 *)(&bh->b_data[j]) | ~((1<>4) & 0xf]; - sum += nibblemap[(i>>8) & 0xf] + nibblemap[(i>>12) & 0xf]; - } - return(sum); + return sum; } void minix_free_block(struct inode *inode, unsigned long block) diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 6415fe0..26bbd55 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -130,7 +130,7 @@ static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size) if (!size) return 0; - size = (size >> 4) + ((size & 15) > 0); + size >>= 4; while (*p++ == 0xffff) { if (--size == 0) return (p - addr) << 4; -- cgit v1.1 From 387125fc722a8ed432066b85a552917343bdafca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 18 Nov 2011 15:07:51 -0500 Subject: Btrfs: fix barrier flushes When btrfs is writing the super blocks, it send barrier flushes to make sure writeback caching drives get all the metadata on disk in the right order. But, we have two bugs in the way these are sent down. When doing full commits (not via the tree log), we are sending the barrier down before the last super when it should be going down before the first. In multi-device setups, we should be waiting for the barriers to complete on all devices before writing any of the supers. Both of these bugs can cause corruptions on power failures. We fix it with some new code to send down empty barriers to all devices before writing the first super. Alexandre Oliva found the multi-device bug. Arne Jansen did the async barrier loop. Signed-off-by: Chris Mason Reported-by: Alexandre Oliva --- fs/btrfs/disk-io.c | 145 ++++++++++++++++++++++++++++++++++++++++++++++------- fs/btrfs/volumes.h | 6 +++ 2 files changed, 134 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b6a5c0d..48d3013 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2573,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device, int errors = 0; u32 crc; u64 bytenr; - int last_barrier = 0; if (max_mirrors == 0) max_mirrors = BTRFS_SUPER_MIRROR_MAX; - /* make sure only the last submit_bh does a barrier */ - if (do_barriers) { - for (i = 0; i < max_mirrors; i++) { - bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= - device->total_bytes) - break; - last_barrier = i; - } - } - for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) @@ -2634,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device, bh->b_end_io = btrfs_end_buffer_write_sync; } - if (i == last_barrier && do_barriers) - ret = submit_bh(WRITE_FLUSH_FUA, bh); - else - ret = submit_bh(WRITE_SYNC, bh); - + /* + * we fua the first super. The others we allow + * to go down lazy. + */ + ret = submit_bh(WRITE_FUA, bh); if (ret) errors++; } return errors < i ? 0 : -1; } +/* + * endio for the write_dev_flush, this will wake anyone waiting + * for the barrier when it is done + */ +static void btrfs_end_empty_barrier(struct bio *bio, int err) +{ + if (err) { + if (err == -EOPNOTSUPP) + set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + clear_bit(BIO_UPTODATE, &bio->bi_flags); + } + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/* + * trigger flushes for one the devices. If you pass wait == 0, the flushes are + * sent down. With wait == 1, it waits for the previous flush. + * + * any device where the flush fails with eopnotsupp are flagged as not-barrier + * capable + */ +static int write_dev_flush(struct btrfs_device *device, int wait) +{ + struct bio *bio; + int ret = 0; + + if (device->nobarriers) + return 0; + + if (wait) { + bio = device->flush_bio; + if (!bio) + return 0; + + wait_for_completion(&device->flush_wait); + + if (bio_flagged(bio, BIO_EOPNOTSUPP)) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + device->nobarriers = 1; + } + if (!bio_flagged(bio, BIO_UPTODATE)) { + ret = -EIO; + } + + /* drop the reference from the wait == 0 run */ + bio_put(bio); + device->flush_bio = NULL; + + return ret; + } + + /* + * one reference for us, and we leave it for the + * caller + */ + device->flush_bio = NULL;; + bio = bio_alloc(GFP_NOFS, 0); + if (!bio) + return -ENOMEM; + + bio->bi_end_io = btrfs_end_empty_barrier; + bio->bi_bdev = device->bdev; + init_completion(&device->flush_wait); + bio->bi_private = &device->flush_wait; + device->flush_bio = bio; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + + return 0; +} + +/* + * send an empty flush down to each device in parallel, + * then wait for them + */ +static int barrier_all_devices(struct btrfs_fs_info *info) +{ + struct list_head *head; + struct btrfs_device *dev; + int errors = 0; + int ret; + + /* send down all the barriers */ + head = &info->fs_devices->devices; + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 0); + if (ret) + errors++; + } + + /* wait for all the barriers */ + list_for_each_entry_rcu(dev, head, dev_list) { + if (!dev->bdev) { + errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_flush(dev, 1); + if (ret) + errors++; + } + if (errors) + return -EIO; + return 0; +} + int write_all_supers(struct btrfs_root *root, int max_mirrors) { struct list_head *head; @@ -2666,6 +2773,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); head = &root->fs_info->fs_devices->devices; + + if (do_barriers) + barrier_all_devices(root->fs_info); + list_for_each_entry_rcu(dev, head, dev_list) { if (!dev->bdev) { total_errors++; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab5b1c4..78f2d4d 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -100,6 +100,12 @@ struct btrfs_device { struct reada_zone *reada_curr_zone; struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; + + /* for sending down flush barriers */ + struct bio *flush_bio; + struct completion flush_wait; + int nobarriers; + }; struct btrfs_fs_devices { -- cgit v1.1 From 745c4d8e160afaf6c75e887c27ea4b75c8142b26 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 20 Nov 2011 07:31:57 -0500 Subject: btrfs: Fix up 32/64-bit compatibility for new ioctls This patch casts to unsigned long before casting to a pointer and fixes the following warnings: fs/btrfs/extent_io.c:2289:20: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2933:37: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] fs/btrfs/ioctl.c:2937:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/ioctl.c:3020:21: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/scrub.c:275:4: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast] fs/btrfs/backref.c:686:27: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/backref.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/ioctl.c | 9 ++++++--- fs/btrfs/scrub.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 8855aad..22c64ff 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -683,7 +683,7 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref, return PTR_ERR(fspath); if (fspath > fspath_min) { - ipath->fspath->val[i] = (u64)fspath; + ipath->fspath->val[i] = (u64)(unsigned long)fspath; ++ipath->fspath->elem_cnt; ipath->fspath->bytes_left = fspath - fspath_min; } else { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1f87c4d..47fdba7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2286,7 +2286,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } if (!uptodate) { u64 failed_mirror; - failed_mirror = (u64)bio->bi_bdev; + failed_mirror = (unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4a34c47..bda53fc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2930,11 +2930,13 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) goto out; for (i = 0; i < ipath->fspath->elem_cnt; ++i) { - rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val; + rel_ptr = ipath->fspath->val[i] - + (u64)(unsigned long)ipath->fspath->val; ipath->fspath->val[i] = rel_ptr; } - ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size); + ret = copy_to_user((void *)(unsigned long)ipa->fspath, + (void *)(unsigned long)ipath->fspath, size); if (ret) { ret = -EFAULT; goto out; @@ -3017,7 +3019,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, if (ret < 0) goto out; - ret = copy_to_user((void *)loi->inodes, (void *)inodes, size); + ret = copy_to_user((void *)(unsigned long)loi->inodes, + (void *)(unsigned long)inodes, size); if (ret) ret = -EFAULT; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4190f2..fab420d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) swarn->logical, swarn->dev->name, (unsigned long long)swarn->sector, root, inum, offset, min(isize - offset, (u64)PAGE_SIZE), nlink, - (char *)ipath->fspath->val[i]); + (char *)(unsigned long)ipath->fspath->val[i]); free_ipath(ipath); return 0; -- cgit v1.1 From 32240a913d9f3a5aad42175d7696590ea1bfdb08 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: mirror_num should be int, not u64 My previous patch introduced some u64 for failed_mirror variables, this one makes it consistent again. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 48d3013..94abc25 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -620,7 +620,7 @@ out: static int btree_io_failed_hook(struct bio *failed_bio, struct page *page, u64 start, u64 end, - u64 mirror_num, struct extent_state *state) + int mirror_num, struct extent_state *state) { struct extent_io_tree *tree; unsigned long len; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47fdba7..a877b8b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2285,8 +2285,8 @@ static void end_bio_extent_readpage(struct bio *bio, int err) clean_io_failure(start, page); } if (!uptodate) { - u64 failed_mirror; - failed_mirror = (unsigned long)bio->bi_bdev; + int failed_mirror; + failed_mirror = (int)(unsigned long)bio->bi_bdev; if (tree->ops && tree->ops->readpage_io_failed_hook) ret = tree->ops->readpage_io_failed_hook( bio, page, start, end, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index feb9be0..7604c30 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,7 +70,7 @@ struct extent_io_ops { unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, - u64 start, u64 end, u64 failed_mirror, + u64 start, u64 end, int failed_mirror, struct extent_state *state); int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, -- cgit v1.1 From 0f0fbf1d0e188d129756e9508090af4bdbfde00b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: fix to search one more bitmap for cluster setup Suppose there are two bitmaps [0, 256], [256, 512] and one extent [100, 120] in the free space cache, and we want to setup a cluster with offset=100, bytes=50. In this case, there will be only one bitmap [256, 512] in the temporary bitmaps list, and then setup_cluster_bitmap() won't search bitmap [0, 256]. The cause is, the list is constructed in setup_cluster_no_bitmap(), and only bitmaps with bitmap_entry->offset >= offset will be added into the list, and the very bitmap that convers offset has bitmap_entry->offset <= offset. Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 181760f..8f792f4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2453,11 +2453,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *entry; struct rb_node *node; int ret = -ENOSPC; + u64 bitmap_offset = offset_to_bitmap(ctl, offset); if (ctl->total_bitmaps == 0) return -ENOSPC; /* + * The bitmap that covers offset won't be in the list unless offset + * is just its start offset. + */ + entry = list_first_entry(bitmaps, struct btrfs_free_space, list); + if (entry->offset != bitmap_offset) { + entry = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (entry && list_empty(&entry->list)) + list_add(&entry->list, bitmaps); + } + + /* * First check our cached list of bitmaps and see if there is an entry * here that will work. */ -- cgit v1.1 From 52621cb6ed0e0e14358bb317bda7cd5fbd5c2a27 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: avoid unnecessary bitmap search for cluster setup setup_cluster_no_bitmap() searches all the extents and bitmaps starting from offset. Therefore if it returns -ENOSPC, all the bitmaps starting from offset are in the bitmaps list, so it's sufficient to search from this list in setup_cluser_bitmap(). Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 42 ++++-------------------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8f792f4..8c32434 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -2451,7 +2451,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; - struct rb_node *node; int ret = -ENOSPC; u64 bitmap_offset = offset_to_bitmap(ctl, offset); @@ -2469,10 +2468,6 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, list_add(&entry->list, bitmaps); } - /* - * First check our cached list of bitmaps and see if there is an entry - * here that will work. - */ list_for_each_entry(entry, bitmaps, list) { if (entry->bytes < min_bytes) continue; @@ -2483,38 +2478,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, } /* - * If we do have entries on our list and we are here then we didn't find - * anything, so go ahead and get the next entry after the last entry in - * this list and start the search from there. + * The bitmaps list has all the bitmaps that record free space + * starting after offset, so no more search is required. */ - if (!list_empty(bitmaps)) { - entry = list_entry(bitmaps->prev, struct btrfs_free_space, - list); - node = rb_next(&entry->offset_index); - if (!node) - return -ENOSPC; - entry = rb_entry(node, struct btrfs_free_space, offset_index); - goto search; - } - - entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1); - if (!entry) - return -ENOSPC; - -search: - node = &entry->offset_index; - do { - entry = rb_entry(node, struct btrfs_free_space, offset_index); - node = rb_next(&entry->offset_index); - if (!entry->bitmap) - continue; - if (entry->bytes < min_bytes) - continue; - ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, - bytes, min_bytes); - } while (ret && node); - - return ret; + return -ENOSPC; } /* @@ -2532,8 +2499,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; - struct list_head bitmaps; struct btrfs_free_space *entry, *tmp; + LIST_HEAD(bitmaps); u64 min_bytes; int ret; @@ -2572,7 +2539,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } - INIT_LIST_HEAD(&bitmaps); ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, bytes, min_bytes); if (ret) -- cgit v1.1 From fadc0d8be4dfca80f6c568bc5874931893c6709b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: btrfs: fix stat blocks accounting Round inode bytes and delalloc bytes up to real blocksize before converting to sector size. Otherwise eg. files smaller than 512 are reported with zero blocks due to incorrect rounding. Signed-off-by: David Sterba Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e16215f..8ad26b1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6794,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; + u32 blocksize = inode->i_sb->s_blocksize; + generic_fillattr(inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; stat->blksize = PAGE_CACHE_SIZE; - stat->blocks = (inode_get_bytes(inode) + - BTRFS_I(inode)->delalloc_bytes) >> 9; + stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + + ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; return 0; } -- cgit v1.1 From 5bb1468238e20b15921909e9f9601e945f03bac7 Mon Sep 17 00:00:00 2001 From: Arnd Hannemann Date: Sun, 20 Nov 2011 07:33:38 -0500 Subject: Btrfs: prefix resize related printks with btrfs: For the user it is confusing to find something like: [10197.627710] new size for /dev/mapper/vg0-usr_share is 3221225472 in kernel log, because it doesn't point directly to btrfs. This patch prefixes those messages with "btrfs:" like other btrfs related printks. Signed-off-by: Arnd Hannemann Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bda53fc..a90e749 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1216,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", + printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", + printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_unlock; @@ -1267,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, do_div(new_size, root->sectorsize); new_size *= root->sectorsize; - printk(KERN_INFO "new size for %s is %llu\n", + printk(KERN_INFO "btrfs: new size for %s is %llu\n", device->name, (unsigned long long)new_size); if (new_size > old_size) { -- cgit v1.1 From 291c7d2f577428f896daa5002e784959328a80aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 14 Nov 2011 13:52:14 -0500 Subject: Btrfs: wait on caching if we're loading the free space cache We've been hitting panics when running xfstest 13 in a loop for long periods of time. And actually this problem has always existed so we've been hitting these things randomly for a while. Basically what happens is we get a thread coming into the allocator and reading the space cache off of disk and adding the entries to the free space cache as we go. Then we get another thread that comes in and tries to allocate from that block group. Since block_group->cached != BTRFS_CACHE_NO it goes ahead and tries to do the allocation. We do this because if we're doing the old slow way of caching we don't want to hold people up and wait for everything to finish. The problem with this is we could end up discarding the space cache at some arbitrary point in the future, which means we could very well end up allocating space that is either bad, or when the real caching happens it could end up thinking the space isn't in use when it really is and cause all sorts of other problems. The solution is to add a new flag to indicate we are loading the free space cache from disk, and always try to cache the block group if cache->cached != BTRFS_CACHE_FINISHED. That way if we are loading the space cache anybody else who tries to allocate from the block group will have to wait until it's finished to make sure it completes successfully. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/ctree.h | 3 +- fs/btrfs/extent-tree.c | 119 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 81 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1cb3c0..04a5dfc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -848,7 +848,8 @@ struct btrfs_free_cluster { enum btrfs_caching_type { BTRFS_CACHE_NO = 0, BTRFS_CACHE_STARTED = 1, - BTRFS_CACHE_FINISHED = 2, + BTRFS_CACHE_FAST = 2, + BTRFS_CACHE_FINISHED = 3, }; enum btrfs_disk_cache_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0f47b3e..5d86877 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -467,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_root *root, int load_cache_only) { + DEFINE_WAIT(wait); struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; int ret = 0; - smp_mb(); - if (cache->cached != BTRFS_CACHE_NO) + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); + BUG_ON(!caching_ctl); + + INIT_LIST_HEAD(&caching_ctl->list); + mutex_init(&caching_ctl->mutex); + init_waitqueue_head(&caching_ctl->wait); + caching_ctl->block_group = cache; + caching_ctl->progress = cache->key.objectid; + atomic_set(&caching_ctl->count, 1); + caching_ctl->work.func = caching_thread; + + spin_lock(&cache->lock); + /* + * This should be a rare occasion, but this could happen I think in the + * case where one thread starts to load the space cache info, and then + * some other thread starts a transaction commit which tries to do an + * allocation while the other thread is still loading the space cache + * info. The previous loop should have kept us from choosing this block + * group, but if we've moved to the state where we will wait on caching + * block groups we need to first check if we're doing a fast load here, + * so we can wait for it to finish, otherwise we could end up allocating + * from a block group who's cache gets evicted for one reason or + * another. + */ + while (cache->cached == BTRFS_CACHE_FAST) { + struct btrfs_caching_control *ctl; + + ctl = cache->caching_ctl; + atomic_inc(&ctl->count); + prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&cache->lock); + + schedule(); + + finish_wait(&ctl->wait, &wait); + put_caching_control(ctl); + spin_lock(&cache->lock); + } + + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + kfree(caching_ctl); return 0; + } + WARN_ON(cache->caching_ctl); + cache->caching_ctl = caching_ctl; + cache->cached = BTRFS_CACHE_FAST; + spin_unlock(&cache->lock); /* * We can't do the read from on-disk cache during a commit since we need @@ -484,56 +530,51 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, if (trans && (!trans->transaction->in_commit) && (root && root != root->fs_info->tree_root) && btrfs_test_opt(root, SPACE_CACHE)) { - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - return 0; - } - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); - ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); if (ret == 1) { + cache->caching_ctl = NULL; cache->cached = BTRFS_CACHE_FINISHED; cache->last_byte_to_unpin = (u64)-1; } else { - cache->cached = BTRFS_CACHE_NO; + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } } spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); if (ret == 1) { + put_caching_control(caching_ctl); free_excluded_extents(fs_info->extent_root, cache); return 0; } + } else { + /* + * We are not going to do the fast caching, set cached to the + * appropriate value and wakeup any waiters. + */ + spin_lock(&cache->lock); + if (load_cache_only) { + cache->caching_ctl = NULL; + cache->cached = BTRFS_CACHE_NO; + } else { + cache->cached = BTRFS_CACHE_STARTED; + } + spin_unlock(&cache->lock); + wake_up(&caching_ctl->wait); } - if (load_cache_only) - return 0; - - caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); - BUG_ON(!caching_ctl); - - INIT_LIST_HEAD(&caching_ctl->list); - mutex_init(&caching_ctl->mutex); - init_waitqueue_head(&caching_ctl->wait); - caching_ctl->block_group = cache; - caching_ctl->progress = cache->key.objectid; - /* one for caching kthread, one for caching block group list */ - atomic_set(&caching_ctl->count, 2); - caching_ctl->work.func = caching_thread; - - spin_lock(&cache->lock); - if (cache->cached != BTRFS_CACHE_NO) { - spin_unlock(&cache->lock); - kfree(caching_ctl); + if (load_cache_only) { + put_caching_control(caching_ctl); return 0; } - cache->caching_ctl = caching_ctl; - cache->cached = BTRFS_CACHE_STARTED; - spin_unlock(&cache->lock); down_write(&fs_info->extent_commit_sem); + atomic_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->extent_commit_sem); @@ -5177,13 +5218,15 @@ search: } have_block_group: - if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { u64 free_percent; + found_uncached_bg = true; ret = cache_block_group(block_group, trans, orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) - goto have_block_group; + goto alloc; free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; @@ -5205,7 +5248,6 @@ have_block_group: orig_root, 0); BUG_ON(ret); } - found_uncached_bg = true; /* * If loop is set for cached only, try the next block @@ -5215,10 +5257,7 @@ have_block_group: goto loop; } - cached = block_group_cache_done(block_group); - if (unlikely(!cached)) - found_uncached_bg = true; - +alloc: if (unlikely(block_group->ro)) goto loop; -- cgit v1.1 From f7d61dcd6873c49bcc42be2caa2af1c2511aa915 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2011 09:31:24 -0500 Subject: Btrfs: clear pages dirty for io and set them extent mapped When doing the io_ctl helpers to clean up the free space cache stuff I stopped using our normal prepare_pages stuff, which means I of course forgot to do things like set the pages extent mapped, which will cause us all sorts of wonderful propblems. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8c32434..aedacdb 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -351,6 +351,11 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, } } + for (i = 0; i < io_ctl->num_pages; i++) { + clear_page_dirty_for_io(io_ctl->pages[i]); + set_page_extent_mapped(io_ctl->pages[i]); + } + return 0; } -- cgit v1.1 From 4d479cf010d56ec9c54f3099992d039918f1296b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 17 Nov 2011 11:34:31 -0500 Subject: Btrfs: sectorsize align offsets in fiemap We've been hitting BUG()'s in btrfs_cont_expand and btrfs_fallocate and anywhere else that calls btrfs_get_extent while running xfstests 13 in a loop. This is because fiemap is calling btrfs_get_extent with non-sectorsize aligned offsets, which will end up adding mappings that are not sectorsize aligned, which will cause problems in some cases for subsequent calls to btrfs_get_extent for similar areas that are sectorsize aligned. With this patch I ran xfstests 13 in a loop for a couple of hours and didn't hit the problem that I could previously hit in at most 20 minutes. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_io.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a877b8b..9472d3d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3366,6 +3366,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); + len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); + /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size @@ -3413,7 +3416,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent_skip_holes(inode, off, last_for_get_extent, + em = get_extent_skip_holes(inode, start, last_for_get_extent, get_extent); if (!em) goto out; -- cgit v1.1 From dd179946db2493646955efc112d73c85b3cafcb1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Aug 2011 15:31:30 +0100 Subject: VFS: Log the fact that we've given ELOOP rather than creating a loop To prevent an NFS server from being used to create a directory loop in an NFS superblock on the client, the following patch was committed: commit 1836750115f20b774e55c032a3893e8c5bdf41ed Author: Al Viro Date: Tue Jul 12 21:42:24 2011 -0400 Subject: fix loop checks in d_materialise_unique() This causes ELOOP to be reported to anyone trying to access the dentry that would otherwise cause the kernel to complete the loop. However, no indication is given to the caller as to why an operation that ought to work doesn't. The fault is with the kernel, which doesn't want to try and solve the problem as it gets horrendously messy if there's another mountpoint somewhere in the trees being spliced that can't be moved[*]. [*] The real problem is that we don't handle the excision of a subtree that gets moved _out_ of what we can see. This can happen on the server where a directory is merely moved between two other dirs on the same filesystem, but where destination dir is not accessible by the client. So, given the choice to return ELOOP rather than trying to reconfigure the dentry tree, we should give the caller some indication of why they aren't being allowed to make what should be a legitimate request and log a message. Signed-off-by: David Howells Acked-by: Sachin Prabhu Signed-off-by: Al Viro --- fs/dcache.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index a901c69..10ba92d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "internal.h" /* @@ -2383,8 +2384,16 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) actual = __d_unalias(inode, dentry, alias); } write_sequnlock(&rename_lock); - if (IS_ERR(actual)) + if (IS_ERR(actual)) { + if (PTR_ERR(actual) == -ELOOP) + pr_warn_ratelimited( + "VFS: Lookup of '%s' in %s %s" + " would have caused loop\n", + dentry->d_name.name, + inode->i_sb->s_type->name, + inode->i_sb->s_id); dput(alias); + } goto out_nolock; } } -- cgit v1.1 From 6e58ad69efe9f4c91eb15f6bc365293414c397ce Mon Sep 17 00:00:00 2001 From: Yongqiang Yang Date: Mon, 21 Nov 2011 12:09:19 -0500 Subject: ext4: fix up a undefined error in ext4_free_blocks in debugging code sbi is not defined, so let ext4_free_blocks use EXT4_SB(sb) instead when EXT4FS_DEBUG is defined. Signed-off-by: Yongqiang Yang --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index f6dba45..12ccacd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -565,7 +565,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", - EXT4_B2C(sbi, ext4_free_blocks_count(es)), + EXT4_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else -- cgit v1.1 From 24a70313969fc3fc440216b40babdb42564acff3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 21 Nov 2011 09:39:11 -0500 Subject: Btrfs: remove free-space-cache.c WARN during log replay The log replay code only partially loads block groups, since the block group caching code is able to detect and deal with extents the logging code has pinned down. While the logging code is pinning down block groups, there is a bogus WARN_ON we're hitting if the code wasn't able to find an extent in the cache. This commit removes the warning because it can happen any time there isn't a valid free space cache for that block group. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aedacdb..6e5b7e4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1849,7 +1849,13 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - WARN_ON(1); + /* the tree logging code might be calling us before we + * have fully loaded the free space rbtree for this + * block group. So it is possible the entry won't + * be in the rbtree yet at all. The caching code + * will make sure not to put it in the rbtree if + * the logging code has pinned it. + */ goto out_lock; } } -- cgit v1.1 From d31da0f0ba3bc0a827a63879310818c22d9a95be Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Nov 2011 12:31:21 -0500 Subject: mount_subtree() pointless use-after-free d'oh... we'd carefully pinned mnt->mnt_sb down, dropped mnt and attempt to grab s_umount on mnt->mnt_sb. The trouble is, *mnt might've been overwritten by now... Signed-off-by: Al Viro --- fs/namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 50ee303..6d3a196 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2493,6 +2493,7 @@ EXPORT_SYMBOL(create_mnt_ns); struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) { struct mnt_namespace *ns; + struct super_block *s; struct path path; int err; @@ -2509,10 +2510,11 @@ struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ - atomic_inc(&path.mnt->mnt_sb->s_active); + s = path.mnt->mnt_sb; + atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ - down_write(&path.mnt->mnt_sb->s_umount); + down_write(&s->s_umount); /* ... and return the root of (sub)tree on it */ return path.dentry; } -- cgit v1.1 From b59db43ad4434519feb338eacb01d77eb50825c5 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 21 Nov 2011 17:31:02 -0600 Subject: eCryptfs: Prevent file create race condition The file creation path prematurely called d_instantiate() and unlock_new_inode() before the eCryptfs inode info was fully allocated and initialized and before the eCryptfs metadata was written to the lower file. This could result in race conditions in subsequent file and inode operations leading to unexpected error conditions or a null pointer dereference while attempting to use the unallocated memory. https://launchpad.net/bugs/813146 Signed-off-by: Tyler Hicks Cc: stable@kernel.org --- fs/ecryptfs/crypto.c | 22 +++++++++--------- fs/ecryptfs/ecryptfs_kernel.h | 5 +++-- fs/ecryptfs/inode.c | 52 ++++++++++++++++++++++++++----------------- 3 files changed, 46 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 58609bd..203a1fd 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -967,7 +967,7 @@ static void ecryptfs_set_default_crypt_stat_vals( /** * ecryptfs_new_file_context - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_inode: The eCryptfs inode * * If the crypto context for the file has not yet been established, * this is where we do that. Establishing a new crypto context @@ -984,13 +984,13 @@ static void ecryptfs_set_default_crypt_stat_vals( * * Returns zero on success; non-zero otherwise */ -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry) +int ecryptfs_new_file_context(struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; + ecryptfs_inode->i_sb)->mount_crypt_stat; int cipher_name_len; int rc = 0; @@ -1299,12 +1299,12 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, +ecryptfs_write_metadata_to_contents(struct inode *ecryptfs_inode, char *virt, size_t virt_len) { int rc; - rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, + rc = ecryptfs_write_lower(ecryptfs_inode, virt, 0, virt_len); if (rc < 0) printk(KERN_ERR "%s: Error attempting to write header " @@ -1338,7 +1338,8 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, /** * ecryptfs_write_metadata - * @ecryptfs_dentry: The eCryptfs dentry + * @ecryptfs_dentry: The eCryptfs dentry, which should be negative + * @ecryptfs_inode: The newly created eCryptfs inode * * Write the file headers out. This will likely involve a userspace * callout, in which the session key is encrypted with one or more @@ -1348,10 +1349,11 @@ static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, * * Returns zero on success; non-zero on error */ -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; unsigned int order; char *virt; size_t virt_len; @@ -1391,7 +1393,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, size); else - rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + rc = ecryptfs_write_metadata_to_contents(ecryptfs_inode, virt, virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 54481a3..a9f29b1 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -584,9 +584,10 @@ int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode); int ecryptfs_encrypt_page(struct page *page); int ecryptfs_decrypt_page(struct page *page); -int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); +int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); -int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +int ecryptfs_new_file_context(struct inode *ecryptfs_inode); void ecryptfs_write_crypt_stat_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, size_t *written); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a36d327..32f90a3 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -172,22 +172,23 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, * it. It will also update the eCryptfs directory inode to mimic the * stat of the lower directory inode. * - * Returns zero on success; non-zero on error condition + * Returns the new eCryptfs inode on success; an ERR_PTR on error condition */ -static int +static struct inode * ecryptfs_do_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode) { int rc; struct dentry *lower_dentry; struct dentry *lower_dir_dentry; + struct inode *inode; lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); - rc = PTR_ERR(lower_dir_dentry); + inode = ERR_CAST(lower_dir_dentry); goto out; } rc = ecryptfs_create_underlying_file(lower_dir_dentry->d_inode, @@ -195,20 +196,19 @@ ecryptfs_do_create(struct inode *directory_inode, if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); + inode = ERR_PTR(rc); goto out_lock; } - rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - directory_inode->i_sb); - if (rc) { - ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); + inode = __ecryptfs_get_inode(lower_dentry->d_inode, + directory_inode->i_sb); + if (IS_ERR(inode)) goto out_lock; - } fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); out: - return rc; + return inode; } /** @@ -219,26 +219,26 @@ out: * * Returns zero on success */ -static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) +static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, + struct inode *ecryptfs_inode) { struct ecryptfs_crypt_stat *crypt_stat = - &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; int rc = 0; - if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { + if (S_ISDIR(ecryptfs_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); goto out; } ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); - rc = ecryptfs_new_file_context(ecryptfs_dentry); + rc = ecryptfs_new_file_context(ecryptfs_inode); if (rc) { ecryptfs_printk(KERN_ERR, "Error creating new file " "context; rc = [%d]\n", rc); goto out; } - rc = ecryptfs_get_lower_file(ecryptfs_dentry, - ecryptfs_dentry->d_inode); + rc = ecryptfs_get_lower_file(ecryptfs_dentry, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " @@ -246,10 +246,10 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry) ecryptfs_dentry->d_name.name, rc); goto out; } - rc = ecryptfs_write_metadata(ecryptfs_dentry); + rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); if (rc) printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); - ecryptfs_put_lower_file(ecryptfs_dentry->d_inode); + ecryptfs_put_lower_file(ecryptfs_inode); out: return rc; } @@ -269,18 +269,28 @@ static int ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, int mode, struct nameidata *nd) { + struct inode *ecryptfs_inode; int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose() */ - rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode); - if (unlikely(rc)) { + ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry, + mode); + if (unlikely(IS_ERR(ecryptfs_inode))) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" "lower filesystem\n"); + rc = PTR_ERR(ecryptfs_inode); goto out; } /* At this point, a file exists on "disk"; we need to make sure * that this on disk file is prepared to be an ecryptfs file */ - rc = ecryptfs_initialize_file(ecryptfs_dentry); + rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode); + if (rc) { + drop_nlink(ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); + iput(ecryptfs_inode); + goto out; + } + d_instantiate(ecryptfs_dentry, ecryptfs_inode); + unlock_new_inode(ecryptfs_inode); out: return rc; } -- cgit v1.1 From 32001d6fe9ac6b0423e674a3093aa56740849f3b Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 21 Nov 2011 17:31:29 -0600 Subject: eCryptfs: Flush file in vma close Dirty pages weren't being written back when an mmap'ed eCryptfs file was closed before the mapping was unmapped. Since f_ops->flush() is not called by the munmap() path, the lower file was simply being released. This patch flushes the eCryptfs file in the vm_ops->close() path. https://launchpad.net/bugs/870326 Signed-off-by: Tyler Hicks Cc: stable@kernel.org [2.6.39+] --- fs/ecryptfs/file.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index c6ac98c..d3f95f9 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -139,6 +139,27 @@ out: return rc; } +static void ecryptfs_vma_close(struct vm_area_struct *vma) +{ + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +static const struct vm_operations_struct ecryptfs_file_vm_ops = { + .close = ecryptfs_vma_close, + .fault = filemap_fault, +}; + +static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int rc; + + rc = generic_file_mmap(file, vma); + if (!rc) + vma->vm_ops = &ecryptfs_file_vm_ops; + + return rc; +} + struct kmem_cache *ecryptfs_file_info_cache; /** @@ -349,7 +370,7 @@ const struct file_operations ecryptfs_main_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ecryptfs_compat_ioctl, #endif - .mmap = generic_file_mmap, + .mmap = ecryptfs_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, -- cgit v1.1 From 0f751e641a71157aa584c2a2e22fda52b52b8a56 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 23 Nov 2011 11:31:24 -0600 Subject: eCryptfs: Extend array bounds for all filename chars From mhalcrow's original commit message: Characters with ASCII values greater than the size of filename_rev_map[] are valid filename characters. ecryptfs_decode_from_filename() will access kernel memory beyond that array, and ecryptfs_parse_tag_70_packet() will then decrypt those characters. The attacker, using the FNEK of the crafted file, can then re-encrypt the characters to reveal the kernel memory past the end of the filename_rev_map[] array. I expect low security impact since this array is statically allocated in the text area, and the amount of memory past the array that is accessible is limited by the largest possible ASCII filename character. This patch solves the issue reported by mhalcrow but with an implementation suggested by Linus to simply extend the length of filename_rev_map[] to 256. Characters greater than 0x7A are mapped to 0x00, which is how invalid characters less than 0x7A were previously being handled. Signed-off-by: Tyler Hicks Reported-by: Michael Halcrow Cc: stable@kernel.org --- fs/ecryptfs/crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 203a1fd..2a83425 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1945,7 +1945,7 @@ static unsigned char *portable_filename_chars = ("-.0123456789ABCD" /* We could either offset on every reverse map or just pad some 0x00's * at the front here */ -static const unsigned char filename_rev_map[] = { +static const unsigned char filename_rev_map[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ @@ -1961,7 +1961,7 @@ static const unsigned char filename_rev_map[] = { 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ - 0x3D, 0x3E, 0x3F + 0x3D, 0x3E, 0x3F /* 123 - 255 initialized to 0x00 */ }; /** -- cgit v1.1