From f65cb45cba63f249458b669aa67069eabc37b2f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 13:40:44 +0100 Subject: perfcounters: flush on setuid exec Pavel Machek pointed out that performance counters should be flushed when crossing protection domains on setuid execution. Reported-by: Pavel Machek Acked-by: Pavel Machek Signed-off-by: Ingo Molnar --- fs/exec.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index ec5df9a..d5165d8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1017,6 +1018,13 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } + /* + * Flush performance counters when crossing a + * security domain: + */ + if (!get_dumpable(current->mm)) + perf_counter_exit_task(current); + /* An exec changes our domain. We are no longer part of the thread group */ -- cgit v1.1 From 9c83633ad38138855181af6936e8ac570ef7e2cb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Apr 2009 14:48:16 +0300 Subject: missing unlock in jfs_quota_write() We should unlock &inode->i_mutex on the error path. This bug was in ext2_quota_write(). I sent a patch to them today as well. Found by smatch (http://repo.or.cz/w/smatch.git). Compile tested. regards, dan carpenter Signed-off-by: Dan Carpenter Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6f21adf..d9b0e92 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; -- cgit v1.1 From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index e015c0b..bf47ed0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); + perf_counter_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) -- cgit v1.1 From 79ffab34391933ee3b95dac7f25c0478fa2f8f1e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 15:13:42 -0400 Subject: ext4: Properly initialize the buffer_head state These struct buffer_heads are allocated on the stack (and hence are initialized with stack garbage). They are only used to call a get_blocks() function, so that's mostly OK, but b_state must be initialized to be 0 so we don't have any unexpected BH_* flags set by accident, such as BH_Unwritten or BH_Delay. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 15 ++++++++++++++- fs/mpage.c | 6 ++++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e3a55eb..a953214 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3150,6 +3150,7 @@ retry: ret = PTR_ERR(handle); break; } + map_bh.b_state = 0; ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a9ffd5..d7ad0bb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2055,7 +2055,20 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if ((mpd->b_state & (1 << BH_Mapped)) && !(mpd->b_state & (1 << BH_Delay))) return 0; - new.b_state = mpd->b_state; + /* + * We need to make sure the BH_Delay flag is passed down to + * ext4_da_get_block_write(), since it calls + * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag. + * This flag causes ext4_get_blocks_wrap() to call + * ext4_da_update_reserve_space() if the passed buffer head + * has the BH_Delay flag set. In the future, once we clean up + * the interfaces to ext4_get_blocks_wrap(), we should pass in + * a separate flag which requests that the delayed allocation + * statistics should be updated, instead of depending on the + * state information getting passed down via the map_bh's + * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag. + */ + new.b_state = mpd->b_state & (1 << BH_Delay); new.b_blocknr = 0; new.b_size = mpd->b_size; next = mpd->b_blocknr; diff --git a/fs/mpage.c b/fs/mpage.c index 680ba60..42381bd 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); @@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block) struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block); if (bio) -- cgit v1.1 From 8fb0e342481c4d80040670fec915f0b9c7c6499a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 16:22:37 -0400 Subject: vfs: Add BUG_ON for delayed and unwritten flags in submit_bh() The BH_Delay and BH_Unwritten flags should never leak out to submit_bh(). So add some BUG_ON() checks to submit_bh so we can get a stack trace and determine how and why this might have happened. (Note that only XFS and ext4 use these buffer head flags, and XFS does not use submit_bh(). So this patch should only modify behavior for ext4.) Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index aed2977..ad01129 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2933,6 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); /* * Mask in barrier bit for a write (could be either a WRITE or a -- cgit v1.1 From 29fa89d088941d79765d60f22d5ccdd6b8696e11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 16:30:27 -0400 Subject: ext4: Mark the unwritten buffer_head as mapped during write_begin Setting BH_Unwritten buffer_heads as BH_Mapped avoids multiple (unnecessary) calls to get_block() during the call to the write(2) system call. Setting BH_Unwritten buffer heads as BH_Mapped requires that the writepages() functions can handle BH_Unwritten buffer_heads. After this commit, things work as follows: ext4_ext_get_block() returns unmapped, unwritten, buffer head when called with create = 0 for prealloc space. This makes sure we handle the read path and non-delayed allocation case correctly. Even though the buffer head is marked unmapped we have valid b_blocknr and b_bdev values in the buffer_head. ext4_da_get_block_prep() called for block resrevation will now return mapped, unwritten, new buffer_head for prealloc space. This avoids multiple calls to get_block() for write to same offset. By making such buffers as BH_New, we also assure that sub-block zeroing of buffered writes happens correctly. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 +-- fs/ext4/inode.c | 82 +++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 54 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a953214..ea5c476 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2872,6 +2872,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (create == EXT4_CREATE_UNINITIALIZED_EXT) goto out; if (!create) { + if (allocated > max_blocks) + allocated = max_blocks; /* * We have blocks reserved already. We * return allocated blocks so that delalloc @@ -2879,8 +2881,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * the buffer head will be unmapped so that * a read from the block returns 0s. */ - if (allocated > max_blocks) - allocated = max_blocks; set_buffer_unwritten(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7ad0bb..96f3366 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1852,7 +1852,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) * @logical - first logical block to start assignment with * * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay + * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten */ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *exbh) @@ -1902,16 +1902,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { - bh->b_blocknr = pblock; - clear_buffer_delay(bh); - bh->b_bdev = inode->i_sb->s_bdev; - } else if (buffer_unwritten(bh)) { - bh->b_blocknr = pblock; - clear_buffer_unwritten(bh); - set_buffer_mapped(bh); - set_buffer_new(bh); - bh->b_bdev = inode->i_sb->s_bdev; + + if (buffer_delay(bh) || + buffer_unwritten(bh)) { + + BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); + + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } else { + /* + * unwritten already should have + * blocknr assigned. Verify that + */ + clear_buffer_unwritten(bh); + BUG_ON(bh->b_blocknr != pblock); + } + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -2053,7 +2061,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * We consider only non-mapped and non-allocated blocks */ if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay))) + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten))) return 0; /* * We need to make sure the BH_Delay flag is passed down to @@ -2205,6 +2214,17 @@ flush_it: return; } +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation. + * We also need to consider unwritten buffer as unmapped. + */ + return (!buffer_mapped(bh) || buffer_delay(bh) || + buffer_unwritten(bh)) && buffer_dirty(bh); +} + /* * __mpage_da_writepage - finds extent of pages and blocks * @@ -2289,8 +2309,7 @@ static int __mpage_da_writepage(struct page *page, * Otherwise we won't make progress * with the page in ext4_da_writepage */ - if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { + if (ext4_bh_unmapped_or_delay(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, bh->b_size, bh->b_state); @@ -2318,6 +2337,14 @@ static int __mpage_da_writepage(struct page *page, /* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. + * */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2353,28 +2380,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, set_buffer_delay(bh_result); } else if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); - /* - * With sub-block writes into unwritten extents - * we also need to mark the buffer as new so that - * the unwritten parts of the buffer gets correctly zeroed. - */ - if (buffer_unwritten(bh_result)) + if (buffer_unwritten(bh_result)) { + /* A delayed write to unwritten bh should + * be marked new and mapped. Mapped ensures + * that we don't do get_block multiple times + * when we write to the same offset and new + * ensures that we do proper zero out for + * partial write. + */ set_buffer_new(bh_result); + set_buffer_mapped(bh_result); + } ret = 0; } return ret; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - /* - * unmapped buffer is possible for holes. - * delay buffer is possible with delayed allocation - */ - return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); -} - static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2828,7 +2850,7 @@ static int ext4_da_should_update_i_disksize(struct page *page, for (i = 0; i < idx; i++) bh = bh->b_this_page; - if (!buffer_mapped(bh) || (buffer_delay(bh))) + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } -- cgit v1.1 From c5ca7c7636fa689a9746b6032f83aa7fffec31c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Apr 2009 22:48:48 -0400 Subject: ext4: Fallback to vmalloc if kmalloc can't allocate s_flex_groups array For very large filesystems, the s_flex_groups array can get quite big. For example, a filesystem that can be resized up to 16TB will have 8192 flex groups (assuming the default flex_bg size of 16), so the array is 96k, which is *very* marginal for kmalloc(). On the other hand, a 160GB filesystem without the resize_inode feature will only require 960 bytes. So we try to allocate the array first using kmalloc(), and if that fails, we'll try to use vmalloc() instead. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e..3f4475d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -586,7 +587,10 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_flex_groups); + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1620,6 +1624,7 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; + size_t size; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1634,8 +1639,13 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - sbi->s_flex_groups = kzalloc(flex_group_count * - sizeof(struct flex_groups), GFP_KERNEL); + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + sbi->s_flex_groups = vmalloc(size); + if (sbi->s_flex_groups) + memset(sbi->s_flex_groups, 0, size); + } if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " "%u flex groups\n", flex_group_count); @@ -2842,6 +2852,12 @@ failed_mount4: sbi->s_journal = NULL; } failed_mount3: + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + } percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); -- cgit v1.1 From f7c439504ccba0cca43271e651013ab97a221c62 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 23:31:59 -0400 Subject: ext4: Use is_power_of_2() for clarity Signed-off-by: Robert P. J. Day Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f4475d..3e509bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1483,7 +1483,7 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; - if (option & (option - 1)) { + if (!is_power_of_2(option)) { printk(KERN_ERR "EXT4-fs: inode_readahead_blks" " must be a power of 2\n"); return 0; @@ -2101,8 +2101,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, if (parse_strtoul(buf, 0x40000000, &t)) return -EINVAL; - /* inode_readahead_blks must be a power of 2 */ - if (t & (t-1)) + if (!is_power_of_2(t)) return -EINVAL; sbi->s_inode_readahead_blks = t; -- cgit v1.1 From e2d670523c6c4ccb0fca9f3ab1b8f066d9aa57d6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 00:33:44 -0400 Subject: ext4: Simplify ext4_commit_super()'s function signature The ext4_commit_super() function took both a struct super_block * and a struct ext4_super_block *, but the struct ext4_super_block can be derived from the struct super_block. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e509bc..ad4c9be 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -54,8 +54,7 @@ static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync); +static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static void ext4_clear_journal_err(struct super_block *sb, @@ -306,7 +305,7 @@ static void ext4_handle_error(struct super_block *sb) printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); @@ -448,7 +447,7 @@ __acquires(bitlock) if (test_opt(sb, ERRORS_CONT)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 0); + ext4_commit_super(sb, 0); return; } ext4_unlock_group(sb, grp); @@ -577,7 +576,7 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); @@ -1596,7 +1595,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", @@ -2655,7 +2654,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, ERRORS_PANIC)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); goto failed_mount4; } } @@ -3132,15 +3131,15 @@ static int ext4_load_journal(struct super_block *sb, sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } return 0; } -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync) +static int ext4_commit_super(struct super_block *sb, int sync) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; @@ -3212,7 +3211,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); sb->s_dirt = 0; - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } unlock_super(sb); @@ -3253,7 +3252,7 @@ static void ext4_clear_journal_err(struct super_block *sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); } @@ -3293,7 +3292,7 @@ static void ext4_write_super(struct super_block *sb) BUG(); sb->s_dirt = 0; } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + ext4_commit_super(sb, 1); } } @@ -3312,7 +3311,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) target); } } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + ext4_commit_super(sb, wait); } return ret; } @@ -3345,7 +3344,7 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, 1); if (error) goto out; } @@ -3365,7 +3364,7 @@ static int ext4_unfreeze(struct super_block *sb) lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + ext4_commit_super(sb, 1); unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } @@ -3520,7 +3519,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } if (sbi->s_journal == NULL) - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA /* Release old quota file names */ -- cgit v1.1 From 7234ab2a55e77784b44cf2d862136d9e41b8d98a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 30 Apr 2009 21:24:04 -0400 Subject: ext4: Fix and simplify s_dirt handling The s_dirt flag wasn't completely handled correctly, but it didn't really matter when journalling was enabled. It turns out that when ext4 runs without a journal, we don't clear s_dirt in places where we should have, with the result that the high-level write_super() function was writing the superblock when it wasn't necessary. So we fix this by making ext4_commit_super() clear the s_dirt flag, and removing many of the other places where s_dirt is manipulated. When journalling is enabled, the s_dirt flag might be left set more often, but s_dirt really doesn't matter when journalling is enabled. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad4c9be..7c7a08a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3128,7 +3128,6 @@ static int ext4_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext4_commit_super(sb, 1); @@ -3168,7 +3167,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); - + sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { @@ -3210,7 +3209,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; ext4_commit_super(sb, 1); } unlock_super(sb); @@ -3271,10 +3269,8 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - sb->s_dirt = 0; + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } @@ -3282,15 +3278,13 @@ int ext4_force_commit(struct super_block *sb) /* * Ext4 always journals updates to the superblock itself, so we don't * have to propagate any other updates to the superblock on disk at this - * point. (We can probably nuke this function altogether, and remove - * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) + * point if the journalling is enabled. */ static void ext4_write_super(struct super_block *sb) { if (EXT4_SB(sb)->s_journal) { if (mutex_trylock(&sb->s_lock) != 0) BUG(); - sb->s_dirt = 0; } else { ext4_commit_super(sb, 1); } @@ -3302,7 +3296,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { @@ -3324,7 +3317,6 @@ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal; - sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { journal = EXT4_SB(sb)->s_journal; -- cgit v1.1 From 9ca92389c5312a51e819c15c762f0abdc7f3129b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 12:52:25 -0400 Subject: ext4: Use separate super_operations structure for no_journal filesystems By using a separate super_operations structure for filesystems that have and don't have journals, we can simply ext4_write_super() --- which is only needed when no journal is present --- and ext4_freeze(), ext4_unfreeze(), and ext4_sync_fs(), which are only needed when the journal is present. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 108 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7c7a08a..68c3a44 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -995,7 +995,6 @@ static const struct super_operations ext4_sops = { .dirty_inode = ext4_dirty_inode, .delete_inode = ext4_delete_inode, .put_super = ext4_put_super, - .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, @@ -1010,6 +1009,25 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .delete_inode = ext4_delete_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .clear_inode = ext4_clear_inode, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -2615,7 +2633,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - sb->s_op = &ext4_sops; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -3275,19 +3297,9 @@ int ext4_force_commit(struct super_block *sb) return ret; } -/* - * Ext4 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point if the journalling is enabled. - */ static void ext4_write_super(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - } else { - ext4_commit_super(sb, 1); - } + ext4_commit_super(sb, 1); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -3296,15 +3308,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - if (EXT4_SB(sb)->s_journal) { - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, - &target)) { - if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, - target); - } - } else { - ext4_commit_super(sb, wait); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); } return ret; } @@ -3318,32 +3324,31 @@ static int ext4_freeze(struct super_block *sb) int error = 0; journal_t *journal; - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & MS_RDONLY) + return 0; - if (journal) { - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + journal = EXT4_SB(sb)->s_journal; - /* - * We don't want to clear needs_recovery flag when we - * failed to flush the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; - } + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, 1); - if (error) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) { + out: + jbd2_journal_unlock_updates(journal); + return error; } + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); + if (error) + goto out; return 0; -out: - jbd2_journal_unlock_updates(journal); - return error; } /* @@ -3352,14 +3357,15 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { - lock_super(sb); - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, 1); - unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return 0; } -- cgit v1.1 From 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 08:50:38 -0400 Subject: ext4: Avoid races caused by on-line resizing and SMP memory reordering Ext4's on-line resizing adds a new block group and then, only at the last step adjusts s_groups_count. However, it's possible on SMP systems that another CPU could see the updated the s_group_count and not see the newly initialized data structures for the just-added block group. For this reason, it's important to insert a SMP read barrier after reading s_groups_count and before reading any (for example) the new block group descriptors allowed by the increased value of s_groups_count. Unfortunately, we rather blatently violate this locking protocol documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes happen relatively rarely, and (2) it seems rare that the filesystem code will immediately try to use just-added block group before any memory ordering issues resolve themselves. So apparently problems here are relatively hard to hit, since ext3 has been vulnerable to the same issue for years with no one apparently complaining. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 15 +++++++-------- fs/ext4/ext4.h | 12 ++++++++++++ fs/ext4/ialloc.c | 40 +++++++++++++++++++--------------------- fs/ext4/inode.c | 7 ++++--- fs/ext4/mballoc.c | 45 ++++++++++++++++++++++++--------------------- fs/ext4/super.c | 3 +-- 6 files changed, 67 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 53c72ad..a5ba039 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -88,6 +88,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { int bit, bit_max; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned free_blocks, group_blocks; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -123,7 +124,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == sbi->s_groups_count - 1) { + if (block_group == ngroups - 1) { /* * Even though mke2fs always initialize first and last group * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need @@ -131,7 +132,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ group_blocks = ext4_blocks_count(sbi->s_es) - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); + (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -205,18 +206,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, { unsigned int group_desc; unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (block_group >= sbi->s_groups_count) { + if (block_group >= ngroups) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " "block_group = %u, groups_count = %u", - block_group, sbi->s_groups_count); + block_group, ngroups); return NULL; } - smp_rmb(); group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); @@ -665,7 +666,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; @@ -677,7 +678,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count = 0; gdp = NULL; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) @@ -700,7 +700,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) return bitmap_count; #else desc_count = 0; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d0f15ef..02ec44b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1228,6 +1228,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, return grp_info[indexv][indexh]; } +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a0..55ba419 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -316,7 +316,7 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent, ext4_group_t *best_group) { - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned int freei, avefreei; struct ext4_group_desc *desc, *best_desc = NULL; ext4_group_t group; @@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; @@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, ext4_group_t n_fbg_groups; ext4_group_t i; - n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + n_fbg_groups = (ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; find_close_to_parent: @@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + ngroups = real_ngroups; if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> + ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } @@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, */ grp *= flex_size; for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) + if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { @@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } fallback: - ngroups = sbi->s_groups_count; + ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; @@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; - ext4_group_t i, last; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* @@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; - ext4_group_t group = 0; + ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; - struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; @@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); - es = sbi->s_es; if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); @@ -856,7 +854,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -917,7 +915,7 @@ repeat_in_this_group: * group descriptor metadata has not yet been updated. * So we just go onto the next blockgroup. */ - if (++group == sbi->s_groups_count) + if (++group == ngroups) group = 0; } err = -ENOSPC; @@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; @@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count = 0; bitmap_count = 0; gdp = NULL; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) return desc_count; #else desc_count = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 96f3366..4e7f363 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4965,7 +4965,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) */ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { - int groups, gdpblocks; + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; int idxblocks; int ret = 0; @@ -4992,8 +4993,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) groups += nrblocks; gdpblocks = groups; - if (groups > EXT4_SB(inode->i_sb)->s_groups_count) - groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > ngroups) + groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f871677..c3af9e6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore) { + ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; @@ -757,6 +758,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) inode = page->mapping->host; sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; @@ -780,7 +782,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) for (i = 0; i < groups_per_page; i++) { struct ext4_group_desc *desc; - if (first_group + i >= EXT4_SB(sb)->s_groups_count) + if (first_group + i >= ngroups) break; err = -EIO; @@ -852,7 +854,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct ext4_group_info *grinfo; group = (first_block + i) >> 1; - if (group >= EXT4_SB(sb)->s_groups_count) + if (group >= ngroups) break; /* @@ -1788,6 +1790,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) int block, pnum; int blocks_per_page; int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t first_group; struct ext4_group_info *grp; @@ -1807,7 +1810,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) /* read all groups the page covers into the cache */ for (i = 0; i < groups_per_page; i++) { - if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + if ((first_group + i) >= ngroups) break; grp = ext4_get_group_info(sb, first_group + i); /* take all groups write allocation @@ -1945,8 +1948,7 @@ err: static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t group; - ext4_group_t i; + ext4_group_t ngroups, group, i; int cr; int err = 0; int bsbits; @@ -1957,6 +1959,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sb = ac->ac_sb; sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -2017,11 +2020,11 @@ repeat: */ group = ac->ac_g_ex.fe_group; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { + for (i = 0; i < ngroups; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; - if (group == EXT4_SB(sb)->s_groups_count) + if (group == ngroups) group = 0; /* quick check to skip empty groups */ @@ -2315,12 +2318,10 @@ static struct file_operations ext4_mb_seq_history_fops = { static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; - group = *pos + 1; return (void *) ((unsigned long) group); } @@ -2328,11 +2329,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ++*pos; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); @@ -2587,6 +2587,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) static int ext4_mb_init_backend(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2598,7 +2599,7 @@ static int ext4_mb_init_backend(struct super_block *sb) struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ - num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); /* @@ -2644,7 +2645,7 @@ static int ext4_mb_init_backend(struct super_block *sb) for (i = 0; i < num_meta_group_infos; i++) { if ((i + 1) == num_meta_group_infos) metalen = sizeof(*meta_group_info) * - (sbi->s_groups_count - + (ngroups - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { @@ -2655,7 +2656,7 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR @@ -2781,13 +2782,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) int ext4_mb_release(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_group_info) { - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); @@ -2797,7 +2799,7 @@ int ext4_mb_release(struct super_block *sb) ext4_unlock_group(sb, i); kfree(grinfo); } - num_meta_group_infos = (sbi->s_groups_count + + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) @@ -4121,7 +4123,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t i; + ext4_group_t ngroups, i; printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); @@ -4145,7 +4147,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, ac->ac_found); printk(KERN_ERR "EXT4-fs: groups: \n"); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; @@ -4469,13 +4472,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", sb->s_id, needed); - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68c3a44..fcd7b24 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3557,9 +3557,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t ngroups = sbi->s_groups_count, i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - smp_rmb(); /* * Compute the overhead (FS structures). This is constant -- cgit v1.1 From 114e9fc90703bd6aac0229fb559e97caa6c49770 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 15:48:07 -0400 Subject: ext4: Remove outdated comment about lock_super() ext4_fill_super() is no longer called by read_super(), and it is no longer called with the superblock locked. The unlock_super()/lock_super() is no longer present, so this comment is entirely superfluous. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fcd7b24..e3b35f2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2828,14 +2828,6 @@ no_journal: goto failed_mount4; }; - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; -- cgit v1.1 From a63c9eb2ce6f5028da90f282798232c4f398ceb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 01:59:42 -0400 Subject: ext4: ext4_mark_recovery_complete() doesn't need to use lock_super The function ext4_mark_recovery_complete() is called from two call paths: either (a) while mounting the filesystem, in which case there's no danger of any other CPU calling write_super() until the mount is completed, and (b) while remounting the filesystem read-write, in which case the fs core has already locked the superblock. This also allows us to take out a very vile unlock_super()/lock_super() pair in ext4_remount(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e3b35f2..45d0ada 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3219,13 +3219,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); } - unlock_super(sb); out: jbd2_journal_unlock_updates(journal); @@ -3436,15 +3434,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - if (sbi->s_journal) { - unlock_super(sb); + if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); - lock_super(sb); - } } else { int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, -- cgit v1.1 From 3b9d4ed26680771295d904a6b83e88e620780893 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 22:54:04 -0400 Subject: ext4: Replace lock/unlock_super() with an explicit lock for the orphan list Use a separate lock to protect the orphan list, so we can stop overloading the use of lock_super(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 1 + fs/ext4/namei.c | 20 +++++++++++--------- fs/ext4/super.c | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 57b71fe..4bda2f7 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -71,6 +71,7 @@ struct ext4_sb_info { struct inode *s_journal_inode; struct journal_s *s_journal; struct list_head s_orphan; + struct mutex s_orphan_lock; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 22098e1..8018e49 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on lock_super()), so race with ext4_link() which might bump + * here (on s_orphan_lock), so race with ext4_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); ext4_std_error(inode->i_sb, err); return err; } @@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) out_err: ext4_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45d0ada..7f43fde 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2645,6 +2645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->dq_op = &ext4_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); sb->s_root = NULL; -- cgit v1.1 From 32ed5058ce90024efcd811254b4b1de0468099df Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 22:53:39 -0400 Subject: ext4: Replace lock/unlock_super() with an explicit lock for resizing Use a separate lock to protect s_groups_count and the other block group descriptors which get changed via an on-line resize operation, so we can stop overloading the use of lock_super(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 1 + fs/ext4/resize.c | 35 ++++++++++++++++++----------------- fs/ext4/super.c | 1 + 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 4bda2f7..2d36223 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -72,6 +72,7 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; + struct mutex s_resize_lock; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 546c7dd..e8ded13 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -193,7 +193,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -302,7 +302,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -643,11 +643,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -809,7 +810,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -840,7 +841,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -948,7 +949,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sb->s_dirt = 1; exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -986,7 +987,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); o_groups_count = EXT4_SB(sb)->s_groups_count; @@ -1056,11 +1057,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1070,14 +1071,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f43fde..1fbf090 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2646,6 +2646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; -- cgit v1.1 From db2dbb12dc47a50c7a4c5678f526014063e486f6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Apr 2009 14:08:13 +0200 Subject: block: implement blkdev_readpages Doing a proper block dev ->readpages() speeds up the crazy dump(8) approach of using interleaved process IO. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index f45dbc1..a85fe31 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -331,6 +331,12 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } +static int blkdev_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); +} + static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1399,6 +1405,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, + .readpages = blkdev_readpages, .writepage = blkdev_writepage, .sync_page = block_sync_page, .write_begin = blkdev_write_begin, -- cgit v1.1 From 75507efb1372b6acf1aa6bf00ebd49ce196fd994 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 12:58:36 -0400 Subject: ext4: Don't avoid using BLOCK_UNINIT block groups in mballoc By avoiding the use of not-yet-used block groups (i.e., block groups with the BLOCK_UNINIT flag), mballoc had a tendency to create large files with large non-contiguous gaps. In addition avoiding the use of new block groups had a tendency to push regular file data into the first block group in a flex_bg group, which slows down the speed of e2fsck pass 2, since it has a tendency to seek much more. For example: Before Patch After Patch Time in seconds Time in seconds Real / User/ Sys MB/s Real / User/ Sys MB/s Pass 1 8.52 / 2.21 / 0.46 20.43 8.84 / 4.97 / 1.11 19.68 Pass 2 21.16 / 1.02 / 1.86 11.30 6.54 / 1.77 / 1.78 36.39 Pass 3 0.01 / 0.00 / 0.00 139.00 0.01 / 0.01 / 0.00 128.90 Pass 4 0.16 / 0.15 / 0.00 0.00 0.17 / 0.17 / 0.00 0.00 Pass 5 2.52 / 1.99 / 0.09 0.79 2.31 / 1.78 / 0.06 0.86 Total 32.40 / 5.11 / 2.49 12.81 17.99 / 8.75 / 2.98 23.01 This was on a sample 80 gig root filesystem which was approximately 50% full. Note the improved e2fsck pass 2 performance, by over a factor of 3, due to a decreased number of seeks. (The total amount of I/O in pass 2 was unchanged; the layout of the directory blocks was simply much better from e2fsck's's perspective.) Other changes as a result of this patch on this sample filesystem: Before Patch After Patch # of non-contig files 762 779 # of non-contig directories 571 570 # of BLOCK_UNINIT bg's 307 293 # of INODE_UNINIT bg's 503 503 Out of 640 block groups, of which 333 were in use, this patch caused an extra 14 block groups to be utilized. The number of non-contiguous files did go up slightly, but when measured against the 99.9% of the files (603,154) which were contiguously allocated, this is pretty insignificant. Signed-off-by: "Theodore Ts'o" Signed-off-by: Andreas Dilger --- fs/ext4/mballoc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c3af9e6..dbd47ea 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1728,7 +1728,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, unsigned free, fragments; unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); - struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); @@ -1744,10 +1743,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); - /* If this group is uninitialized, skip it initially */ - desc = ext4_get_group_desc(ac->ac_sb, group, NULL); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) - return 0; /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && @@ -2067,9 +2062,7 @@ repeat: ac->ac_groups_scanned++; desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0 || (desc->bg_flags & - cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && - ac->ac_2order != 0)) + if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) -- cgit v1.1 From d444c3c38189b3f18337a213855ac1c07af4e2d9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 13:44:33 -0400 Subject: ext4: Move the ext4_i.h header file into ext4.h There is no longer a reason for a separate ext4_i.h header file, so move it into ext4.h just to make life easier for developers to find the relevant data structures and typedefs. Should also speed up compiles slightly, too. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 122 +++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/ext4_i.h | 140 ------------------------------------------------------- 2 files changed, 121 insertions(+), 141 deletions(-) delete mode 100644 fs/ext4/ext4_i.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 02ec44b..ba57d66 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -21,7 +21,10 @@ #include #include #include -#include "ext4_i.h" +#include +#include +#include +#include /* * The fourth extended filesystem constants/structures @@ -46,6 +49,19 @@ #define ext4_debug(f, a...) do {} while (0) #endif +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + + /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -516,6 +532,110 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ /* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode jinode; + + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + + spinlock_t i_block_reservation_lock; +}; + +/* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h deleted file mode 100644 index 4ce2187..0000000 --- a/fs/ext4/ext4_i.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * ext4_i.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_i.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_I -#define _EXT4_I - -#include -#include -#include -#include - -/* data type for block offset of block group */ -typedef int ext4_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long long ext4_fsblk_t; - -/* data type for file logical block number */ -typedef __u32 ext4_lblk_t; - -/* data type for block group number */ -typedef unsigned int ext4_group_t; - -/* - * storage for cached extent - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; -}; - -/* - * fourth extended file system inode data in memory - */ -struct ext4_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_flags; - ext4_fsblk_t i_file_acl; - __u32 i_dtime; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. - */ - ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ - - ext4_lblk_t i_dir_start_lookup; -#ifdef CONFIG_EXT4_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext4_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext4_get_block (growth) and ext4_truncate (shrinkth). - */ - loff_t i_disksize; - - /* - * i_data_sem is for serialising ext4_truncate() against - * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext4 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have i_data_sem. - */ - struct rw_semaphore i_data_sem; - struct inode vfs_inode; - struct jbd2_inode jinode; - - struct ext4_ext_cache i_cached_extent; - /* - * File creation time. Its function is same as that of - * struct timespec i_{a,c,m}time in the generic inode. - */ - struct timespec i_crtime; - - /* mballoc */ - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; - - /* ialloc */ - ext4_group_t i_last_alloc_group; - - /* allocation reservation info for delalloc */ - unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - - /* on-disk additional length */ - __u16 i_extra_isize; - - spinlock_t i_block_reservation_lock; -}; - -#endif /* _EXT4_I */ -- cgit v1.1 From ca0faba0e8ac844dc0279825eb8db876b5962ea5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 3 May 2009 16:33:44 -0400 Subject: ext4: Move the ext4_sb.h header file into ext4.h There is no longer a reason for a separate ext4_sb.h header file, so move it into ext4.h just to make life easier for developers to find the relevant data structures and typedefs. Should also speed up compiles slightly, too. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 144 +++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/ext4_sb.h | 163 ------------------------------------------------------ 2 files changed, 140 insertions(+), 167 deletions(-) delete mode 100644 fs/ext4/ext4_sb.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ba57d66..af3c906 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -25,6 +25,10 @@ #include #include #include +#include +#include +#include +#include /* * The fourth extended filesystem constants/structures @@ -195,9 +199,6 @@ struct flex_groups { #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ -#ifdef __KERNEL__ -#include "ext4_sb.h" -#endif /* * Macro-instructions used to manage group descriptors */ @@ -809,6 +810,136 @@ struct ext4_super_block { }; #ifdef __KERNEL__ +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct inode *s_journal_inode; + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + spinlock_t s_md_lock; + tid_t s_last_transaction; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; +}; + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; @@ -824,7 +955,6 @@ static inline struct timespec ext4_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -833,6 +963,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } + +static inline spinlock_t * +sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h deleted file mode 100644 index 2d36223..0000000 --- a/fs/ext4/ext4_sb.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * ext4_sb.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_sb.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_SB -#define _EXT4_SB - -#ifdef __KERNEL__ -#include -#include -#include -#include -#endif -#include - -/* - * fourth extended-fs super-block data in memory - */ -struct ext4_sb_info { - unsigned long s_desc_size; /* Size of a group descriptor in bytes */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - ext4_group_t s_groups_count; /* Number of groups in the fs */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ - struct buffer_head **s_group_desc; - unsigned long s_mount_opt; - ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - unsigned int s_inode_readahead_blks; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock *s_blockgroup_lock; - struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; - - /* Journaling */ - struct inode *s_journal_inode; - struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; - struct mutex s_resize_lock; - unsigned long s_commit_interval; - u32 s_max_batch_time; - u32 s_min_batch_time; - struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif -#ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif - unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - -#ifdef EXTENTS_STATS - /* ext4 extents stats */ - unsigned long s_ext_min; - unsigned long s_ext_max; - unsigned long s_depth_max; - spinlock_t s_ext_stats_lock; - unsigned long s_ext_blocks; - unsigned long s_ext_extents; -#endif - - /* for buddy allocator */ - struct ext4_group_info ***s_group_info; - struct inode *s_buddy_cache; - long s_blocks_reserved; - spinlock_t s_reserve_lock; - spinlock_t s_md_lock; - tid_t s_last_transaction; - unsigned short *s_mb_offsets; - unsigned int *s_mb_maxs; - - /* tunables */ - unsigned long s_stripe; - unsigned int s_mb_stream_request; - unsigned int s_mb_max_to_scan; - unsigned int s_mb_min_to_scan; - unsigned int s_mb_stats; - unsigned int s_mb_order2_reqs; - unsigned int s_mb_group_prealloc; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; - - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - - /* stats for buddy allocator */ - spinlock_t s_mb_pa_lock; - atomic_t s_bal_reqs; /* number of reqs with len > 1 */ - atomic_t s_bal_success; /* we found long enough chunks */ - atomic_t s_bal_allocated; /* in blocks */ - atomic_t s_bal_ex_scanned; /* total extents scanned */ - atomic_t s_bal_goals; /* goal hits */ - atomic_t s_bal_breaks; /* too long searches */ - atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; - atomic_t s_mb_lost_chunks; - atomic_t s_mb_preallocated; - atomic_t s_mb_discarded; - - /* locality groups */ - struct ext4_locality_group *s_locality_groups; - - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - - unsigned int s_log_groups_per_flex; - struct flex_groups *s_flex_groups; -}; - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} - -#endif /* _EXT4_SB */ -- cgit v1.1 From 596397b77c895d0fa3674f579c94ad5ea88ef01d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 13:49:15 -0400 Subject: ext4: Move fs/ext4/namei.h into ext4.h The fs/ext4/namei.h header file had only a single function declaration, and should have never been a standalone file. Move it into ext4.h, where should have been from the beginning. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/namei.c | 1 - fs/ext4/namei.h | 8 -------- fs/ext4/super.c | 1 - 4 files changed, 1 insertion(+), 10 deletions(-) delete mode 100644 fs/ext4/namei.h (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af3c906..d9c5251 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1594,6 +1594,7 @@ extern const struct file_operations ext4_file_operations; /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8018e49..c9690b2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -37,7 +37,6 @@ #include "ext4.h" #include "ext4_jbd2.h" -#include "namei.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h deleted file mode 100644 index 5e4dfff..0000000 --- a/fs/ext4/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext4/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks - * -*/ - -extern struct dentry *ext4_get_parent(struct dentry *child); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1fbf090..d79e1c4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "namei.h" #include "group.h" struct proc_dir_entry *ext4_proc_root; -- cgit v1.1 From bb23c20a851a5038b255a3c0d0aa56093c1da3f8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 19:44:44 -0400 Subject: ext4: Move fs/ext4/group.h into ext4.h Move the function prototypes in group.h into ext4.h so they are all defined in one place. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 1 - fs/ext4/ext4.h | 17 +++++++++++++++++ fs/ext4/group.h | 29 ----------------------------- fs/ext4/ialloc.c | 1 - fs/ext4/mballoc.h | 1 - fs/ext4/resize.c | 1 - fs/ext4/super.c | 1 - 7 files changed, 17 insertions(+), 34 deletions(-) delete mode 100644 fs/ext4/group.h (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a5ba039..92f557d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -19,7 +19,6 @@ #include #include "ext4.h" #include "ext4_jbd2.h" -#include "group.h" #include "mballoc.h" /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d9c5251..5973f32 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1270,6 +1270,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1294,6 +1302,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); /* mballoc.c */ extern long ext4_mb_stats; @@ -1417,6 +1430,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index c2c0a8d..0000000 --- a/fs/ext4/group.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * linux/fs/ext4/group.h - * - * Copyright (C) 2007 Cluster File Systems, Inc - * - * Author: Andreas Dilger - */ - -#ifndef _LINUX_EXT4_GROUP_H -#define _LINUX_EXT4_GROUP_H - -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -#endif /* _LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 55ba419..916d05c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" /* * ialloc.c contains the inodes allocation and deallocation routines diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd9e6cd..75e34f6 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,7 +23,6 @@ #include #include "ext4_jbd2.h" #include "ext4.h" -#include "group.h" /* * with AGGRESSIVE_CHECK allocator runs consistency checks over diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e8ded13..27eb289 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -15,7 +15,6 @@ #include #include "ext4_jbd2.h" -#include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d79e1c4..7903f20 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; -- cgit v1.1 From f40339031b04279c3fdde7ac5fe97db33b2a7694 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 1 May 2009 20:27:20 -0400 Subject: ext4: Make the length of the mb_history file tunable In memory-constrained systems with many partitions, the ~68K for each partition for the mb_history buffer can be excessive. This patch adds a new mount option, mb_history_length, as well as a way of setting the default via a module parameter (or via a sysfs parameter in /sys/module/ext4/parameter/default_mb_history_length). If the mb_history_length is set to zero, the mb_history facility is disabled entirely. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 13 +++++++------ fs/ext4/super.c | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbd47ea..df75855 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb) if (sbi->s_proc != NULL) { remove_proc_entry("mb_groups", sbi->s_proc); - remove_proc_entry("mb_history", sbi->s_proc); + if (sbi->s_mb_history_max) + remove_proc_entry("mb_history", sbi->s_proc); } kfree(sbi->s_mb_history); } @@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb) int i; if (sbi->s_proc != NULL) { - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); + if (sbi->s_mb_history_max) + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_history_fops, sb); proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); } - sbi->s_mb_history_max = 1000; sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kzalloc(i, GFP_KERNEL); + sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; /* if we can't allocate history, then we simple won't use it */ } @@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_mb_history h; - if (unlikely(sbi->s_mb_history == NULL)) + if (sbi->s_mb_history == NULL) return; if (!(ac->ac_op & sbi->s_mb_history_filter)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7903f20..39223a5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -47,6 +47,13 @@ #include "xattr.h" #include "acl.h" +static int default_mb_history_length = 1000; + +module_param_named(default_mb_history_length, default_mb_history_length, + int, 0644); +MODULE_PARM_DESC(default_mb_history_length, + "Default number of entries saved for mb_history"); + struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -1042,7 +1049,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1088,6 +1095,7 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; + case Opt_mb_history_length: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_mb_history_max = option; + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); -- cgit v1.1 From abc8746eb91fb01e8d411896f80f7687c0d8372e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 22:54:32 -0400 Subject: ext4: hook fiemap operation for directories Add fiemap callback for directories Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c9690b2..f2bc160 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2534,6 +2534,7 @@ const struct inode_operations ext4_dir_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext4_permission, + .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { -- cgit v1.1 From 19ba0559f9ce104171ab16706893ce01f03ef116 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 18:12:05 -0400 Subject: vfs: Enable FS_IOC_FIEMAP and FIGETBSZ for all filetypes The fiemap and get_blk_size ioctls should be enabled even for directories. So move it outisde file_ioctl. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 82d9c42..286f38d 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd, switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); - case FS_IOC_FIEMAP: - return ioctl_fiemap(filp, arg); - case FIGETBSZ: - return put_user(inode->i_sb->s_blocksize, p); case FIONREAD: return put_user(i_size_read(inode) - filp->f_pos, p); } @@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + { + struct inode *inode = filp->f_path.dentry->d_inode; + int __user *p = (int __user *)arg; + return put_user(inode->i_sb->s_blocksize, p); + } + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); -- cgit v1.1 From c9877b205f6ce7943bb95281342f4001cc1c00ec Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 1 May 2009 23:32:06 -0400 Subject: ext4: fix for fiemap last-block test Carl Henrik Lunde reported and debugged this; the test for the last allocated block was comparing bytes to blocks in this test: if (logical + length - 1 == EXT_MAX_BLOCK || ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) flags |= FIEMAP_EXTENT_LAST; so any extent which ended right at 4G was stopping the extent walk. Just replacing these values with the extent block & length should fix it. Also give blksize_bits a saner type, and reverse the order of the tests to make the more likely case tested first. Signed-off-by: Eric Sandeen Reported-by: Carl Henrik Lunde Tested-by: Carl Henrik Lunde Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea5c476..5f72952 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3196,7 +3196,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, void *data) { struct fiemap_extent_info *fieinfo = data; - unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; __u64 logical; __u64 physical; __u64 length; @@ -3243,8 +3243,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ - if (logical + length - 1 == EXT_MAX_BLOCK || - ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) flags |= FIEMAP_EXTENT_LAST; error = fiemap_fill_next_extent(fieinfo, logical, physical, -- cgit v1.1 From eefd7f03b86b8a319890e7fac5a6fcc7f8694b76 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 2 May 2009 19:05:37 -0400 Subject: ext4: fix the length returned by fiemap for an unallocated extent If the file's blocks have not yet been allocated because of delayed allocation, the length of the extent returned by fiemap is incorrect. This commit fixes this bug. Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5f72952..4fec6b7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3244,8 +3244,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || - newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { + loff_t size = i_size_read(inode); + loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); + flags |= FIEMAP_EXTENT_LAST; + if ((flags & FIEMAP_EXTENT_DELALLOC) && + logical+length > size) + length = (size - logical + bs - 1) & ~(bs-1); + } error = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); -- cgit v1.1 From 955ce5f5be67dfe0d1d096b543af33fe8a1ce3dd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 20:35:09 -0400 Subject: ext4: Convert ext4_lock_group to use sb_bgl_lock We have sb_bgl_lock() and ext4_group_info.bb_state bit spinlock to protech group information. The later is only used within mballoc code. Consolidate them to use sb_bgl_lock(). This makes the mballoc.c code much simpler and also avoid confusion with two locks protecting same info. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 12 ++++----- fs/ext4/ext4.h | 26 +++++++------------ fs/ext4/ialloc.c | 29 ++++++++++----------- fs/ext4/mballoc.c | 78 +++++++++++++++++++------------------------------------ fs/ext4/super.c | 6 ++--- 5 files changed, 59 insertions(+), 92 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92f557d..e2126d7 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, down_write(&grp->alloc_sem); for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { ext4_error(sb, __func__, "bit already cleared for block %llu", @@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, blocks_freed++; } } - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5973f32..149e02d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -963,12 +963,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1568,33 +1562,31 @@ struct ext4_group_info { }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + spin_lock(ext4_group_lock_ptr(sb, group)); } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + spin_unlock(ext4_group_lock_ptr(sb, group)); } static inline int ext4_is_group_locked(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); + return spin_is_locked(ext4_group_lock_ptr(sb, group)); } /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 916d05c..82f7d1d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -122,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -246,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - spin_lock(sb_bgl_lock(sbi, block_group)); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); - spin_unlock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit, bitmap_bh->b_data); if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); @@ -260,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { @@ -276,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); @@ -707,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, /* * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's sb_bgl_lock + * is uninit we need to take the groups's ext4_group_lock * and clear the uninit flag. The inode bitmap update * and group desc uninit flag clear should be done - * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * after holding ext4_group_lock so that ext4_read_inode_bitmap * doesn't race with the ext4_claim_inode */ static int ext4_claim_inode(struct super_block *sb, @@ -721,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; @@ -730,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb, ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, @@ -779,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb, } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); return retval; } @@ -935,7 +934,7 @@ got: } free = 0; - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { free = ext4_free_blocks_after_init(sb, group, gdp); @@ -944,7 +943,7 @@ got: gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); /* Don't need to dirty bitmap block if we didn't change it */ if (free) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index df75855..e76459c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); -} - static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -803,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore) unlock_buffer(bh[i]); continue; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_lock_group(sb, first_group + i); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); unlock_buffer(bh[i]); continue; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); if (buffer_uptodate(bh[i])) { /* * if not uninit if bh is uptodate, @@ -1080,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1093,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_clear_bit_atomic(lock, cur, bm); - else - mb_clear_bit(cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1114,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_set_bit_atomic(lock, cur, bm); - else - mb_set_bit(cur, bm); + mb_set_bit(cur, bm); cur++; } } @@ -1332,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; } - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2756,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) return 0; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2993,14 +2974,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. */ - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), - bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3010,9 +2994,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - mb_set_bits(NULL, bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -3022,7 +3004,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -3455,7 +3438,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. * buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3469,9 +3452,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, entry->start_blk, - entry->count); + mb_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3480,7 +3461,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3512,8 +3493,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + mb_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -4856,29 +4836,25 @@ do_more: new_entry->group = block_group; new_entry->count = count; new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count); ext4_mb_free_metadata(handle, &e4b, new_entry); - ext4_unlock_group(sb, block_group); } else { - ext4_lock_group(sb, block_group); /* need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); } - spin_lock(sb_bgl_lock(sbi, block_group)); ret = ext4_free_blks_count(sb, gdp) + count; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, count); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39223a5..dc34ed3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1784,18 +1784,18 @@ static int ext4_check_descriptors(struct super_block *sb) "(block %llu)!\n", i, inode_table); return 0; } - spin_lock(sb_bgl_lock(sbi, i)); + ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); return 0; } } - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } -- cgit v1.1 From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:22 +0100 Subject: CRED: Rename cred_exec_mutex to reflect that it's a guard against ptrace Rename cred_exec_mutex to reflect that it's a guard against foreign intervention on a process's credential state, such as is made by ptrace(). The attachment of a debugger to a process affects execve()'s calculation of the new credential state - _and_ also setprocattr()'s calculation of that state. Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 6 +++--- fs/exec.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 681ed81..bb2a9b2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1573,7 +1573,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/exec.c b/fs/exec.c index 639177b..998e856 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,7 +1045,7 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - /* cred_exec_mutex must be held at least to this point to prevent + /* cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked */ @@ -1055,7 +1055,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program - * - the caller must hold current->cred_exec_mutex to protect against + * - the caller must hold current->cred_guard_mutex to protect against * PTRACE_ATTACH */ int check_unsafe_exec(struct linux_binprm *bprm) @@ -1297,7 +1297,7 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1360,7 +1360,7 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1383,7 +1383,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); -- cgit v1.1 From 107db7c7dd137aeb7361b8c2606ac936c0be58ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:27 +0100 Subject: CRED: Guard the setprocattr security hook against ptrace Guard the setprocattr security hook against ptrace by taking the target task's cred_guard_mutex around it. The problem is that setprocattr() may otherwise note the lack of a debugger, and then perform an action on that basis whilst letting a debugger attach between the two points. Holding cred_guard_mutex across the test and the action prevents ptrace_attach() from doing that. Signed-off-by: David Howells Signed-off-by: James Morris --- fs/proc/base.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index fb45615..23342e1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (copy_from_user(page, buf, count)) goto out_free; + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->cred_guard_mutex); + if (length < 0) + goto out_free; + length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, (void*)page, count); + mutex_unlock(&task->cred_guard_mutex); out_free: free_page((unsigned long) page); out: -- cgit v1.1 From c3a4d78c580de4edc9ef0f7c59812fb02ceb037f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:37 +0900 Subject: block: add rq->resid_len rq->data_len served two purposes - the length of data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it coulde be anything else one of the lower layers is using to keep track of residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() relies on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting full residual count, ie. reporting that no data transfer occurred. The residual count is an exception not the norm; however, the driver should clear rq->data_len to zero to signify the normal cases while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unuseable. This patch adds rq->resid_len which is used only for residual count. While at it, remove now unnecessasry blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Mike Miller Cc: Eric Moore Cc: Alan Stern Cc: FUJITA Tomonori Cc: Doug Gilbert Cc: Mike Miller Cc: Eric Moore Cc: Darrick J. Wong Cc: Pete Zaitcev Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- fs/exofs/osd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c index b249ae9..06ca926 100644 --- a/fs/exofs/osd.c +++ b/fs/exofs/osd.c @@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) /* FIXME: should be include in osd_sense_info */ if (in_resid) - *in_resid = or->in.req ? or->in.req->data_len : 0; + *in_resid = or->in.req ? or->in.req->resid_len : 0; if (out_resid) - *out_resid = or->out.req ? or->out.req->data_len : 0; + *out_resid = or->out.req ? or->out.req->resid_len : 0; return ret; } -- cgit v1.1 From c969f58ca43fc403c75f5d3da4cf1e21de7afaa0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 7 Apr 2009 14:13:01 +0100 Subject: GFS2: Update the rw flags After Jens recent updates: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=a1f242524c3c1f5d40f1c9c343427e34d1aadd6e et al. this is a patch to bring gfs2 uptodate with the core code. Also I've managed to squash another call to ll_rw_block() along the way. There is still one part of the GFS2 I/O paths which are not correctly annotated and that is due to the sharing of the writeback code between the data and metadata address spaces. I would like to change that too, but this patch is still worth doing on its own, I think. Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 6 +++--- fs/gfs2/lops.c | 14 ++++++++------ fs/gfs2/meta_io.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 38 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 98918a7..aa62cf5 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -120,7 +120,7 @@ __acquires(&sdp->sd_log_lock) lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } else { unlock_buffer(bh); brelse(bh); @@ -604,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -664,7 +664,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) lock_buffer(bh); if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } else { unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 80e4f5f..00315f5 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "gfs2.h" #include "incore.h" @@ -189,7 +191,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); n = 0; @@ -199,7 +201,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); if (++n >= num) break; @@ -341,7 +343,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); bh = gfs2_log_get_buf(sdp); mh = (struct gfs2_meta_header *)bh->b_data; @@ -358,7 +360,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, @@ -560,7 +562,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, ptr = bh_log_ptr(bh); get_bh(bh); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); while(!list_empty(list)) { bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); @@ -586,7 +588,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, } else { bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); } - submit_bh(WRITE, bh1); + submit_bh(WRITE_SYNC_PLUG, bh1); gfs2_log_lock(sdp); ptr += 2; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 8d6f132..75b2aec 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,16 +201,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - *bhp = gfs2_getbuf(gl, blkno, CREATE); - if (!buffer_uptodate(*bhp)) { - ll_rw_block(READ_META, 1, bhp); - if (flags & DIO_WAIT) { - int error = gfs2_meta_wait(gl->gl_sbd, *bhp); - if (error) { - brelse(*bhp); - return error; - } - } + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + return -EIO; } return 0; @@ -404,7 +420,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_META, 1, &first_bh); + ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); dblock++; extlen--; -- cgit v1.1 From 4a0f9a321a113392b448e477018311d14fba2b34 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 20 Apr 2009 08:16:26 +0100 Subject: GFS2: Optimise writepage for metadata This adds a GFS2 specific writepage for metadata, rather than continuing to use the VFS function. As a result we now tag all our metadata I/O with the correct flag so that blktraces will now be less confusing. Also, the generic function was checking for a number of corner cases which cannot happen on the metadata address spaces so that this should be faster too. Signed-off-by: Steven Whitehouse --- fs/gfs2/meta_io.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 75b2aec..78a5f43 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -33,17 +33,65 @@ #include "util.h" #include "ops_address.h" -static int aspace_get_block(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - gfs2_assert_warn(inode->i_sb->s_fs_info, 0); - return -EOPNOTSUPP; -} + int err; + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE)); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); -static int gfs2_aspace_writepage(struct page *page, - struct writeback_control *wbc) -{ - return block_write_full_page(page, aspace_get_block, wbc); + err = 0; + if (nr_underway == 0) + end_page_writeback(page); + + return err; } static const struct address_space_operations aspace_aops = { -- cgit v1.1 From 48bf2b1711dc498494e77705c415ee46bb508fd9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 29 Apr 2009 13:59:35 +0100 Subject: GFS2: Something nonlinear this way comes! For some reason GFS2 has been missing support for non-linear mappings. This patch fixes that, and also avoids taking any locks for mmap in the O_NOATIME case. In fact we don't actually need to take the lock here at all - just doing file_accessed() would be enough, but we have to take the lock eventually and this helps it hit disk (and thus be seen by other nodes) faster. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_file.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 5d82e91..0ee7bd2 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -425,33 +425,36 @@ static struct vm_operations_struct gfs2_vm_ops = { .page_mkwrite = gfs2_page_mkwrite, }; - /** * gfs2_mmap - * @file: The file to map * @vma: The VMA which described the mapping * - * Returns: 0 or error code + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 */ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; - } + if (!(file->f_flags & O_NOATIME)) { + struct gfs2_holder i_gh; + int error; + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + file_accessed(file); + if (error == 0) + gfs2_glock_dq_uninit(&i_gh); + } vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; - gfs2_glock_dq_uninit(&i_gh); - - return error; + return 0; } /** -- cgit v1.1 From 7c77f0b3f9208c339a4b40737bb2cb0f0319bb8d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:35 +0200 Subject: splice: implement pipe to pipe splicing Allow splice(2) to work when both the input and the output is a pipe. Based on the impementation of the tee(2) syscall, but instead of duplicating the buffer references move the buffers from the input pipe to the output pipe. Moving the whole buffer only succeeds if the full length of the buffer is spliced. Otherwise duplicate the buffer, just like tee(2), set the length of the output buffer and advance the offset on the input buffer. Since splice is operating on two pipes, special care needs to be taken with locking to prevent AN ABBA deadlock. Again this is done similarly to the tee(2) syscall, first preparing the input and output pipes so there's data to consume and space for that data, and then doing the move operation while holding both locks. If other processes are doing I/O on the same pipes parallel to the splice, then by the time both inodes are locked there might be no buffers left to move, or no space to move them to. In this case retry the whole operation, including the preparation phase. This could lead to starvation, but I'm not sure if that's serious enough to worry about. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 151 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 666953d5..e405cf5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1112,6 +1112,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a @@ -1132,12 +1135,32 @@ static long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; loff_t offset, *off; long ret; - pipe = pipe_info(in->f_path.dentry->d_inode); - if (pipe) { + ipipe = pipe_info(in->f_path.dentry->d_inode); + opipe = pipe_info(out->f_path.dentry->d_inode); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... */ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1149,7 +1172,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - ret = do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, off, len, flags); if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) ret = -EFAULT; @@ -1157,8 +1180,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, return ret; } - pipe = pipe_info(out->f_path.dentry->d_inode); - if (pipe) { + if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1170,7 +1192,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - ret = do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, opipe, len, flags); if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) ret = -EFAULT; @@ -1511,7 +1533,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ -static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1549,7 +1571,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ -static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1587,6 +1609,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) } /* + * Splice contents of ipipe to opipe. + */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + +/* * Link contents of ipipe to opipe. */ static int link_pipe(struct pipe_inode_info *ipipe, @@ -1690,9 +1830,9 @@ static long do_tee(struct file *in, struct file *out, size_t len, * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ - ret = link_ipipe_prep(ipipe, flags); + ret = ipipe_prep(ipipe, flags); if (!ret) { - ret = link_opipe_prep(opipe, flags); + ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } -- cgit v1.1 From 6818173bd658439b83896a2a7586f64ab51bf29c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:36 +0200 Subject: splice: implement default splice_read method If f_op->splice_read() is not implemented, fall back to a plain read. Use vfs_readv() to read into previously allocated pages. This will allow splice and functions using splice, such as the loop device, to work on all filesystems. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/coda/file.c | 9 +++-- fs/pipe.c | 14 +++++++ fs/read_write.c | 7 +--- fs/splice.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 136 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/coda/file.c b/fs/coda/file.c index 6a347fb..ffd4281 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); struct coda_file_info *cfi; struct file *host_file; @@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->splice_read) - return -EINVAL; + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; - return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); + return splice_read(host_file, ppos, pipe, count, flags); } static ssize_t diff --git a/fs/pipe.c b/fs/pipe.c index 13414ec..f7dd21a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, return 0; } +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. + */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} + static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, diff --git a/fs/read_write.c b/fs/read_write.c index 9d1e76b..6c8c55d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto out; if (!(in_file->f_mode & FMODE_READ)) goto fput_in; - retval = -EINVAL; - in_inode = in_file->f_path.dentry->d_inode; - if (!in_inode) - goto fput_in; - if (!in_file->f_op || !in_file->f_op->splice_read) - goto fput_in; retval = -ESPIPE; if (!ppos) ppos = &in_file->f_pos; @@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; if (!out_file->f_op || !out_file->f_op->sendpage) goto fput_out; + in_inode = in_file->f_path.dentry->d_inode; out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval < 0) diff --git a/fs/splice.c b/fs/splice.c index e405cf5..3bd9cb2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -507,9 +507,116 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, return ret; } - EXPORT_SYMBOL(generic_file_splice_read); +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + pgoff_t index; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + struct page *page; + + page = alloc_page(GFP_HIGHUSER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_len = this_len; + pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) + goto err; + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + this_len = min_t(size_t, vec[i].iov_len, res); + partial[i].offset = 0; + partial[i].len = this_len; + if (!this_len) { + __free_page(pages[i]); + pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + __free_page(pages[i]); + } + return error; +} +EXPORT_SYMBOL(default_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. @@ -933,11 +1040,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int ret; - if (unlikely(!in->f_op || !in->f_op->splice_read)) - return -EINVAL; - if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -945,7 +1051,11 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - return in->f_op->splice_read(in, ppos, pipe, len, flags); + splice_read = in->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); } /** -- cgit v1.1 From 0b0a47f5c4a30b58432e20ae1706a27baea91a88 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:37 +0200 Subject: splice: implement default splice_write method If f_op->splice_write() is not implemented, fall back to a plain write. Use vfs_writev() to write from the pipe buffers. This will allow splice on all filesystems and file types. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 3bd9cb2..eefd96b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -535,6 +535,21 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } +static ssize_t kernel_writev(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t *ppos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos); + set_fs(old_fs); + + return res; +} + ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -988,6 +1003,122 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); +static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n) +{ + return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS]; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret = 0; + ssize_t total_len = 0; + int do_wakeup = 0; + + pipe_lock(pipe); + while (len) { + struct pipe_buffer *buf; + void *data[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + unsigned int nr_pages = 0; + unsigned int write_len = 0; + unsigned int now_len = len; + unsigned int this_len; + int i; + + BUG_ON(pipe->nrbufs > PIPE_BUFFERS); + for (i = 0; i < pipe->nrbufs && now_len; i++) { + buf = nth_pipe_buf(pipe, i); + + ret = buf->ops->confirm(pipe, buf); + if (ret) + break; + + data[i] = buf->ops->map(pipe, buf, 0); + this_len = min(buf->len, now_len); + vec[i].iov_base = (void __user *) data[i] + buf->offset; + vec[i].iov_len = this_len; + now_len -= this_len; + write_len += this_len; + nr_pages++; + } + + if (nr_pages) { + ret = kernel_writev(out, vec, nr_pages, ppos); + if (ret == 0) + ret = -EIO; + if (ret > 0) { + len -= ret; + total_len += ret; + } + } + + for (i = 0; i < nr_pages; i++) { + buf = nth_pipe_buf(pipe, i); + buf->ops->unmap(pipe, buf, data[i]); + + if (ret > 0) { + this_len = min_t(unsigned, vec[i].iov_len, ret); + buf->offset += this_len; + buf->len -= this_len; + ret -= this_len; + } + } + + if (ret < 0) + break; + + while (pipe->nrbufs) { + const struct pipe_buf_operations *ops; + + buf = nth_pipe_buf(pipe, 0); + if (buf->len) + break; + + ops = buf->ops; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS; + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; + } + + if (pipe->nrbufs) + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (total_len) + break; + } + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + wakeup_pipe_writers(pipe); + do_wakeup = 0; + } + + pipe_wait(pipe); + } + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_writers(pipe); + + return total_len ? total_len : ret; +} + /** * generic_splice_sendpage - splice data from a pipe to a socket * @pipe: pipe to splice from @@ -1015,11 +1146,10 @@ EXPORT_SYMBOL(generic_splice_sendpage); static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); int ret; - if (unlikely(!out->f_op || !out->f_op->splice_write)) - return -EINVAL; - if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; @@ -1030,7 +1160,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, ppos, len, flags); + splice_write = out->f_op->splice_write; + if (!splice_write) + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); } /* -- cgit v1.1 From bc8e67409ccdcff72c3f1656b1fb1aad7ff396db Mon Sep 17 00:00:00 2001 From: Vincent Minet Date: Fri, 15 May 2009 08:33:18 -0400 Subject: ext4: Fix spinlock assertions on UP systems On UP systems without DEBUG_SPINLOCK, ext4_is_group_locked always fails which triggers a BUG_ON() call. This patch fixes it by using assert_spin_locked instead. Signed-off-by: Vincent Minet Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ------ fs/ext4/mballoc.c | 10 +++++----- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 149e02d..89190ae 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1583,12 +1583,6 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - return spin_is_locked(ext4_group_lock_ptr(sb, group)); -} - /* * Inodes and files operations */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e76459c..541bd9a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -436,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; @@ -460,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); @@ -1115,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct super_block *sb = e4b->bd_sb; BUG_ON(first + count > (sb->s_blocksize << 3)); - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1196,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, int ord; void *buddy; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, order, &max); @@ -1260,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); -- cgit v1.1 From f888e652d758bfe0c04c209b72a05972daeba386 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 00:21:29 -0400 Subject: ext4: Simplify function signature for ext4_da_get_block_write() The function ext4_da_get_block_write() is called in exactly one write, and the last argument, create, is always 1. Remove it to simplify the code slightly. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4e7f363..476d843 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2000,7 +2000,7 @@ static void ext4_print_free_blocks(struct inode *inode) #define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh_result) { int ret; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -2010,7 +2010,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); + bh_result, 1, 0, EXT4_DELALLOC_RSVED); if (ret <= 0) return ret; @@ -2088,7 +2088,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (!new.b_size) return 0; - err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + err = ext4_da_get_block_write(mpd->inode, next, &new); if (err) { /* * If get block returns with error we simply -- cgit v1.1 From e4d996ca806e93dddb5d76c0d3d859b494c559f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 00:25:28 -0400 Subject: ext4: Rename ext4_get_blocks_handle() to be ext4_ind_get_blocks() The static function ext4_get_blocks_handle() is badly named. Of *course* it takes a handle. Since its counterpart for extent-based file is ext4_ext_get_blocks(), rename it to be ext4_ind_get_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 476d843..f758e80 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -914,7 +914,7 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) */ -static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, +static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, int create, int extend_disksize) @@ -1129,7 +1129,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * mapped. * * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocate. @@ -1160,8 +1160,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, bh, 0, 0); } else { - retval = ext4_get_blocks_handle(handle, - inode, block, max_blocks, bh, 0, 0); + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, + bh, 0, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -1215,7 +1215,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, bh, create, extend_disksize); } else { - retval = ext4_get_blocks_handle(handle, inode, block, + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, bh, create, extend_disksize); if (retval > 0 && buffer_new(bh)) { @@ -1297,7 +1297,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_get_blocks_wrap(handle, inode, block, 1, &dummy, create, 1, 0); /* - * ext4_get_blocks_handle() returns number of blocks + * ext4_get_blocks_wrap() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { -- cgit v1.1 From 7537d81aa7b7cd31b0caeac8091456e93d96fa8d Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Tue, 12 May 2009 11:16:20 -0500 Subject: GFS2: Fix timestamps on write This patch copies the timestamps from the vfs inode into gfs2 and syncs it to the disk inode during writes. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index a6dde17..e566421 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -781,10 +781,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, unlock_page(page); page_cache_release(page); - if (inode->i_size < to) { - i_size_write(inode, to); - ip->i_disksize = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (copied) { + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_disksize = inode->i_size; + } + gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -824,7 +826,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_dinode *di; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; @@ -847,11 +848,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { - di = (struct gfs2_dinode *)dibh->b_data; - ip->i_disksize = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (ret > 0) { + if (inode->i_size > ip->i_disksize) + ip->i_disksize = inode->i_size; + gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); } -- cgit v1.1 From 12b7ac176831df1aa58a787e67c3e5d698b30163 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:57:44 -0400 Subject: ext4: Rename ext4_get_blocks_wrap() to be ext4_get_blocks() Another function rename for clarity's sake. The _wrap prefix simply confuses people, and didn't add much people trying to follow the code paths. Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 3 +-- fs/ext4/ext4.h | 8 ++++---- fs/ext4/extents.c | 6 +++--- fs/ext4/inode.c | 35 +++++++++++++++++------------------ 4 files changed, 25 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b647899..052d637 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, - 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89190ae..5dc8368 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1617,10 +1617,10 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); -extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); +extern int ext4_get_blocks(handle_t *handle, struct inode *inode, + sector_t block, unsigned int max_blocks, + struct buffer_head *bh, int create, + int extend_disksize, int flag); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4fec6b7..7e7d02d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3151,9 +3151,9 @@ retry: break; } map_bh.b_state = 0; - ret = ext4_get_blocks_wrap(handle, inode, block, - max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f758e80..a9a9b9b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1121,7 +1121,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) } /* - * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks @@ -1142,9 +1142,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * * It returns the error in case of allocation failure. */ -int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) +int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, + unsigned int max_blocks, struct buffer_head *bh, + int create, int extend_disksize, int flag) { int retval; @@ -1268,8 +1268,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock, started = 1; } - ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0, 0); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1294,10 +1294,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1, 0); + err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0); /* - * ext4_get_blocks_wrap() returns number of blocks + * ext4_get_blocks() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { @@ -2009,8 +2008,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, 1, 0, EXT4_DELALLOC_RSVED); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, + bh_result, 1, 0, EXT4_DELALLOC_RSVED); if (ret <= 0) return ret; @@ -2067,11 +2066,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) /* * We need to make sure the BH_Delay flag is passed down to * ext4_da_get_block_write(), since it calls - * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag. - * This flag causes ext4_get_blocks_wrap() to call + * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag. + * This flag causes ext4_get_blocks() to call * ext4_da_update_reserve_space() if the passed buffer head * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks_wrap(), we should pass in + * the interfaces to ext4_get_blocks(), we should pass in * a separate flag which requests that the delayed allocation * statistics should be updated, instead of depending on the * state information getting passed down via the map_bh's @@ -2363,7 +2362,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0, 0, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2407,8 +2406,8 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -5034,7 +5033,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. -- cgit v1.1 From c21770573319922e3f3fcb331cfaa290c49f1c81 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:58:52 -0400 Subject: ext4: Define a new set of flags for ext4_get_blocks() The functions ext4_get_blocks(), ext4_ext_get_blocks(), and ext4_ind_get_blocks() used an ad-hoc set of integer variables used as boolean flags passed in as arguments. Use a single flags parameter and a setandard set of bitfield flags instead. This saves space on the call stack, and it also makes the code a bit more understandable. Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 22 ++++++++++++++------- fs/ext4/extents.c | 22 ++++++++++----------- fs/ext4/inode.c | 57 +++++++++++++++++++++++++++++-------------------------- 4 files changed, 57 insertions(+), 46 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 052d637..9dc93168 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,7 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5dc8368..17feb4a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -314,10 +314,20 @@ struct ext4_new_group_data { }; /* - * Following is used by preallocation code to tell get_blocks() that we - * want uninitialzed extents. + * Flags used by ext4_get_blocks() */ -#define EXT4_CREATE_UNINITIALIZED_EXT 2 + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 1 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 2 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Update the ext4_inode_info i_disksize field */ +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 4 + /* Caller is from the delayed allocation writeout path, + so the filesystem blocks have already been accounted for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 8 /* * ioctl commands @@ -1610,8 +1620,7 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, - struct buffer_head *bh_result, - int create, int extend_disksize); + struct buffer_head *bh_result, int flags); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); @@ -1619,8 +1628,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); + struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e7d02d..27c383c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2784,7 +2784,7 @@ fix_extent_len: int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; @@ -2803,7 +2803,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, cache_type = ext4_ext_in_cache(inode, iblock, &newex); if (cache_type) { if (cache_type == EXT4_EXT_CACHE_GAP) { - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and * user doesn't want to allocate it @@ -2869,9 +2869,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (create == EXT4_CREATE_UNINITIALIZED_EXT) + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) goto out; - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { if (allocated > max_blocks) allocated = max_blocks; /* @@ -2903,7 +2903,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * put just found gap into cache to speed up * subsequent requests @@ -2932,10 +2932,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_UNINIT_MAX_LEN. */ if (max_blocks > EXT_INIT_MAX_LEN && - create != EXT4_CREATE_UNINITIALIZED_EXT) + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_INIT_MAX_LEN; else if (max_blocks > EXT_UNINIT_MAX_LEN && - create == EXT4_CREATE_UNINITIALIZED_EXT) + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ @@ -2966,7 +2966,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { @@ -2983,7 +2983,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize) { + if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) { disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -2994,7 +2994,7 @@ outnew: set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ - if (create != EXT4_CREATE_UNINITIALIZED_EXT) + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); out: @@ -3153,7 +3153,7 @@ retry: map_bh.b_state = 0; ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9a9b9b..8b7564d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -917,7 +917,7 @@ err_out: static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -934,7 +934,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); - J_ASSERT(handle != NULL || create == 0); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary); @@ -963,7 +963,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) goto cleanup; /* @@ -1002,7 +1002,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize) { + if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { disksize = ((loff_t) iblock + count) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -1144,7 +1144,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) */ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) + int flags) { int retval; @@ -1158,15 +1158,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } else { retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* If it is only a block(s) look up */ - if (!create) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* @@ -1205,7 +1205,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * let the underlying get_block() function know to * avoid double accounting */ - if (flag) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate @@ -1213,10 +1213,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, */ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, create, extend_disksize); + bh, flags); } else { retval = ext4_ind_get_blocks(handle, inode, block, - max_blocks, bh, create, extend_disksize); + max_blocks, bh, flags); if (retval > 0 && buffer_new(bh)) { /* @@ -1229,7 +1229,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } } - if (flag) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { EXT4_I(inode)->i_delalloc_reserved_flag = 0; /* * Update reserved blocks/metadata blocks @@ -1269,7 +1269,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create, 0, 0); + create ? EXT4_GET_BLOCKS_CREATE : 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1288,16 +1288,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, { struct buffer_head dummy; int fatal = 0, err; + int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; J_ASSERT(handle != NULL || create == 0); dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0); + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); /* - * ext4_get_blocks() returns number of blocks - * mapped. 0 in case of a HOLE. + * ext4_get_blocks() returns number of blocks mapped. 0 in + * case of a HOLE. */ if (err > 0) { if (err > 1) @@ -1997,7 +2000,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -#define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result) { @@ -2009,7 +2011,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); ret = ext4_get_blocks(handle, inode, iblock, max_blocks, - bh_result, 1, 0, EXT4_DELALLOC_RSVED); + bh_result, EXT4_GET_BLOCKS_CREATE| + EXT4_GET_BLOCKS_DELALLOC_RESERVE); if (ret <= 0) return ret; @@ -2065,16 +2068,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) return 0; /* * We need to make sure the BH_Delay flag is passed down to - * ext4_da_get_block_write(), since it calls - * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag. - * This flag causes ext4_get_blocks() to call + * ext4_da_get_block_write(), since it calls ext4_get_blocks() + * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag + * causes ext4_get_blocks() to call * ext4_da_update_reserve_space() if the passed buffer head * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks(), we should pass in - * a separate flag which requests that the delayed allocation + * the interfaces to ext4_get_blocks(), we should pass in a + * separate flag which requests that the delayed allocation * statistics should be updated, instead of depending on the * state information getting passed down via the map_bh's - * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag. + * state bitmasks plus the magic + * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. */ new.b_state = mpd->b_state & (1 << BH_Delay); new.b_blocknr = 0; @@ -2362,7 +2366,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2406,8 +2410,7 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; -- cgit v1.1 From b920c75502cb2c48654ef196d647c8eb81ab608a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:54:29 -0400 Subject: ext4: Add documentation to the ext4_*get_block* functions This adds more documentation to various internal functions in fs/ext4/inode.c, most notably ext4_ind_get_blocks(), ext4_da_get_block_write(), ext4_da_get_block_prep(), ext4_normal_get_block_write(). In addition, the static function ext4_normal_get_block_write() has been renamed noalloc_get_block_write(), since it is used in many places far beyond ext4_normal_writepage(). Plenty of warnings have been added to the noalloc_get_block_write() function, since the way it is used is amazingly fragile. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 86 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8b7564d..fd5f27a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -892,6 +892,10 @@ err_out: } /* + * The ext4_ind_get_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_get_blocks(). + * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is @@ -909,10 +913,11 @@ err_out: * return = 0, if plain lookup failed. * return < 0, error case. * - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. */ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, @@ -1152,8 +1157,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_unwritten(bh); /* - * Try to see if we can get the block without requesting - * for new file system block. + * Try to see if we can get the block without requesting a new + * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { @@ -2000,6 +2005,12 @@ static void ext4_print_free_blocks(struct inode *inode) return; } +/* + * This function is used by mpage_da_map_blocks(). We separate it out + * as a separate function just to make life easier, and because + * mpage_da_map_blocks() used to be a generic function that took a + * get_block_t. + */ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result) { @@ -2031,8 +2042,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, /* * Update on-disk size along with block allocation we don't - * use 'extend_disksize' as size may change within already - * allocated block -bzzz + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz */ disksize = ((loff_t) iblock + ret) << inode->i_blkbits; if (disksize > i_size_read(inode)) @@ -2338,8 +2349,9 @@ static int __mpage_da_writepage(struct page *page, } /* - * this is a special callback for ->write_begin() only - * it's intention is to return mapped block or reserve space + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly @@ -2347,7 +2359,6 @@ static int __mpage_da_writepage(struct page *page, * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. - * */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2400,7 +2411,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } -static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, +/* + * This function is used as a standard get_block_t calback function + * when there is no desire to allocate any blocks. It is used as a + * callback function for block_prepare_write(), nobh_writepage(), and + * block_write_full_page(). These functions should only try to map a + * single block at a time. + * + * Since this function doesn't do block allocations even if the caller + * requests it by passing in create=1, it is critically important that + * any caller checks to make sure that any buffer heads are returned + * by this function are either all already mapped or marked for + * delayed allocation before calling nobh_writepage() or + * block_write_full_page(). Otherwise, b_blocknr could be left + * unitialized, and the page write functions will be taken by + * surprise. + */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; @@ -2419,10 +2446,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, } /* - * get called vi ext4_da_writepages after taking page lock (have journal handle) - * get called via journal_submit_inode_data_buffers (no journal handle) - * get called via shrink_page_list via pdflush (no journal handle) - * or grab_page_cache when doing write_begin (have journal handle) + * This function can get called via... + * - ext4_da_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via pdflush (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) @@ -2473,7 +2501,7 @@ static int ext4_da_writepage(struct page *page, * do block allocation here. */ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); /* check whether all are mapped and non delay */ @@ -2498,11 +2526,10 @@ static int ext4_da_writepage(struct page *page, } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + ret = nobh_writepage(page, noalloc_get_block_write, wbc); else - ret = block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + ret = block_write_full_page(page, noalloc_get_block_write, + wbc); return ret; } @@ -2814,7 +2841,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_da_get_block_prep); + ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -3122,12 +3149,10 @@ static int __ext4_normal_writepage(struct page *page, struct inode *inode = page->mapping->host; if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, - ext4_normal_get_block_write, wbc); + return nobh_writepage(page, noalloc_get_block_write, wbc); else - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } static int ext4_normal_writepage(struct page *page, @@ -3179,7 +3204,7 @@ static int __ext4_journalled_writepage(struct page *page, int err; ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (ret != 0) goto out_unlock; @@ -3264,9 +3289,8 @@ static int ext4_journalled_writepage(struct page *page, * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } no_write: redirty_page_for_writepage(wbc, page); -- cgit v1.1 From a2dc52b5d1d8cc280b3e795abf1c80ac8c49f30c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 13:51:29 -0400 Subject: ext4: Add BUG_ON debugging checks to noalloc_get_block_write() Enforce that noalloc_get_block_write() is only called to map one block at a time, and that it always is successful in finding a mapping for given an inode's logical block block number if it is called with create == 1. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fd5f27a..e6113c3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2433,11 +2433,14 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + /* * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); + BUG_ON(create && ret == 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; -- cgit v1.1 From 4f23122858a27ba97444b9b37a066d83edebd4c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 May 2009 08:35:35 +0200 Subject: splice: fix repeated kmap()'s in default_file_splice_read() We cannot reliably map more than one page at the time, or we risk deadlocking. Just allocate the pages from low mem instead. Reported-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index eefd96b..c5e3c79 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -580,13 +580,13 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { struct page *page; - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_USER); error = -ENOMEM; if (!page) goto err; this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); - vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; pages[i] = page; spd.nr_pages++; @@ -604,7 +604,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { - kunmap(pages[i]); this_len = min_t(size_t, vec[i].iov_len, res); partial[i].offset = 0; partial[i].len = this_len; @@ -624,10 +623,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, return res; err: - for (i = 0; i < spd.nr_pages; i++) { - kunmap(pages[i]); + for (i = 0; i < spd.nr_pages; i++) __free_page(pages[i]); - } + return error; } EXPORT_SYMBOL(default_file_splice_read); -- cgit v1.1 From a1c0643ff9f360a30644f6e3cd643ca2a5083aea Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 10:56:52 +0100 Subject: GFS2: Move journal live test at transaction start There seems little point grabbing the transaction glock only to have to release it again if the journal isn't live. This moves the test earlier to avoid grabbing the lock when we don't need it in the first place. Signed-off-by: Steven Whitehouse --- fs/gfs2/trans.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 053752d..4ef0e9f 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, BUG_ON(current->journal_info); BUG_ON(blocks == 0 && revokes == 0); + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); if (!tr) return -ENOMEM; @@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (error) goto fail_holder_uninit; - if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - tr->tr_t_gh.gh_flags |= GL_NOCACHE; - error = -EROFS; - goto fail_gunlock; - } - error = gfs2_log_reserve(sdp, tr->tr_reserved); if (error) goto fail_gunlock; -- cgit v1.1 From 48c2b613616235d7c97fda5982f50100a6c79166 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 14:49:48 +0100 Subject: GFS2: Add commit= mount option It has always been possible to adjust the gfs2 log commit interval, but only from the sysfs interface. This adds a mount option, commit=, which will be familar to ext3 users. The sysfs interface continues to be available as well, although this might be removed in the future. Also this patch cleans up some duplicated structures in the GFS2 sysfs code. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/mount.c | 10 ++++++ fs/gfs2/ops_fstype.c | 4 ++- fs/gfs2/ops_super.c | 13 +++++++- fs/gfs2/sys.c | 92 +++++++++++++++++++--------------------------------- 5 files changed, 60 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 399d1b9..65f438e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -418,6 +418,7 @@ struct gfs2_args { unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ + int ar_commit; /* Commit interval */ }; struct gfs2_tune { diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index f7e8527..947af15 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -45,6 +45,7 @@ enum { Opt_meta, Opt_discard, Opt_nodiscard, + Opt_commit, Opt_err, }; @@ -73,6 +74,7 @@ static const match_table_t tokens = { {Opt_meta, "meta"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, {Opt_err, NULL} }; @@ -89,6 +91,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) char *o; int token; substring_t tmp[MAX_OPT_ARGS]; + int rv; /* Split the options into tokens with the "," character and process them */ @@ -173,6 +176,13 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) case Opt_nodiscard: args->ar_discard = 0; break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; case Opt_err: default: fs_info(sdp, "invalid mount option: %s\n", o); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473..7981fbc 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -55,7 +55,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) spin_lock_init(>->gt_spin); gt->gt_incore_log_blocks = 1024; - gt->gt_log_flush_secs = 60; gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; @@ -1165,6 +1164,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; + sdp->sd_args.ar_commit = 60; error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { @@ -1191,6 +1191,8 @@ static int fill_super(struct super_block *sb, void *data, int silent) GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + error = init_names(sdp, silent); if (error) goto fail; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 4580195..0677a83 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -436,8 +436,12 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; int error; + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_log_flush_secs; + spin_unlock(>->gt_spin); error = gfs2_mount_args(sdp, &args, data); if (error) return error; @@ -473,6 +477,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_POSIXACL; else sb->s_flags &= ~MS_POSIXACL; + spin_lock(>->gt_spin); + gt->gt_log_flush_secs = args.ar_commit; + spin_unlock(>->gt_spin); + return 0; } @@ -550,6 +558,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; + int lfsecs; if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) seq_printf(s, ",meta"); @@ -610,7 +619,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - + lfsecs = sdp->sd_tune.gt_log_flush_secs; + if (lfsecs != 60) + seq_printf(s, ",commit=%d", lfsecs); return 0; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7655f502..d53b22e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -26,6 +26,36 @@ #include "util.h" #include "glops.h" +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { return snprintf(buf, PAGE_SIZE, "%u:%u\n", @@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len return len; } -struct gfs2_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; #define GFS2_ATTR(name, mode, show, store) \ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) @@ -246,49 +271,21 @@ static struct attribute *gfs2_attrs[] = { NULL, }; -static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->show ? a->show(sdp, buf) : 0; -} - -static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->store ? a->store(sdp, buf, len) : len; -} - -static struct sysfs_ops gfs2_attr_ops = { - .show = gfs2_attr_show, - .store = gfs2_attr_store, -}; - static struct kobj_type gfs2_ktype = { .default_attrs = gfs2_attrs, .sysfs_ops = &gfs2_attr_ops, }; -static struct kset *gfs2_kset; - /* * display struct lm_lockstruct fields */ -struct lockstruct_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - #define LOCKSTRUCT_ATTR(name, fmt) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ } \ -static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) +static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name) LOCKSTRUCT_ATTR(jid, "%u\n"); LOCKSTRUCT_ATTR(first, "%u\n"); @@ -401,14 +398,8 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_recover_jid_status); } -struct gdlm_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *sdp, char *); - ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); -}; - #define GDLM_ATTR(_name,_mode,_show,_store) \ -static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); @@ -434,21 +425,12 @@ static struct attribute *lock_module_attrs[] = { NULL, }; -/* - * display struct gfs2_args fields - */ - -struct args_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - #define ARGS_ATTR(name, fmt) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ } \ -static struct args_attr args_attr_##name = __ATTR_RO(name) +static struct gfs2_attr args_attr_##name = __ATTR_RO(name) ARGS_ATTR(lockproto, "%s\n"); ARGS_ATTR(locktable, "%s\n"); @@ -531,14 +513,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, return len; } -struct tune_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; - #define TUNE_ATTR_3(name, show, store) \ -static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) #define TUNE_ATTR_2(name, store) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -- cgit v1.1 From 9582d41135c0d362f04ed6bf3dc8d693a7eafee2 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 15:06:25 +0100 Subject: GFS2: Remove a couple of unused sysfs entries These two tunables are pointless and would never need to be changed anyway. There is also a race between them and umount as the deamons which they refer to might have gone away. The easiest way to fix the race is to remove the interface. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d53b22e..894bf77 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -530,15 +530,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -#define TUNE_ATTR_DAEMON(name, process) \ -static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ -{ \ - ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ - wake_up_process(sdp->sd_##process); \ - return r; \ -} \ -TUNE_ATTR_2(name, name##_store) - TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); @@ -550,8 +541,6 @@ TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); -TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); -TUNE_ATTR_DAEMON(logd_secs, logd_process); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { @@ -565,8 +554,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_quota_simul_sync.attr, &tune_attr_stall_secs.attr, &tune_attr_statfs_quantum.attr, - &tune_attr_recoverd_secs.attr, - &tune_attr_logd_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, NULL, -- cgit v1.1 From 77f6bf57ba9d2c50173536dbfdacdab27cb867ca Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 May 2009 09:49:44 +0200 Subject: splice: fix error return code fs/splice.c: In function 'default_file_splice_read': fs/splice.c:566: warning: 'error' may be used uninitialized in this function which is sort-of true. The code will in fact return -ENOMEM instead of the kernel_readv() return value. Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index c5e3c79..41179c0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -595,8 +595,10 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, } res = kernel_readv(in, vec, spd.nr_pages, *ppos); - if (res < 0) + if (res < 0) { + error = res; goto err; + } error = 0; if (!res) -- cgit v1.1 From 2fa3cdfb319055fd8b25abdafa413e16f00ad493 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 09:29:45 -0400 Subject: ext4: Merge ext4_da_get_block_write() into mpage_da_map_blocks() The static function ext4_da_get_block_write() was only used by mpage_da_map_blocks(). So to simplify the code, merge that function into mpage_da_map_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 110 ++++++++++++++++++++++---------------------------------- 1 file changed, 43 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e6113c3..bfe50a2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2006,57 +2006,6 @@ static void ext4_print_free_blocks(struct inode *inode) } /* - * This function is used by mpage_da_map_blocks(). We separate it out - * as a separate function just to make life easier, and because - * mpage_da_map_blocks() used to be a generic function that took a - * get_block_t. - */ -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, - bh_result, EXT4_GET_BLOCKS_CREATE| - EXT4_GET_BLOCKS_DELALLOC_RESERVE); - if (ret <= 0) - return ret; - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered mode. Don't - * update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation we don't - * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change - * within already allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - return 0; -} - -/* * mpage_da_map_blocks - go through given space * * @mpd - bh describing space @@ -2066,9 +2015,12 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err = 0; + int err, blks; struct buffer_head new; - sector_t next; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; /* * We consider only non-mapped and non-allocated blocks @@ -2077,6 +2029,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) !(mpd->b_state & (1 << BH_Delay)) && !(mpd->b_state & (1 << BH_Unwritten))) return 0; + + /* + * If we didn't accumulate anything to write simply return + */ + if (!mpd->b_size) + return 0; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + /* * We need to make sure the BH_Delay flag is passed down to * ext4_da_get_block_write(), since it calls ext4_get_blocks() @@ -2092,18 +2054,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. */ new.b_state = mpd->b_state & (1 << BH_Delay); - new.b_blocknr = 0; - new.b_size = mpd->b_size; - next = mpd->b_blocknr; - /* - * If we didn't accumulate anything - * to write simply return - */ - if (!new.b_size) - return 0; - - err = ext4_da_get_block_write(mpd->inode, next, &new); - if (err) { + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, + &new, EXT4_GET_BLOCKS_CREATE| + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (blks < 0) { + err = blks; /* * If get block returns with error we simply * return. Later writepage will redirty the page and @@ -2136,12 +2091,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (err == -ENOSPC) { ext4_print_free_blocks(mpd->inode); } - /* invlaidate all the pages */ + /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); return err; } - BUG_ON(new.b_size == 0); + BUG_ON(blks == 0); + + new.b_size = (blks << mpd->inode->i_blkbits); if (buffer_new(&new)) __unmap_underlying_blocks(mpd->inode, &new); @@ -2154,6 +2111,25 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) (mpd->b_state & (1 << BH_Unwritten))) mpage_put_bnr_to_bhs(mpd, next, &new); + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + return err; + } + + /* + * Update on-disk size along with block allocation we don't + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + return ext4_mark_inode_dirty(handle, mpd->inode); + } + return 0; } -- cgit v1.1 From 2ac3b6e00acb46406c993d57921f86a594aafe08 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 13:57:08 -0400 Subject: ext4: Clean up ext4_get_blocks() so it does not depend on bh_result->b_state The ext4_get_blocks() function was depending on the value of bh_result->b_state as an input parameter to decide whether or not update the delalloc accounting statistics by calling ext4_da_update_reserve_space(). We now use a separate flag, EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE, to requests this update, so that all callers of ext4_get_blocks() can clear map_bh.b_state before calling ext4_get_blocks() without worrying about any consistency issues. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 ++++++++++----- fs/ext4/inode.c | 56 +++++++++++++++++++++++++++++++------------------------- 2 files changed, 41 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17feb4a..d164f12 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -318,16 +318,21 @@ struct ext4_new_group_data { */ /* Allocate any needed blocks and/or convert an unitialized extent to be an initialized ext4 */ -#define EXT4_GET_BLOCKS_CREATE 1 +#define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 2 +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Update the ext4_inode_info i_disksize field */ -#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 4 +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 0x0004 /* Caller is from the delayed allocation writeout path, - so the filesystem blocks have already been accounted for */ -#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 8 + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0008 + /* Call ext4_da_update_reserve_space() after successfully + allocating the blocks */ +#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0010 + /* * ioctl commands diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bfe50a2..d7b7480 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1234,16 +1234,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks - * after successful block allocation - * which were deferred till now - */ - if ((retval > 0) && buffer_delay(bh)) - ext4_da_update_reserve_space(inode, retval); - } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) + ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); return retval; @@ -2015,7 +2014,7 @@ static void ext4_print_free_blocks(struct inode *inode) */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err, blks; + int err, blks, get_blocks_flags; struct buffer_head new; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; @@ -2040,23 +2039,30 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) BUG_ON(!handle); /* - * We need to make sure the BH_Delay flag is passed down to - * ext4_da_get_block_write(), since it calls ext4_get_blocks() - * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag - * causes ext4_get_blocks() to call - * ext4_da_update_reserve_space() if the passed buffer head - * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks(), we should pass in a - * separate flag which requests that the delayed allocation - * statistics should be updated, instead of depending on the - * state information getting passed down via the map_bh's - * state bitmasks plus the magic - * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. + * Call ext4_get_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. This flag exists primarily because we don't + * want to change *many* call functions, so ext4_get_blocks() + * will set the magic i_delalloc_reserved_flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. */ - new.b_state = mpd->b_state & (1 << BH_Delay); + new.b_state = 0; + get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, - &new, EXT4_GET_BLOCKS_CREATE| - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + &new, get_blocks_flags); if (blks < 0) { err = blks; /* -- cgit v1.1 From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: sched, timers: cleanup avenrun users avenrun is an rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Cleanup the users. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- fs/proc/loadavg.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bca39c..1afa4dd 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -12,20 +12,14 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { - int a, b, c; - unsigned long seq; + unsigned long avnrun[3]; - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); + get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, task_active_pid_ns(current)->last_pid); return 0; -- cgit v1.1 From 6fd058f7791087648c683eb8572edf3be3c4c23c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 17 May 2009 15:38:01 -0400 Subject: ext4: Add a comprehensive block validity check to ext4_get_blocks() To catch filesystem bugs or corruption which could lead to the filesystem getting severly damaged, this patch adds a facility for tracking all of the filesystem metadata blocks by contiguous regions in a red-black tree. This allows quick searching of the tree to locate extents which might overlap with filesystem metadata blocks. This facility is also used by the multi-block allocator to assure that it is not allocating blocks out of the system zone, as well as by the routines used when reading indirect blocks and extents information from disk to make sure their contents are valid. Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 4 +- fs/ext4/block_validity.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 11 +++ fs/ext4/extents.c | 22 +---- fs/ext4/inode.c | 47 +++++++-- fs/ext4/mballoc.c | 11 +-- fs/ext4/super.c | 32 ++++++- 7 files changed, 332 insertions(+), 39 deletions(-) create mode 100644 fs/ext4/block_validity.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index a8ff003..8a34710 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -5,8 +5,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 0000000..50784ef --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,244 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init init_ext4_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, + SLAB_RECLAIM_ACCOUNT); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void exit_ext4_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? "" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + sbi->s_gdb_count + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; + struct rb_node *parent; + struct ext4_system_zone *entry; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + entry = rb_entry(n, struct ext4_system_zone, node); + kmem_cache_free(ext4_system_zone_cachep, entry); + if (!parent) + EXT4_SB(sb)->system_blks.rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + EXT4_SB(sb)->system_blks.rb_node = NULL; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else + return 0; + } + return 1; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d164f12..4311cc8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -696,6 +696,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -887,6 +888,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ @@ -1618,6 +1620,15 @@ extern struct dentry *ext4_get_parent(struct dentry *child); extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init init_ext4_system_zone(void); +extern void exit_ext4_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); + /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 27c383c..d04b779 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext), valid_block; + ext4_fsblk_t block = ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - ((block + len) > ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t block = idx_pblock(ext_idx); - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - (block >= ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } static int ext4_valid_extent_entries(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7b7480..dadd3f9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - __le32 *p, unsigned int max) { - - unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); + __le32 *p, unsigned int max) +{ __le32 *bref = p; + unsigned int blk; + while (bref < p+max) { - if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { ext4_error(inode->i_sb, function, - "block reference %u >= max (%u) " - "in inode #%lu, offset=%d", - le32_to_cpu(*bref), maxblocks, - inode->i_ino, (int)(bref-p)); + "invalid block reference %u " + "in inode #%lu", blk, inode->i_ino); return -EIO; } - bref++; } return 0; } @@ -1125,6 +1126,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } +static int check_block_validity(struct inode *inode, sector_t logical, + sector_t phys, int len) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { + ext4_error(inode->i_sb, "check_block_validity", + "inode #%lu logical block %llu mapped to %llu " + "(size %d)", inode->i_ino, + (unsigned long long) logical, + (unsigned long long) phys, len); + WARN_ON(1); + return -EIO; + } + return 0; +} + /* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1170,6 +1186,13 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } up_read((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } + /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; @@ -1245,6 +1268,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } return retval; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 541bd9a..ed8482e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2961,15 +2961,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + le32_to_cpu(es->s_first_data_block); len = ac->ac_b_ex.fe_len; - if (in_range(ext4_block_bitmap(sb, gdp), block, len) || - in_range(ext4_inode_bitmap(sb, gdp), block, len) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(block + len - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, __func__, - "Allocating block %llu in system zone of %d group\n", - block, ac->ac_b_ex.fe_group); + "Allocating blocks %llu-%llu which overlap " + "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc34ed3..600b7ad 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -568,6 +568,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); @@ -1055,6 +1056,7 @@ enum { Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1114,6 +1116,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_auto_da_alloc, "auto_da_alloc=%u"}, @@ -1508,6 +1512,12 @@ set_qf_format: case Opt_delalloc: set_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_block_validity: + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; + case Opt_noblock_validity: + clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) return 0; @@ -2826,6 +2836,13 @@ no_journal: } else if (test_opt(sb, DELALLOC)) printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + err = ext4_setup_system_zone(sb); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initialize system " + "zone (%d)\n", err); + goto failed_mount4; + } + ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { @@ -2875,6 +2892,7 @@ cantfind_ext4: failed_mount4: printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -3515,6 +3533,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); @@ -3927,13 +3946,16 @@ static int __init init_ext4_fs(void) { int err; + err = init_ext4_system_zone(); + if (err) + return err; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - return -ENOMEM; + goto out4; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) - return err; + goto out3; err = init_ext4_xattr(); if (err) @@ -3958,6 +3980,11 @@ out1: exit_ext4_xattr(); out2: exit_ext4_mballoc(); +out3: + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out4: + exit_ext4_system_zone(); return err; } @@ -3972,6 +3999,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); + exit_ext4_system_zone(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -- cgit v1.1 From 0568c518937ee3a9b6a94d18bae9c150fe5d6832 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 17 May 2009 23:31:23 -0400 Subject: ext4: down i_data_sem only for read when walking tree for fiemap Not sure why I put this in as down_write originally; all we are doing is walking the tree, nothing will change under us and concurrent reads should be no problem. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d04b779..d4e99e9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3312,10 +3312,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_write(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_write(&EXT4_I(inode)->i_data_sem); + up_read(&EXT4_I(inode)->i_data_sem); } return error; -- cgit v1.1 From f68301656b5f5d2de104f2687add6beeb8f3c3b9 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:44 -0400 Subject: ext4: Fix memory leak in ext4_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 600b7ad..eca6c05 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2924,6 +2924,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; -- cgit v1.1 From de5ce037304f2c88a319b1c3b808ab0c4c618c1c Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:47 -0400 Subject: ext3: Fix memory leak in ext3_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext3/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe..d8b73d4 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2021,6 +2021,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; -- cgit v1.1 From 0f7ee7c17241915fdaff49d1a36f5aafd80a7dce Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:51 -0400 Subject: ext2: Fix memory leak in ext2_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext2/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5c4afe6..e3c748f 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1093,6 +1093,7 @@ failed_mount: brelse(bh); failed_sbi: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return ret; } -- cgit v1.1 From fe64d517df0970a68417184a12fcd4ba0589cc28 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 10:01:18 +0100 Subject: GFS2: Umount recovery race fix This patch fixes a race condition where we can receive recovery requests part way through processing a umount. This was causing problems since the recovery thread had already gone away. Looking in more detail at the recovery code, it was really trying to implement a slight variation on a work queue, and that happens to align nicely with the recently introduced slow-work subsystem. As a result I've updated the code to use slow-work, rather than its own home grown variety of work queue. When using the wait_on_bit() function, I noticed that the wait function that was supplied as an argument was appearing in the WCHAN field, so I've updated the function names in order to produce more meaningful output. Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 1 + fs/gfs2/glock.c | 21 +++++++++-- fs/gfs2/incore.h | 14 +++---- fs/gfs2/main.c | 8 ++++ fs/gfs2/ops_fstype.c | 20 +++------- fs/gfs2/ops_super.c | 25 ++++++++++++- fs/gfs2/recovery.c | 102 +++++++++++++++++---------------------------------- fs/gfs2/recovery.h | 2 +- fs/gfs2/sys.c | 53 +++++++++++++------------- 9 files changed, 122 insertions(+), 124 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 3a981b7..cad957c 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -7,6 +7,7 @@ config GFS2_FS select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 + select SLOW_WORK help A cluster filesystem. diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ff49810..2bf62bc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -796,22 +796,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) gh->gh_ip = 0; } -static int just_schedule(void *word) +/** + * gfs2_glock_holder_wait + * @word: unused + * + * This function and gfs2_glock_demote_wait both show up in the WCHAN + * field. Thus I've separated these otherwise identical functions in + * order to be more informative to the user. + */ + +static int gfs2_glock_holder_wait(void *word) { schedule(); return 0; } +static int gfs2_glock_demote_wait(void *word) +{ + schedule(); + return 0; +} + static void wait_on_holder(struct gfs2_holder *gh) { might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); } static void wait_on_demote(struct gfs2_glock *gl) { might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 65f438e..0060e95 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -376,11 +377,11 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - + struct slow_work jd_work; struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 unsigned int jd_jid; - int jd_dirty; - unsigned int jd_blocks; }; @@ -390,9 +391,6 @@ struct gfs2_statfs_change_host { s64 sc_dinodes; }; -#define GFS2_GLOCKD_DEFAULT 1 -#define GFS2_GLOCKD_MAX 16 - #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 @@ -427,7 +425,6 @@ struct gfs2_tune { unsigned int gt_incore_log_blocks; unsigned int gt_log_flush_secs; - unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ @@ -448,6 +445,7 @@ enum { SDF_JOURNAL_LIVE = 1, SDF_SHUTDOWN = 2, SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, }; #define GFS2_FSNAME_LEN 256 @@ -494,7 +492,6 @@ struct lm_lockstruct { unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid; int ls_recover_jid_done; int ls_recover_jid_status; }; @@ -583,7 +580,6 @@ struct gfs2_sbd { /* Daemon stuff */ - struct task_struct *sd_recoverd_process; struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a6892ed..eacd78a 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; + error = slow_work_register_user(); + if (error) + goto fail_slow; + gfs2_register_debugfs(); printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); return 0; +fail_slow: + unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: @@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + slow_work_unregister_user(); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7981fbc..2cd1164 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -55,7 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) spin_lock_init(>->gt_spin); gt->gt_incore_log_blocks = 1024; - gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; @@ -675,6 +675,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); + slow_work_init(&jd->jd_work, &gfs2_recover_ops); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -700,14 +701,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = sdp->sd_master_dir->d_inode; struct gfs2_holder ji_gh; - struct task_struct *p; struct gfs2_inode *ip; int jindex = 1; int error = 0; if (undo) { jindex = 0; - goto fail_recoverd; + goto fail_jinode_gh; } sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); @@ -800,18 +800,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start recoverd thread: %d\n", error); - goto fail_jinode_gh; - } - sdp->sd_recoverd_process = p; - return 0; -fail_recoverd: - kthread_stop(sdp->sd_recoverd_process); fail_jinode_gh: if (!sdp->sd_args.ar_spectator) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); @@ -1172,8 +1162,10 @@ static int fill_super(struct super_block *sb, void *data, int silent) goto fail; } - if (sdp->sd_args.ar_spectator) + if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 0677a83..a3c2272 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -121,6 +121,12 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -131,6 +137,7 @@ static void gfs2_put_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; + struct gfs2_jdesc *jd; /* Unfreeze the filesystem, if we need to */ @@ -139,9 +146,25 @@ static void gfs2_put_super(struct super_block *sb) gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_lock); + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); - kthread_stop(sdp->sd_recoverd_process); if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 247e8f7..59d2695 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include "gfs2.h" #include "incore.h" @@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } -/** - * gfs2_recover_journal - recover a given journal - * @jd: the struct gfs2_jdesc describing the journal - * - * Acquire the journal's lock, check to see if the journal is clean, and - * do recovery if necessary. - * - * Returns: errno - */ +static int gfs2_recover_get_ref(struct slow_work *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + return 0; +} -int gfs2_recover_journal(struct gfs2_jdesc *jd) +static void gfs2_recover_put_ref(struct slow_work *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +static void gfs2_recover_work(struct slow_work *work) { + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; @@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) gfs2_glock_dq_uninit(&j_gh); fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return 0; + return; fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); @@ -584,70 +590,28 @@ fail_gunlock_j: fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); - return error; } -static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd; - int found = 0; - - spin_lock(&sdp->sd_jindex_spin); +struct slow_work_ops gfs2_recover_ops = { + .get_ref = gfs2_recover_get_ref, + .put_ref = gfs2_recover_put_ref, + .execute = gfs2_recover_work, +}; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_dirty) { - jd->jd_dirty = 0; - found = 1; - break; - } - } - spin_unlock(&sdp->sd_jindex_spin); - - if (!found) - jd = NULL; - return jd; -} - -/** - * gfs2_check_journals - Recover any dirty journals - * @sdp: the filesystem - * - */ - -static void gfs2_check_journals(struct gfs2_sbd *sdp) +static int gfs2_recovery_wait(void *word) { - struct gfs2_jdesc *jd; - - for (;;) { - jd = gfs2_jdesc_find_dirty(sdp); - if (!jd) - break; - - if (jd != sdp->sd_jdesc) - gfs2_recover_journal(jd); - } + schedule(); + return 0; } -/** - * gfs2_recoverd - Recover dead machine's journals - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_recoverd(void *data) +int gfs2_recover_journal(struct gfs2_jdesc *jd) { - struct gfs2_sbd *sdp = data; - unsigned long t; - - while (!kthread_should_stop()) { - gfs2_check_journals(sdp); - t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - + int rv; + rv = slow_work_enqueue(&jd->jd_work); + if (rv) + return rv; + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); return 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index a8218ea..1616ac2 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -extern int gfs2_recoverd(void *data); +extern struct slow_work_ops gfs2_recover_ops; #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 894bf77..9f6d48b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -356,34 +356,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_first_done); } -static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_recover_jid); -} - -static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + unsigned jid; struct gfs2_jdesc *jd; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + goto out; + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { if (jd->jd_jid != jid) continue; - jd->jd_dirty = 1; + rv = slow_work_enqueue(&jd->jd_work); break; } +out: spin_unlock(&sdp->sd_jindex_spin); -} - -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_recover_jid = simple_strtol(buf, NULL, 0); - gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid); - if (sdp->sd_recoverd_process) - wake_up_process(sdp->sd_recoverd_process); - return len; + return rv ? rv : len; } static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) @@ -401,15 +400,15 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0644, recover_show, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0200, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, -- cgit v1.1 From 4fc981ef9e7c0953d5c4896ce088b19c50cb018f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 May 2009 18:33:06 +0900 Subject: bio: always copy back data for copied kernel requests When a read bio_copy_kern() request fails, the content of the bounce buffer is not copied back. However, as request failure doesn't necessarily mean complete failure, the buffer state can be useful. This behavior is also inconsistent with the user map counterpart and causes the subtle difference between bounced and unbounced IO causes confusion. This patch makes bio_copy_kern_endio() ignore @err and always copy back data on request completion. Signed-off-by: Tejun Heo Cc: Boaz Harrosh Cc: James Bottomley Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 7bbc98f..ee3bc67 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1198,7 +1198,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; - if (read && !err) + if (read) memcpy(p, addr, len); __free_page(bvec->bv_page); -- cgit v1.1 From b2858d7d1639c04ca3c54988d76c5f7300b76f1c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2009 11:37:46 +0200 Subject: splice: fix kmaps in default_file_splice_write() Unfortunately multiple kmap() within a single thread are deadlockable, so writing out multiple buffers with writev() isn't possible. Change the implementation so that it does a separate write() for each buffer. This actually simplifies the code a lot since the splice_from_pipe() helper can be used. This limitation is caused by HIGHMEM pages, and so only affects a subset of architectures and configurations. In the future it may be worth to implement default_file_splice_write() in a more efficient way on configs that allow it. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 130 ++++++++++-------------------------------------------------- 1 file changed, 22 insertions(+), 108 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 41179c0..73766d2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -535,8 +535,8 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } -static ssize_t kernel_writev(struct file *file, const struct iovec *vec, - unsigned long vlen, loff_t *ppos) +static ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) { mm_segment_t old_fs; ssize_t res; @@ -544,7 +544,7 @@ static ssize_t kernel_writev(struct file *file, const struct iovec *vec, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos); + res = vfs_write(file, (const char __user *)buf, count, &pos); set_fs(old_fs); return res; @@ -1003,120 +1003,34 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); -static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n) +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { - return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS]; + int ret; + void *data; + + ret = buf->ops->confirm(pipe, buf); + if (ret) + return ret; + + data = buf->ops->map(pipe, buf, 0); + ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + buf->ops->unmap(pipe, buf, data); + + return ret; } static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - ssize_t ret = 0; - ssize_t total_len = 0; - int do_wakeup = 0; - - pipe_lock(pipe); - while (len) { - struct pipe_buffer *buf; - void *data[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; - unsigned int nr_pages = 0; - unsigned int write_len = 0; - unsigned int now_len = len; - unsigned int this_len; - int i; - - BUG_ON(pipe->nrbufs > PIPE_BUFFERS); - for (i = 0; i < pipe->nrbufs && now_len; i++) { - buf = nth_pipe_buf(pipe, i); - - ret = buf->ops->confirm(pipe, buf); - if (ret) - break; - - data[i] = buf->ops->map(pipe, buf, 0); - this_len = min(buf->len, now_len); - vec[i].iov_base = (void __user *) data[i] + buf->offset; - vec[i].iov_len = this_len; - now_len -= this_len; - write_len += this_len; - nr_pages++; - } - - if (nr_pages) { - ret = kernel_writev(out, vec, nr_pages, ppos); - if (ret == 0) - ret = -EIO; - if (ret > 0) { - len -= ret; - total_len += ret; - } - } - - for (i = 0; i < nr_pages; i++) { - buf = nth_pipe_buf(pipe, i); - buf->ops->unmap(pipe, buf, data[i]); - - if (ret > 0) { - this_len = min_t(unsigned, vec[i].iov_len, ret); - buf->offset += this_len; - buf->len -= this_len; - ret -= this_len; - } - } - - if (ret < 0) - break; - - while (pipe->nrbufs) { - const struct pipe_buf_operations *ops; - - buf = nth_pipe_buf(pipe, 0); - if (buf->len) - break; - - ops = buf->ops; - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS; - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } - - if (pipe->nrbufs) - continue; - if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (total_len) - break; - } - - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } - - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - if (do_wakeup) { - wakeup_pipe_writers(pipe); - do_wakeup = 0; - } - - pipe_wait(pipe); - } - pipe_unlock(pipe); + ssize_t ret; - if (do_wakeup) - wakeup_pipe_writers(pipe); + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; - return total_len ? total_len : ret; + return ret; } /** -- cgit v1.1 From ef9e8b14a5c1d0afbaf12b4c3b271188ddfc52a4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 14:25:16 +0100 Subject: GFS2: Don't warn when delete inode fails on ro filesystem If the filesystem is read-only, then we expect that delete inode will fail, so there is no need to warn about it. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index a3c2272..2fd1dcb 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -714,7 +714,7 @@ out_unlock: gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED) + if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_delete_inode: %d\n", error); out: truncate_inode_pages(&inode->i_data, 0); -- cgit v1.1 From 8b6427a2a8f7dd43e9208fb33a3b116d66db4979 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 19 May 2009 09:57:03 -0400 Subject: cifs: fix pointer initialization and checks in cifs_follow_symlink (try #4) This is the third respin of the patch posted yesterday to fix the error handling in cifs_follow_symlink. It also includes a fix for a bogus NULL pointer check in CIFSSMBQueryUnixSymLink that Jeff Moyer spotted. It's possible for CIFSSMBQueryUnixSymLink to return without setting target_path to a valid pointer. If that happens then the current value to which we're initializing this pointer could cause an oops when it's kfree'd. This patch is a little more comprehensive than the last patches. It reorganizes cifs_follow_link a bit for (hopefully) better readability. It should also eliminate the uneeded allocation of full_path on servers without unix extensions (assuming they can get to this point anyway, of which I'm not convinced). On a side note, I'm not sure I agree with the logic of enabling this query even when unix extensions are disabled on the client. It seems like that should disable this as well. But, changing that is outside the scope of this fix, so I've left it alone for now. Reported-by: Jeff Moyer Signed-off-by: Jeff Layton Reviewed-by: Jeff Moyer Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- fs/cifs/link.c | 52 ++++++++++++++++++++++++++-------------------------- 2 files changed, 27 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5759ba5..d062602 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2475,7 +2475,7 @@ querySymLinkRetry: /* BB FIXME investigate remapping reserved chars here */ *symlinkinfo = cifs_strndup_from_ucs(data_start, count, is_unicode, nls_codepage); - if (!symlinkinfo) + if (!*symlinkinfo) rc = -ENOMEM; } } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index ea9d11e..cd83c53 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -107,48 +107,48 @@ void * cifs_follow_link(struct dentry *direntry, struct nameidata *nd) { struct inode *inode = direntry->d_inode; - int rc = -EACCES; + int rc = -ENOMEM; int xid; char *full_path = NULL; - char *target_path = ERR_PTR(-ENOMEM); - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; + char *target_path = NULL; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; xid = GetXid(); - full_path = build_path_from_dentry(direntry); + /* + * For now, we just handle symlinks with unix extensions enabled. + * Eventually we should handle NTFS reparse points, and MacOS + * symlink support. For instance... + * + * rc = CIFSSMBQueryReparseLinkInfo(...) + * + * For now, just return -EACCES when the server doesn't support posix + * extensions. Note that we still allow querying symlinks when posix + * extensions are manually disabled. We could disable these as well + * but there doesn't seem to be any harm in allowing the client to + * read them. + */ + if (!(tcon->ses->capabilities & CAP_UNIX)) { + rc = -EACCES; + goto out; + } + full_path = build_path_from_dentry(direntry); if (!full_path) goto out; cFYI(1, ("Full path: %s inode = 0x%p", full_path, inode)); - cifs_sb = CIFS_SB(inode->i_sb); - pTcon = cifs_sb->tcon; - - /* We could change this to: - if (pTcon->unix_ext) - but there does not seem any point in refusing to - get symlink info if we can, even if unix extensions - turned off for this mount */ - - if (pTcon->ses->capabilities & CAP_UNIX) - rc = CIFSSMBUnixQuerySymLink(xid, pTcon, full_path, - &target_path, - cifs_sb->local_nls); - else { - /* BB add read reparse point symlink code here */ - /* rc = CIFSSMBQueryReparseLinkInfo */ - /* BB Add code to Query ReparsePoint info */ - /* BB Add MAC style xsymlink check here if enabled */ - } + rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path, + cifs_sb->local_nls); + kfree(full_path); +out: if (rc != 0) { kfree(target_path); target_path = ERR_PTR(rc); } - kfree(full_path); -out: FreeXid(xid); nd_set_link(nd, target_path); return NULL; -- cgit v1.1 From 09010978345e8883003bf411bb99753710eb5a3a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 20 May 2009 10:48:47 +0100 Subject: GFS2: Improve resource group error handling This patch improves the error handling in the case where we discover that the summary information in the resource group doesn't match the bitmap information while in the process of allocating blocks. Originally this resulted in a kernel bug, but this patch changes that so that we return -EIO and print some messages explaining what went wrong, and how to fix it. We also remember locally not to try and allocate from the same rgrp again, so that a subsequent allocation in a different rgrp should succeed. Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 9 +++++++-- fs/gfs2/dir.c | 11 +++++++++-- fs/gfs2/eattr.c | 14 +++++++++++--- fs/gfs2/glops.c | 20 +------------------ fs/gfs2/incore.h | 7 ++++--- fs/gfs2/rgrp.c | 58 +++++++++++++++++++++++++++++++++++++++++--------------- fs/gfs2/rgrp.h | 47 +++++++++++++++++++++++---------------------- 7 files changed, 99 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3a5d3f8..253e1a3 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) and write it out to disk */ unsigned int n = 1; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + goto out_brelse; if (isdir) { gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); @@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, blks = dblks + iblks; i = sheight; do { + int error; n = blks - alloced; - bn = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return error; alloced += n; if (state != ALLOC_DATA || gfs2_is_jdata(ip)) gfs2_trans_add_unrevoke(sdp, bn, n); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index aef4d0c..297d7e5 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, { struct gfs2_inode *ip = GFS2_I(inode); unsigned int n = 1; - u64 bn = gfs2_alloc_block(ip, &n); - struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); + u64 bn; + int error; + struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "", .len = 0, .hash = 0 }; + + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); if (!bh) return NULL; + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); gfs2_trans_add_bh(ip->i_gl, bh, 1); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 899763a..07ea952 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) struct gfs2_ea_header *ea; unsigned int n = 1; u64 block; + int error; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, *bhp, 1); @@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct gfs2_ea_request *er) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; ea->ea_data_len = cpu_to_be32(er->er_data_len); ea->ea_name_len = er->er_name_len; @@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, bh, 1); @@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - blk = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &blk, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); gfs2_trans_add_bh(ip->i_gl, indbh, 1); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 70f87f4..d5e4ab1 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -310,24 +310,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) } /** - * rgrp_go_dump - print out an rgrp - * @seq: The iterator - * @gl: The glock in question - * - */ - -static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) -{ - const struct gfs2_rgrpd *rgd = gl->gl_object; - if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", - (unsigned long long)rgd->rd_addr, rgd->rd_flags, - rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); - return 0; -} - -/** * trans_go_sync - promote/demote the transaction glock * @gl: the glock * @state: the requested state @@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, - .go_dump = rgrp_go_dump, + .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_min_hold_time = HZ / 5, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0060e95..de50d86 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -92,9 +92,10 @@ struct gfs2_rgrpd { unsigned int rd_bh_count; u32 rd_last_alloc; unsigned char rd_flags; -#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ -#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ -#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ }; enum gfs2_state_bits { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5650382..fbacf09 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -701,10 +701,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) u32 rg_flags; rg_flags = be32_to_cpu(str->rg_flags); - if (rg_flags & GFS2_RGF_NOALLOC) - rgd->rd_flags |= GFS2_RDF_NOALLOC; - else - rgd->rd_flags &= ~GFS2_RDF_NOALLOC; + rg_flags &= ~GFS2_RDF_MASK; rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); @@ -713,11 +710,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; - u32 rg_flags = 0; - if (rgd->rd_flags & GFS2_RDF_NOALLOC) - rg_flags |= GFS2_RGF_NOALLOC; - str->rg_flags = cpu_to_be32(rg_flags); + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); str->rg_free = cpu_to_be32(rgd->rd_free); str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); str->__pad = cpu_to_be32(0); @@ -942,7 +936,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) struct gfs2_sbd *sdp = rgd->rd_sbd; int ret = 0; - if (rgd->rd_flags & GFS2_RDF_NOALLOC) + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; spin_lock(&sdp->sd_rindex_spin); @@ -1435,13 +1429,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, } /** - * gfs2_alloc_block - Allocate a block + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question + * + */ + +int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); + return 0; +} + +/** + * gfs2_alloc_block - Allocate one or more blocks * @ip: the inode to allocate the block for + * @bn: Used to return the starting block number + * @n: requested number of blocks/extent length (value/result) * - * Returns: the allocated block + * Returns: 0 or error */ -u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) +int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; @@ -1457,7 +1471,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) goal = rgd->rd_last_alloc; blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); - BUG_ON(blk == BFITNOENT); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; @@ -1469,7 +1486,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); brelse(dibh); } - gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); + if (rgd->rd_free < *n) + goto rgrp_error; + rgd->rd_free -= *n; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); @@ -1484,7 +1503,16 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) rgd->rd_free_clone -= *n; spin_unlock(&sdp->sd_rindex_spin); - return block; + *bn = block; + return 0; + +rgrp_error: + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; + return -EIO; } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3181c7e..1e76ff0 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -14,22 +14,22 @@ struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; -void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) { BUG_ON(ip->i_alloc == NULL); @@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line); +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, + unsigned int line); #define gfs2_inplace_reserve(ip) \ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) -void gfs2_inplace_release(struct gfs2_inode *ip); +extern void gfs2_inplace_release(struct gfs2_inode *ip); -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); +extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); -u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); -u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); +extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); +extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); -void gfs2_unlink_di(struct inode *inode); +extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); struct gfs2_rgrp_list { unsigned int rl_rgrps; @@ -61,10 +61,11 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, - u64 block); -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); -void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); -u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); #endif /* __RGRP_DOT_H__ */ -- cgit v1.1 From 60a0b8f93664621a07b93273fc8ebc29590c62f5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 May 2009 12:23:12 +0100 Subject: GFS2: Add a rgrp bitmap full flag During block allocation, it is useful to know if sections of disk are full on a finer grained basis than a single resource group. This can make a performance difference when resource groups have larger numbers of bitmap blocks, since we no longer have to search them all block by block in each individual bitmap. The full flag is set on a per-bitmap basis when it has been searched and found to have no free space. It is then skipped in subsequent searches until the flag is reset. The resetting occurs if we have to drop the glock on the resource group for any reason, or if we deallocate some blocks within that resource group and thus free up some space. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 3 +++ fs/gfs2/rgrp.c | 77 ++++++++++++++++++++++++++++++++++---------------------- 2 files changed, 50 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index de50d86..dd87379 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -64,9 +64,12 @@ struct gfs2_log_element { const struct gfs2_log_operations *le_ops; }; +#define GBF_FULL 1 + struct gfs2_bitmap { struct buffer_head *bi_bh; char *bi_clone; + unsigned long bi_flags; u32 bi_offset; u32 bi_start; u32 bi_len; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fbacf09..23637b9 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; + bi->bi_flags = 0; /* small rgrp; bitmap stored completely in header block */ if (length == 1) { bytes = bytes_left; @@ -769,6 +770,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= GFS2_RDF_UPTODATE; } @@ -897,6 +900,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) continue; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); + clear_bit(GBF_FULL, &bi->bi_flags); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } @@ -1309,30 +1313,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, { struct gfs2_bitmap *bi = NULL; const u32 length = rgd->rd_length; - u32 blk = 0; + u32 blk = BFITNOENT; unsigned int buf, x; const unsigned int elen = *n; - const u8 *buffer; + const u8 *buffer = NULL; *n = 0; /* Find bitmap block that contains bits for goal block */ for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; - if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { + goal -= bi->bi_start * GFS2_NBBY; + goto do_search; + } } + buf = 0; + goal = 0; - gfs2_assert(rgd->rd_sbd, buf < length); - - /* Convert scope of "goal" from rgrp-wide to within found bit block */ - goal -= bi->bi_start * GFS2_NBBY; - +do_search: /* Search (up to entire) bitmap in this rgrp for allocatable block. "x <= length", instead of "x < length", because we typically start the search in the middle of a bit block, but if we can't find an allocatable block anywhere else, we want to be able wrap around and search in the first part of our first-searched bit block. */ for (x = 0; x <= length; x++) { + bi = rgd->rd_bits + buf; + + if (test_bit(GBF_FULL, &bi->bi_flags) && + (old_state == GFS2_BLKST_FREE)) + goto skip; + /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; @@ -1343,33 +1354,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, if (blk != BFITNOENT) break; + if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + set_bit(GBF_FULL, &bi->bi_flags); + /* Try next bitmap block (wrap back to rgrp header if at end) */ - buf = (buf + 1) % length; - bi = rgd->rd_bits + buf; +skip: + buf++; + buf %= length; goal = 0; } - if (blk != BFITNOENT && old_state != new_state) { - *n = 1; - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + if (blk == BFITNOENT) + return blk; + *n = 1; + if (old_state == new_state) + goto out; + + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi->bi_len, blk, new_state); + goal = blk; + while (*n < elen) { + goal++; + if (goal >= (bi->bi_len * GFS2_NBBY)) + break; + if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != + GFS2_BLKST_FREE) + break; gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi->bi_len, blk, new_state); - goal = blk; - while (*n < elen) { - goal++; - if (goal >= (bi->bi_len * GFS2_NBBY)) - break; - if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != - GFS2_BLKST_FREE) - break; - gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, - bi->bi_offset, bi->bi_len, goal, - new_state); - (*n)++; - } + bi->bi_len, goal, new_state); + (*n)++; } - - return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; +out: + return (bi->bi_start * GFS2_NBBY) + blk; } /** -- cgit v1.1 From 1ce97e564b628bee30b8dbb64e5e653a484308f6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 May 2009 15:18:19 +0100 Subject: GFS2: Be more aggressive in reclaiming unlinked inodes This patch increases the frequency with which gfs2 looks for unlinked, but still allocated inodes. Its the equivalent operation to ext3's orphan list, but done with bitmaps in the resource groups. This also fixes a bug where a field in the rgrp was too small. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 2 +- fs/gfs2/rgrp.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index dd87379..225347f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -94,7 +94,7 @@ struct gfs2_rgrpd { struct gfs2_sbd *rd_sbd; unsigned int rd_bh_count; u32 rd_last_alloc; - unsigned char rd_flags; + u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 23637b9..ee3d5c1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -581,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip, rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; - rgd->rd_flags |= GFS2_RDF_CHECK; return error; } @@ -703,6 +702,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) rg_flags = be32_to_cpu(str->rg_flags); rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); @@ -773,7 +774,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); - rgd->rd_flags |= GFS2_RDF_UPTODATE; + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); } spin_lock(&sdp->sd_rindex_spin); -- cgit v1.1 From 703a3b8e5c01cf6fb33c6d8dc99905f889a4e992 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 21 May 2009 22:21:53 +0000 Subject: [CIFS] fix posix open regression Posix open code was not properly adding the file to the list of open files. Fix allocating cifsFileInfo more than once, and adding twice to flist and tlist. Also fix mode setting to be done in one place in these paths. Signed-off-by: Steve French Reviewed-by: Shirish Pargaonkar Tested-by: Jeff Layton Tested-by: Luca Tettamanti --- fs/cifs/dir.c | 14 ++++++------- fs/cifs/file.c | 66 ++++++++++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 11431ed..f49d684 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -225,6 +225,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, if (!(oflags & FMODE_READ)) write_only = true; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, pnetfid, presp_data, &oplock, full_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & @@ -310,7 +311,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -336,7 +336,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, else /* success, no need to query */ goto cifs_create_set_dentry; } else if ((rc != -EIO) && (rc != -EREMOTE) && - (rc != -EOPNOTSUPP)) /* path not found or net err */ + (rc != -EOPNOTSUPP) && (rc != -EINVAL)) goto cifs_create_out; /* else fallthrough to retry, using older open call, this is case where server does not support this SMB level, and @@ -609,7 +609,6 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, int xid; int rc = 0; /* to get around spurious gcc warning, set to zero here */ int oplock = 0; - int mode; __u16 fileHandle = 0; bool posix_open = false; struct cifs_sb_info *cifs_sb; @@ -660,13 +659,12 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, if (pTcon->unix_ext) { if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && - (nd->flags & LOOKUP_OPEN)) { + (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open) { if (!((nd->intent.open.flags & O_CREAT) && (nd->intent.open.flags & O_EXCL))) { - mode = nd->intent.open.create_mode & - ~current_umask(); rc = cifs_posix_open(full_path, &newInode, - parent_dir_inode->i_sb, mode, + parent_dir_inode->i_sb, + nd->intent.open.create_mode, nd->intent.open.flags, &oplock, &fileHandle, xid); /* @@ -681,6 +679,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, */ if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) posix_open = true; + else + pTcon->broken_posix_open = true; } } if (!posix_open) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 38c06f8..302ea15 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -130,10 +130,6 @@ static inline int cifs_posix_open_inode_helper(struct inode *inode, struct cifsFileInfo *pCifsFile, int oplock, u16 netfid) { - file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - if (file->private_data == NULL) - return -ENOMEM; - pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); write_lock(&GlobalSMBSeslock); pCifsInode = CIFS_I(file->f_path.dentry->d_inode); @@ -184,6 +180,38 @@ psx_client_can_cache: return 0; } +static struct cifsFileInfo * +cifs_fill_filedata(struct file *file) +{ + struct list_head *tmp; + struct cifsFileInfo *pCifsFile = NULL; + struct cifsInodeInfo *pCifsInode = NULL; + + /* search inode for this file and fill in file->private_data */ + pCifsInode = CIFS_I(file->f_path.dentry->d_inode); + read_lock(&GlobalSMBSeslock); + list_for_each(tmp, &pCifsInode->openFileList) { + pCifsFile = list_entry(tmp, struct cifsFileInfo, flist); + if ((pCifsFile->pfile == NULL) && + (pCifsFile->pid == current->tgid)) { + /* mode set in cifs_create */ + + /* needed for writepage */ + pCifsFile->pfile = file; + file->private_data = pCifsFile; + break; + } + } + read_unlock(&GlobalSMBSeslock); + + if (file->private_data != NULL) { + return pCifsFile; + } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) + cERROR(1, ("could not find file instance for " + "new file %p", file)); + return NULL; +} + /* all arguments to this function must be checked for validity in caller */ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile, @@ -258,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file) struct cifsTconInfo *tcon; struct cifsFileInfo *pCifsFile; struct cifsInodeInfo *pCifsInode; - struct list_head *tmp; char *full_path = NULL; int desiredAccess; int disposition; @@ -270,32 +297,12 @@ int cifs_open(struct inode *inode, struct file *file) cifs_sb = CIFS_SB(inode->i_sb); tcon = cifs_sb->tcon; - /* search inode for this file and fill in file->private_data */ pCifsInode = CIFS_I(file->f_path.dentry->d_inode); - read_lock(&GlobalSMBSeslock); - list_for_each(tmp, &pCifsInode->openFileList) { - pCifsFile = list_entry(tmp, struct cifsFileInfo, - flist); - if ((pCifsFile->pfile == NULL) && - (pCifsFile->pid == current->tgid)) { - /* mode set in cifs_create */ - - /* needed for writepage */ - pCifsFile->pfile = file; - - file->private_data = pCifsFile; - break; - } - } - read_unlock(&GlobalSMBSeslock); - - if (file->private_data != NULL) { - rc = 0; + pCifsFile = cifs_fill_filedata(file); + if (pCifsFile) { FreeXid(xid); - return rc; - } else if ((file->f_flags & O_CREAT) && (file->f_flags & O_EXCL)) - cERROR(1, ("could not find file instance for " - "new file %p", file)); + return 0; + } full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -325,6 +332,7 @@ int cifs_open(struct inode *inode, struct file *file) /* no need for special case handling of setting mode on read only files needed here */ + pCifsFile = cifs_fill_filedata(file); cifs_posix_open_inode_helper(inode, file, pCifsInode, pCifsFile, oplock, netfid); goto out; -- cgit v1.1 From b9fc745db833bbf74b4988493b8cd902a84c9415 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 May 2009 13:25:57 -0400 Subject: integrity: path_check update - Add support in ima_path_check() for integrity checking without incrementing the counts. (Required for nfsd.) - rename and export opencount_get to ima_counts_get - replace ima_shm_check calls with ima_counts_get - export ima_path_check Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 5 +++-- fs/namei.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 998e856..618d6d1 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -130,7 +130,8 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; - error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN, + IMA_COUNT_UPDATE); if (error) goto exit; @@ -680,7 +681,7 @@ struct file *open_exec(const char *name) err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; - err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN, IMA_COUNT_UPDATE); if (err) goto out_path_put; diff --git a/fs/namei.c b/fs/namei.c index 78f253c..b05a2b1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (!err) - err = ima_path_check(&nd->path, MAY_EXEC); + err = ima_path_check(&nd->path, MAY_EXEC, + IMA_COUNT_UPDATE); if (err) break; @@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag) return error; error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_UPDATE); if (error) return error; /* -- cgit v1.1 From c9d9ac525a0285a5b5ad9c3f9aa8b7c1753e6121 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 May 2009 13:25:58 -0400 Subject: integrity: move ima_counts_get Based on discussion on lkml (Andrew Morton and Eric Paris), move ima_counts_get down a layer into shmem/hugetlb__file_setup(). Resolves drm shmem_file_setup() usage case as well. HD comment: I still think you're doing this at the wrong level, but recognize that you probably won't be persuaded until a few more users of alloc_file() emerge, all wanting your ima_counts_get(). Resolving GEM's shmem_file_setup() is an improvement, so I'll say Acked-by: Hugh Dickins Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/hugetlbfs/inode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 153d968..ccc62de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -997,6 +998,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) &hugetlbfs_file_operations); if (!file) goto out_dentry; /* inode is already attached */ + ima_counts_get(file); return file; -- cgit v1.1 From b1e71b0622974953e46a284aa986504a90869a9b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:01:55 +0100 Subject: GFS2: Clean up some file names This patch renames the ops_*.c files which have no counterpart without the ops_ prefix in order to shorten the name and make it more readable. In addition, ops_address.h (which was very small) is moved into inode.h and inode.h is cleaned up by adding extern where required. Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 2 +- fs/gfs2/aops.c | 1145 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/gfs2/bmap.c | 1 - fs/gfs2/dentry.c | 114 +++++ fs/gfs2/export.c | 285 ++++++++++++ fs/gfs2/file.c | 765 +++++++++++++++++++++++++++++++++ fs/gfs2/inode.c | 1 - fs/gfs2/inode.h | 57 +-- fs/gfs2/meta_io.c | 1 - fs/gfs2/ops_address.c | 1146 ------------------------------------------------- fs/gfs2/ops_address.h | 23 - fs/gfs2/ops_dentry.c | 114 ----- fs/gfs2/ops_export.c | 285 ------------ fs/gfs2/ops_file.c | 766 --------------------------------- fs/gfs2/quota.c | 1 - fs/gfs2/rgrp.c | 1 - 16 files changed, 2343 insertions(+), 2364 deletions(-) create mode 100644 fs/gfs2/aops.c create mode 100644 fs/gfs2/dentry.c create mode 100644 fs/gfs2/export.c create mode 100644 fs/gfs2/file.c delete mode 100644 fs/gfs2/ops_address.c delete mode 100644 fs/gfs2/ops_address.h delete mode 100644 fs/gfs2/ops_dentry.c delete mode 100644 fs/gfs2/ops_export.c delete mode 100644 fs/gfs2/ops_file.c (limited to 'fs') diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index a851ea4..4f7332c 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ - mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ + mount.o aops.o dentry.o export.o file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c new file mode 100644 index 0000000..03ebb43 --- /dev/null +++ b/fs/gfs2/aops.c @@ -0,0 +1,1145 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" +#include "glops.h" + + +static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, + unsigned int from, unsigned int to) +{ + struct buffer_head *head = page_buffers(page); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + unsigned int start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from || start >= to) + continue; + if (gfs2_is_jdata(ip)) + set_buffer_uptodate(bh); + gfs2_trans_add_bh(ip->i_gl, bh, 0); + } +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, bh_result, 0); + if (error) + return error; + if (!buffer_mapped(bh_result)) + return -EIO; + return 0; +} + +static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + return gfs2_block_map(inode, lblock, bh_result, 0); +} + +/** + * gfs2_writepage_common - Common bits of writepage + * @page: The page to be written + * @wbc: The writeback control + * + * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. + */ + +static int gfs2_writepage_common(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (current->journal_info) + goto redirty; + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + goto out; + } + return 1; +redirty: + redirty_page_for_writepage(wbc, page); +out: + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepage - Write page for writeback mappings + * @page: The page + * @wbc: The writeback control + * + */ + +static int gfs2_writeback_writepage(struct page *page, + struct writeback_control *wbc) +{ + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); + if (ret == -EAGAIN) + ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); + return ret; +} + +/** + * gfs2_ordered_writepage - Write page for ordered data files + * @page: The page to write + * @wbc: The writeback control + * + */ + +static int gfs2_ordered_writepage(struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + ret = gfs2_writepage_common(page, wbc); + if (ret <= 0) + return ret; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * __gfs2_jdata_writepage - The core of jdata writepage + * @page: The page to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * PageChecked will have been set and the transaction will have + * already been started before this is called. + */ + +static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (PageChecked(page)) { + ClearPageChecked(page); + if (!page_has_buffers(page)) { + create_empty_buffers(page, inode->i_sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); + } + return block_write_full_page(page, gfs2_get_block_noalloc, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + int ret; + int done_trans = 0; + + if (PageChecked(page)) { + if (wbc->sync_mode != WB_SYNC_ALL) + goto out_ignore; + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) + goto out_ignore; + done_trans = 1; + } + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); + if (done_trans) + gfs2_trans_end(sdp); + return ret; + +out_ignore: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +/** + * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * For the data=writeback case we can already ignore buffer heads + * and write whole extents at once. This is a big reduction in the + * number of I/O requests we send and the bmap calls we make in this case. + */ +static int gfs2_writeback_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); +} + +/** + * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages + * @mapping: The mapping + * @wbc: The writeback control + * @writepage: The writepage function to call for each page + * @pvec: The vector of pages + * @nr_pages: The number of pages to write + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_pagevec(struct address_space *mapping, + struct writeback_control *wbc, + struct pagevec *pvec, + int nr_pages, pgoff_t end) +{ + struct inode *inode = mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t i_size = i_size_read(inode); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset = i_size & (PAGE_CACHE_SIZE-1); + unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); + struct backing_dev_info *bdi = mapping->backing_dev_info; + int i; + int ret; + + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); + if (ret < 0) + return ret; + + for(i = 0; i < nr_pages; i++) { + struct page *page = pvec->pages[i]; + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + ret = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + /* Is the page fully outside i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + continue; + } + + ret = __gfs2_jdata_writepage(page, wbc); + + if (ret || (--(wbc->nr_to_write) <= 0)) + ret = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + ret = 1; + } + + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * @writepage: The writepage function to call + * @data: The data to pass to writepage + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. + */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } + +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + scanned = 1; + ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + + pagevec_release(&pvec); + cond_resched(); + } + + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = index; + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ + +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ + +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + void *kaddr; + int error; + + /* + * Due to the order of unstuffing files and ->fault(), we can be + * asked for a zero page in the case of a stuffed file being extended, + * so we need to supply one here. It doesn't happen often. + */ + if (unlikely(page->index)) { + zero_user(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + return 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_atomic(page, KM_USER0); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), + ip->i_disksize); + memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page); + brelse(dibh); + SetPageUptodate(page); + + return 0; +} + + +/** + * __gfs2_readpage - readpage + * @file: The file to read a page for + * @page: The page to read + * + * This is the core of gfs2's readpage. Its used by the internal file + * reading code as in that case we already hold the glock. Also its + * called by gfs2_readpage() once the required lock has been granted. + * + */ + +static int __gfs2_readpage(void *file, struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(page->mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + int error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, page); + unlock_page(page); + } else { + error = mpage_readpage(page, gfs2_block_map); + } + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + return error; +} + +/** + * gfs2_readpage - read a page of a file + * @file: The file to read + * @page: The page of the file + * + * This deals with the locking required. We have to unlock and + * relock the page in order to get the locking in the right + * order. + */ + +static int gfs2_readpage(struct file *file, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder gh; + int error; + + unlock_page(page); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (unlikely(error)) + goto out; + error = AOP_TRUNCATED_PAGE; + lock_page(page); + if (page->mapping == mapping && !PageUptodate(page)) + error = __gfs2_readpage(file, page); + else + unlock_page(page); + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (error && error != AOP_TRUNCATED_PAGE) + lock_page(page); + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @ra_state: The readahead state (or NULL for no readahead) + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos / PAGE_CACHE_SIZE; + unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + void *p; + + do { + amt = size - copied; + if (offset + size > PAGE_CACHE_SIZE) + amt = PAGE_CACHE_SIZE - offset; + page = read_cache_page(mapping, index, __gfs2_readpage, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + p = kmap_atomic(page, KM_USER0); + memcpy(buf + copied, p + offset, amt); + kunmap_atomic(p, KM_USER0); + mark_page_accessed(page); + page_cache_release(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; +} + +/** + * gfs2_readpages - Read a bunch of pages at once + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore any things + * which are slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. Its + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We don't handle stuffed files here we let readpage do the honours. + * 3. mpage_readpages() does most of the heavy lifting in the common case. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. + */ + +static int gfs2_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (unlikely(ret)) + goto out_uninit; + if (!gfs2_is_stuffed(ip)) + ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + ret = -EIO; + return ret; +} + +/** + * gfs2_write_begin - Begin to write to a file + * @file: The file to write to + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) + * + * Returns: errno + */ + +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + int alloc_required; + int error = 0; + struct gfs2_alloc *al; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct page *page; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); + if (unlikely(error)) + goto out_uninit; + + error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); + if (error) + goto out_unlock; + + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + + if (alloc_required) { + al = gfs2_alloc_get(ip); + if (!al) { + error = -ENOMEM; + goto out_unlock; + } + + error = gfs2_quota_lock_check(ip); + if (error) + goto out_alloc_put; + + al->al_requested = data_blocks + ind_blocks; + error = gfs2_inplace_reserve(ip); + if (error) + goto out_qunlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); + if (error) + goto out_trans_fail; + + error = -ENOMEM; + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); + *pagep = page; + if (unlikely(!page)) + goto out_endtrans; + + if (gfs2_is_stuffed(ip)) { + error = 0; + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + error = gfs2_unstuff_dinode(ip, page); + if (error == 0) + goto prepare_write; + } else if (!PageUptodate(page)) { + error = stuffed_readpage(ip, page); + } + goto out; + } + +prepare_write: + error = block_prepare_write(page, from, to, gfs2_block_map); +out: + if (error == 0) + return 0; + + page_cache_release(page); + if (pos + len > ip->i_inode.i_size) + vmtruncate(&ip->i_inode, ip->i_inode.i_size); +out_endtrans: + gfs2_trans_end(sdp); +out_trans_fail: + if (alloc_required) { + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); + } +out_unlock: + gfs2_glock_dq(&ip->i_gh); +out_uninit: + gfs2_holder_uninit(&ip->i_gh); + return error; +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +static void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + u64 fs_total, new_free; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + + spin_lock(&sdp->sd_statfs_spin); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); +} + +/** + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. + * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (copied) { + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_disksize = inode->i_size; + } + gfs2_dinode_out(ip, di); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + brelse(dibh); + gfs2_trans_end(sdp); + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end + * @file: The file to write to + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. + * + * Returns: errno + */ + +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh; + struct gfs2_alloc *al = ip->i_alloc; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; + + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); + + if (!gfs2_is_writeback(ip)) + gfs2_page_add_databufs(ip, page, from, to); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret > 0) { + if (inode->i_size > ip->i_disksize) + ip->i_disksize = inode->i_size; + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(inode); + } + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + brelse(dibh); + gfs2_trans_end(sdp); +failed: + if (al) { + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + gfs2_alloc_put(ip); + } + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return ret; +} + +/** + * gfs2_set_page_dirty - Page dirtying function + * @page: The page to dirty + * + * Returns: 1 if it dirtyed the page, or 0 otherwise + */ + +static int gfs2_set_page_dirty(struct page *page) +{ + SetPageChecked(page); + return __set_page_dirty_buffers(page); +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + clear_buffer_dirty(bh); + bd = bh->b_private; + if (bd) { + if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + gfs2_remove_from_journal(bh, current->journal_info, 0); + } + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void gfs2_invalidatepage(struct page *page, unsigned long offset) +{ + struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct buffer_head *bh, *head; + unsigned long pos = 0; + + BUG_ON(!PageLocked(page)); + if (offset == 0) + ClearPageChecked(page); + if (!page_has_buffers(page)) + goto out; + + bh = head = page_buffers(page); + do { + if (offset <= pos) + gfs2_discard(sdp, bh); + pos += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +out: + if (offset == 0) + try_to_release_page(page, 0); +} + +/** + * gfs2_ok_for_dio - check that dio is valid on this file + * @ip: The inode + * @rw: READ or WRITE + * @offset: The offset at which we are reading or writing + * + * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) + * 1 (to accept the i/o request) + */ +static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +{ + /* + * Should we return an error here? I can't see that O_DIRECT for + * a stuffed file makes any sense. For now we'll silently fall + * back to buffered I/O + */ + if (gfs2_is_stuffed(ip)) + return 0; + + if (offset >= i_size_read(&ip->i_inode)) + return 0; + return 1; +} + + + +static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int rv; + + /* + * Deferred lock, even if its a write, since we do no allocation + * on this path. All we need change is atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately have the option of only flushing a range like + * the VFS does. + */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); + if (rv) + return rv; + rv = gfs2_ok_for_dio(ip, rw, offset); + if (rv != 1) + goto out; /* dio not valid, fall back to buffered i/o */ + + rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, + iov, offset, nr_segs, + gfs2_get_block_direct, NULL); +out: + gfs2_glock_dq_m(1, &gh); + gfs2_holder_uninit(&gh); + return rv; +} + +/** + * gfs2_releasepage - free the metadata associated with a page + * @page: the page that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Call try_to_free_buffers() if the buffers in this page can be + * released. + * + * Returns: 0 + */ + +int gfs2_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct inode *aspace = page->mapping->host; + struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + + if (!page_has_buffers(page)) + return 0; + + gfs2_log_lock(sdp); + head = bh = page_buffers(page); + do { + if (atomic_read(&bh->b_count)) + goto cannot_release; + bd = bh->b_private; + if (bd && bd->bd_ail) + goto cannot_release; + gfs2_assert_warn(sdp, !buffer_pinned(bh)); + gfs2_assert_warn(sdp, !buffer_dirty(bh)); + bh = bh->b_this_page; + } while(bh != head); + gfs2_log_unlock(sdp); + + head = bh = page_buffers(page); + do { + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); + if (!list_empty(&bd->bd_le.le_list)) { + if (!buffer_pinned(bh)) + list_del_init(&bd->bd_le.le_list); + else + bd = NULL; + } + if (bd) + bd->bd_bh = NULL; + bh->b_private = NULL; + } + gfs2_log_unlock(sdp); + if (bd) + kmem_cache_free(gfs2_bufdata_cachep, bd); + + bh = bh->b_this_page; + } while (bh != head); + + return try_to_free_buffers(page); +cannot_release: + gfs2_log_unlock(sdp); + return 0; +} + +static const struct address_space_operations gfs2_writeback_aops = { + .writepage = gfs2_writeback_writepage, + .writepages = gfs2_writeback_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations gfs2_ordered_aops = { + .writepage = gfs2_ordered_writepage, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .direct_IO = gfs2_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .readpage = gfs2_readpage, + .readpages = gfs2_readpages, + .sync_page = block_sync_page, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, + .set_page_dirty = gfs2_set_page_dirty, + .bmap = gfs2_bmap, + .invalidatepage = gfs2_invalidatepage, + .releasepage = gfs2_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, +}; + +void gfs2_set_aops(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_writeback(ip)) + inode->i_mapping->a_ops = &gfs2_writeback_aops; + else if (gfs2_is_ordered(ip)) + inode->i_mapping->a_ops = &gfs2_ordered_aops; + else if (gfs2_is_jdata(ip)) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + BUG(); +} + diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 253e1a3..1153a07 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -25,7 +25,6 @@ #include "trans.h" #include "dir.h" #include "util.h" -#include "ops_address.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. It saves stack space to diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c new file mode 100644 index 0000000..022c66c --- /dev/null +++ b/fs/gfs2/dentry.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "super.h" +#include "util.h" +#include "inode.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @nd: + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. + * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct dentry *parent = dget_parent(dentry); + struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); + struct gfs2_inode *dip = GFS2_I(parent->d_inode); + struct inode *inode = dentry->d_inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip = NULL; + int error; + int had_lock = 0; + + if (inode) { + if (is_bad_inode(inode)) + goto invalid; + ip = GFS2_I(inode); + } + + if (sdp->sd_args.ar_localcaching) + goto valid; + + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto fail; + } + + error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + switch (error) { + case 0: + if (!inode) + goto invalid_gunlock; + break; + case -ENOENT: + if (!inode) + goto valid_gunlock; + goto invalid_gunlock; + default: + goto fail_gunlock; + } + +valid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +valid: + dput(parent); + return 1; + +invalid_gunlock: + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +invalid: + if (inode && S_ISDIR(inode->i_mode)) { + if (have_submounts(dentry)) + goto valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); + dput(parent); + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&d_gh); +fail: + dput(parent); + return 0; +} + +static int gfs2_dhash(struct dentry *dentry, struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +const struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, +}; + diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c new file mode 100644 index 0000000..9200ef2 --- /dev/null +++ b/fs/gfs2/export.c @@ -0,0 +1,285 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "super.h" +#include "rgrp.h" +#include "util.h" + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 + +static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, + int connectable) +{ + __be32 *fh = (__force __be32 *)p; + struct inode *inode = dentry->d_inode; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (*len < GFS2_SMALL_FH_SIZE || + (connectable && *len < GFS2_LARGE_FH_SIZE)) + return 255; + + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_SMALL_FH_SIZE; + + if (!connectable || inode == sb->s_root->d_inode) + return *len; + + spin_lock(&dentry->d_lock); + inode = dentry->d_parent->d_inode; + ip = GFS2_I(inode); + igrab(inode); + spin_unlock(&dentry->d_lock); + + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_LARGE_FH_SIZE; + + iput(inode); + + return *len; +} + +struct get_name_filldir { + struct gfs2_inum_host inum; + char *name; +}; + +static int get_name_filldir(void *opaque, const char *name, int length, + loff_t offset, u64 inum, unsigned int type) +{ + struct get_name_filldir *gnfd = opaque; + + if (inum != gnfd->inum.no_addr) + return 0; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return 1; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = parent->d_inode; + struct inode *inode = child->d_inode; + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd; + struct gfs2_holder gh; + u64 offset = 0; + int error; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; + gnfd.name = name; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + struct qstr dotdot; + struct dentry *dentry; + + /* + * XXX(hch): it would be a good idea to keep this around as a + * static variable. + */ + gfs2_str2qstr(&dotdot, ".."); + + dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); + if (!IS_ERR(dentry)) + dentry->d_op = &gfs2_dops; + return dentry; +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, + struct gfs2_inum_host *inum) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_holder i_gh, ri_gh, rgd_gh; + struct gfs2_rgrpd *rgd; + struct inode *inode; + struct dentry *dentry; + int error; + + /* System files? */ + + inode = gfs2_ilookup(sb, inum->no_addr); + if (inode) { + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + goto out_inode; + } + + error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return ERR_PTR(error); + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + error = -ESTALE; + if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) + goto fail_rgd; + + gfs2_glock_dq_uninit(&rgd_gh); + gfs2_glock_dq_uninit(&ri_gh); + + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, + inum->no_addr, + 0, 0); + if (IS_ERR(inode)) { + error = PTR_ERR(inode); + goto fail; + } + + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) { + iput(inode); + goto fail; + } + + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); + + if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { + iput(inode); + goto fail; + } + + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { + iput(inode); + goto fail; + } + + gfs2_glock_dq_uninit(&i_gh); + +out_inode: + dentry = d_obtain_alias(inode); + if (!IS_ERR(dentry)) + dentry->d_op = &gfs2_dops; + return dentry; + +fail_rgd: + gfs2_glock_dq_uninit(&rgd_gh); + +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); + +fail: + gfs2_glock_dq_uninit(&i_gh); + return ERR_PTR(error); +} + +static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host this; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_SMALL_FH_SIZE: + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this.no_addr |= be32_to_cpu(fh[3]); + return gfs2_get_dentry(sb, &this); + default: + return NULL; + } +} + +static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host parent; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + return gfs2_get_dentry(sb, &parent); + default: + return NULL; + } +} + +const struct export_operations gfs2_export_ops = { + .encode_fh = gfs2_encode_fh, + .fh_to_dentry = gfs2_fh_to_dentry, + .fh_to_parent = gfs2_fh_to_parent, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, +}; + diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c new file mode 100644 index 0000000..73b6f55 --- /dev/null +++ b/fs/gfs2/file.c @@ -0,0 +1,765 @@ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "eaops.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + if (origin == 2) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek_unlocked(file, offset, origin); + gfs2_glock_dq_uninit(&i_gh); + } + } else + error = generic_file_llseek_unlocked(file, offset, origin); + + return error; +} + +/** + * gfs2_readdir - Read directory entries from a directory + * @file: The directory to read from + * @dirent: Buffer for dirents + * @filldir: Function used to do the copying + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + u64 offset = file->f_pos; + int error; + + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); + if (error) { + gfs2_holder_uninit(&d_gh); + return error; + } + + error = gfs2_dir_read(dir, &offset, dirent, filldir); + + gfs2_glock_dq_uninit(&d_gh); + + file->f_pos = offset; + + return error; +} + +/** + * fsflags_cvt + * @table: A table of 32 u32 flags + * @val: a 32 bit value to convert + * + * This function can be used to convert between fsflags values and + * GFS2's own flags values. + * + * Returns: the converted flags + */ +static u32 fsflags_cvt(const u32 *table, u32 val) +{ + u32 res = 0; + while(val) { + if (val & 1) + res |= *table; + table++; + val >>= 1; + } + return res; +} + +static const u32 fsflags_to_gfs2[32] = { + [3] = GFS2_DIF_SYNC, + [4] = GFS2_DIF_IMMUTABLE, + [5] = GFS2_DIF_APPENDONLY, + [7] = GFS2_DIF_NOATIME, + [12] = GFS2_DIF_EXHASH, + [14] = GFS2_DIF_INHERIT_JDATA, +}; + +static const u32 gfs2_to_fsflags[32] = { + [gfs2fl_Sync] = FS_SYNC_FL, + [gfs2fl_Immutable] = FS_IMMUTABLE_FL, + [gfs2fl_AppendOnly] = FS_APPEND_FL, + [gfs2fl_NoAtime] = FS_NOATIME_FL, + [gfs2fl_ExHash] = FS_INDEX_FL, + [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, +}; + +static int gfs2_get_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + return error; + + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) + fsflags |= FS_JOURNAL_DATA_FL; + if (put_user(fsflags, ptr)) + error = -EFAULT; + + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_SYSTEM| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * gfs2_set_flags - set flags on an inode + * @inode: The inode + * @flags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = mnt_want_write(filp->f_path.mnt); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + goto out_drop_write; + + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ flags) == 0) + goto out; + + error = -EINVAL; + if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) + goto out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) + goto out; + if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) + goto out; + if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && + !capable(CAP_LINUX_IMMUTABLE)) + goto out; + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(inode, MAY_WRITE); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + gfs2_trans_add_bh(ip->i_gl, bh, 1); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); +out_drop_write: + mnt_drop_write(filp->f_path.mnt); + return error; +} + +static int gfs2_set_flags(struct file *filp, u32 __user *ptr) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + u32 fsflags, gfsflags; + if (get_user(fsflags, ptr)) + return -EFAULT; + gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); + if (!S_ISDIR(inode->i_mode)) { + if (gfsflags & GFS2_DIF_INHERIT_JDATA) + gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~0); + } + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FS_IOC_GETFLAGS: + return gfs2_get_flags(filp, (u32 __user *)arg); + case FS_IOC_SETFLAGS: + return gfs2_set_flags(filp, (u32 __user *)arg); + } + return -ENOTTY; +} + +/** + * gfs2_allocate_page_backing - Use bmap to allocate blocks + * @page: The (locked) page to allocate backing for + * + * We try to allocate all the blocks required for the page in + * one go. This might fail for various reasons, so we keep + * trying until all the blocks to back this page are allocated. + * If some of the blocks are already allocated, thats ok too. + */ + +static int gfs2_allocate_page_backing(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct buffer_head bh; + unsigned long size = PAGE_CACHE_SIZE; + u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(inode, lblock, &bh, 1); + if (!buffer_mapped(&bh)) + return -EIO; + size -= bh.b_size; + lblock += (bh.b_size >> inode->i_blkbits); + } while(size > 0); + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vma: The virtual memory area + * @page: The page which is about to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned long last_index; + u64 pos = page->index << PAGE_CACHE_SHIFT; + unsigned int data_blocks, ind_blocks, rblocks; + int alloc_required = 0; + struct gfs2_holder gh; + struct gfs2_alloc *al; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out; + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); + if (ret || !alloc_required) + goto out_unlock; + ret = -ENOMEM; + al = gfs2_alloc_get(ip); + if (al == NULL) + goto out_unlock; + + ret = gfs2_quota_lock_check(ip); + if (ret) + goto out_alloc_put; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); + al->al_requested = data_blocks + ind_blocks; + ret = gfs2_inplace_reserve(ip); + if (ret) + goto out_quota_unlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? data_blocks : 1; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + ret = gfs2_trans_begin(sdp, rblocks, 0); + if (ret) + goto out_trans_fail; + + lock_page(page); + ret = -EINVAL; + last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) + goto out_unlock_page; + ret = 0; + if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) + goto out_unlock_page; + if (gfs2_is_stuffed(ip)) { + ret = gfs2_unstuff_dinode(ip, page); + if (ret) + goto out_unlock_page; + } + ret = gfs2_allocate_page_backing(page); + +out_unlock_page: + unlock_page(page); + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_alloc_put: + gfs2_alloc_put(ip); +out_unlock: + gfs2_glock_dq(&gh); +out: + gfs2_holder_uninit(&gh); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else if (ret) + ret = VM_FAULT_SIGBUS; + return ret; +} + +static struct vm_operations_struct gfs2_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = gfs2_page_mkwrite, +}; + +/** + * gfs2_mmap - + * @file: The file to map + * @vma: The VMA which described the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME)) { + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + file_accessed(file); + if (error == 0) + gfs2_glock_dq_uninit(&i_gh); + } + vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; + + return 0; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct gfs2_file *fp; + int error; + + fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + goto fail; + + if (!(file->f_flags & O_LARGEFILE) && + ip->i_disksize > MAX_NON_LFS) { + error = -EOVERFLOW; + goto fail_gunlock; + } + + gfs2_glock_dq_uninit(&i_gh); + } + + return 0; + +fail_gunlock: + gfs2_glock_dq_uninit(&i_gh); +fail: + file->private_data = NULL; + kfree(fp); + return error; +} + +/** + * gfs2_close - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_close(struct inode *inode, struct file *file) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_file *fp; + + fp = file->private_data; + file->private_data = NULL; + + if (gfs2_assert_warn(sdp, fp)) + return -EIO; + + kfree(fp); + + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry (we ignore this) + * @dentry: the dentry that points to the inode to sync + * + * The VFS will flush "normal" data for us. We only need to worry + * about metadata here. For journaled data, we just do a log flush + * as we can't avoid it. Otherwise we can just bale out if datasync + * is set. For stuffed inodes we must flush the log in order to + * ensure that all data is on disk. + * + * The call to write_inode_now() is there to write back metadata and + * the inode itself. It does also try and write the data, but thats + * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite() + * for us. + * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); + int ret = 0; + + if (gfs2_is_jdata(GFS2_I(inode))) { + gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); + return 0; + } + + if (sync_state != 0) { + if (!datasync) + ret = write_inode_now(inode, 0); + + if (gfs2_is_stuffed(GFS2_I(inode))) + gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); + } + + return ret; +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_setlease - acquire/release a file lease + * @file: the file pointer + * @arg: lease type + * @fl: file lock + * + * We don't currently have a way to enforce a lease across the whole + * cluster; until we do, disable leases (by just returning -EINVAL), + * unless the administrator has requested purely local locking. + * + * Returns: errno + */ + +static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) +{ + return -EINVAL; +} + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode)) + return -ENOLCK; + + if (cmd == F_CANCELLK) { + /* Hack: */ + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); + struct gfs2_glock *gl; + unsigned int state; + int flags; + int error = 0; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; + + mutex_lock(&fp->f_fl_mutex); + + gl = fl_gh->gh_gl; + if (gl) { + if (fl_gh->gh_state == state) + goto out; + flock_lock_file_wait(file, + &(struct file_lock){.fl_type = F_UNLCK}); + gfs2_glock_dq_wait(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + gfs2_holder_init(gl, state, flags, fl_gh); + gfs2_glock_put(gl); + } + error = gfs2_glock_nq(fl_gh); + if (error) { + gfs2_holder_uninit(fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = flock_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + flock_lock_file_wait(file, fl); + if (fl_gh->gh_gl) + gfs2_glock_dq_uninit(fl_gh); + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + if (__mandatory_lock(&ip->i_inode)) + return -ENOLCK; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = gfs2_setlease, +}; + +const struct file_operations gfs2_dir_fops = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .write = do_sync_write, + .aio_write = generic_file_aio_write, + .unlocked_ioctl = gfs2_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, + .setlease = generic_setlease, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .readdir = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .open = gfs2_open, + .release = gfs2_close, + .fsync = gfs2_fsync, +}; + diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5a31d42..c03a1a3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -30,7 +30,6 @@ #include "inode.h" #include "log.h" #include "meta_io.h" -#include "ops_address.h" #include "quota.h" #include "rgrp.h" #include "trans.h" diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c30be2b..2c3ec07 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -11,8 +11,16 @@ #define __INODE_DOT_H__ #include +#include +#include #include "util.h" +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); + static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { return !ip->i_height; @@ -73,30 +81,31 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, } -void gfs2_set_iop(struct inode *inode); -struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); - -int gfs2_inode_refresh(struct gfs2_inode *ip); - -int gfs2_dinode_dealloc(struct gfs2_inode *inode); -int gfs2_change_nlink(struct gfs2_inode *ip, int diff); -struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root); -struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, - unsigned int mode, dev_t dev); -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); -int gfs2_permission(struct inode *inode, int mask); -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); -struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -void gfs2_dinode_print(const struct gfs2_inode *ip); +extern void gfs2_set_iop(struct inode *inode); +extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + int skip_freeing); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern int gfs2_dinode_dealloc(struct gfs2_inode *inode); +extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff); +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern struct inode *gfs2_createi(struct gfs2_holder *ghs, + const struct qstr *name, + unsigned int mode, dev_t dev); +extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip); +extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); +extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 78a5f43..cb8d7a9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -31,7 +31,6 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "ops_address.h" static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c deleted file mode 100644 index e566421..0000000 --- a/fs/gfs2/ops_address.c +++ /dev/null @@ -1,1146 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "bmap.h" -#include "glock.h" -#include "inode.h" -#include "log.h" -#include "meta_io.h" -#include "ops_address.h" -#include "quota.h" -#include "trans.h" -#include "rgrp.h" -#include "super.h" -#include "util.h" -#include "glops.h" - - -static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, - unsigned int from, unsigned int to) -{ - struct buffer_head *head = page_buffers(page); - unsigned int bsize = head->b_size; - struct buffer_head *bh; - unsigned int start, end; - - for (bh = head, start = 0; bh != head || !start; - bh = bh->b_this_page, start = end) { - end = start + bsize; - if (end <= from || start >= to) - continue; - if (gfs2_is_jdata(ip)) - set_buffer_uptodate(bh); - gfs2_trans_add_bh(ip->i_gl, bh, 0); - } -} - -/** - * gfs2_get_block_noalloc - Fills in a buffer head with details about a block - * @inode: The inode - * @lblock: The block number to look up - * @bh_result: The buffer head to return the result in - * @create: Non-zero if we may add block to the file - * - * Returns: errno - */ - -static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - int error; - - error = gfs2_block_map(inode, lblock, bh_result, 0); - if (error) - return error; - if (!buffer_mapped(bh_result)) - return -EIO; - return 0; -} - -static int gfs2_get_block_direct(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) -{ - return gfs2_block_map(inode, lblock, bh_result, 0); -} - -/** - * gfs2_writepage_common - Common bits of writepage - * @page: The page to be written - * @wbc: The writeback control - * - * Returns: 1 if writepage is ok, otherwise an error code or zero if no error. - */ - -static int gfs2_writepage_common(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; - - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) - goto out; - if (current->journal_info) - goto redirty; - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - goto out; - } - return 1; -redirty: - redirty_page_for_writepage(wbc, page); -out: - unlock_page(page); - return 0; -} - -/** - * gfs2_writeback_writepage - Write page for writeback mappings - * @page: The page - * @wbc: The writeback control - * - */ - -static int gfs2_writeback_writepage(struct page *page, - struct writeback_control *wbc) -{ - int ret; - - ret = gfs2_writepage_common(page, wbc); - if (ret <= 0) - return ret; - - ret = mpage_writepage(page, gfs2_get_block_noalloc, wbc); - if (ret == -EAGAIN) - ret = block_write_full_page(page, gfs2_get_block_noalloc, wbc); - return ret; -} - -/** - * gfs2_ordered_writepage - Write page for ordered data files - * @page: The page to write - * @wbc: The writeback control - * - */ - -static int gfs2_ordered_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - int ret; - - ret = gfs2_writepage_common(page, wbc); - if (ret <= 0) - return ret; - - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - gfs2_page_add_databufs(ip, page, 0, inode->i_sb->s_blocksize-1); - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); -} - -/** - * __gfs2_jdata_writepage - The core of jdata writepage - * @page: The page to write - * @wbc: The writeback control - * - * This is shared between writepage and writepages and implements the - * core of the writepage operation. If a transaction is required then - * PageChecked will have been set and the transaction will have - * already been started before this is called. - */ - -static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - - if (PageChecked(page)) { - ClearPageChecked(page); - if (!page_has_buffers(page)) { - create_empty_buffers(page, inode->i_sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize-1); - } - return block_write_full_page(page, gfs2_get_block_noalloc, wbc); -} - -/** - * gfs2_jdata_writepage - Write complete page - * @page: Page to write - * - * Returns: errno - * - */ - -static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - struct gfs2_sbd *sdp = GFS2_SB(inode); - int ret; - int done_trans = 0; - - if (PageChecked(page)) { - if (wbc->sync_mode != WB_SYNC_ALL) - goto out_ignore; - ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (ret) - goto out_ignore; - done_trans = 1; - } - ret = gfs2_writepage_common(page, wbc); - if (ret > 0) - ret = __gfs2_jdata_writepage(page, wbc); - if (done_trans) - gfs2_trans_end(sdp); - return ret; - -out_ignore: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - -/** - * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk - * @mapping: The mapping to write - * @wbc: Write-back control - * - * For the data=writeback case we can already ignore buffer heads - * and write whole extents at once. This is a big reduction in the - * number of I/O requests we send and the bmap calls we make in this case. - */ -static int gfs2_writeback_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); -} - -/** - * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages - * @mapping: The mapping - * @wbc: The writeback control - * @writepage: The writepage function to call for each page - * @pvec: The vector of pages - * @nr_pages: The number of pages to write - * - * Returns: non-zero if loop should terminate, zero otherwise - */ - -static int gfs2_write_jdata_pagevec(struct address_space *mapping, - struct writeback_control *wbc, - struct pagevec *pvec, - int nr_pages, pgoff_t end) -{ - struct inode *inode = mapping->host; - struct gfs2_sbd *sdp = GFS2_SB(inode); - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset = i_size & (PAGE_CACHE_SIZE-1); - unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize); - struct backing_dev_info *bdi = mapping->backing_dev_info; - int i; - int ret; - - ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); - if (ret < 0) - return ret; - - for(i = 0; i < nr_pages; i++) { - struct page *page = pvec->pages[i]; - - lock_page(page); - - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - continue; - } - - if (!wbc->range_cyclic && page->index > end) { - ret = 1; - unlock_page(page); - continue; - } - - if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - - if (PageWriteback(page) || - !clear_page_dirty_for_io(page)) { - unlock_page(page); - continue; - } - - /* Is the page fully outside i_size? (truncate in progress) */ - if (page->index > end_index || (page->index == end_index && !offset)) { - page->mapping->a_ops->invalidatepage(page, 0); - unlock_page(page); - continue; - } - - ret = __gfs2_jdata_writepage(page, wbc); - - if (ret || (--(wbc->nr_to_write) <= 0)) - ret = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - ret = 1; - } - - } - gfs2_trans_end(sdp); - return ret; -} - -/** - * gfs2_write_cache_jdata - Like write_cache_pages but different - * @mapping: The mapping to write - * @wbc: The writeback control - * @writepage: The writepage function to call - * @data: The data to pass to writepage - * - * The reason that we use our own function here is that we need to - * start transactions before we grab page locks. This allows us - * to get the ordering right. - */ - -static int gfs2_write_cache_jdata(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct backing_dev_info *bdi = mapping->backing_dev_info; - int ret = 0; - int done = 0; - struct pagevec pvec; - int nr_pages; - pgoff_t index; - pgoff_t end; - int scanned = 0; - int range_whole = 0; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - - pagevec_init(&pvec, 0); - if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ - end = -1; - } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - scanned = 1; - } - -retry: - while (!done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { - scanned = 1; - ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end); - if (ret) - done = 1; - if (ret > 0) - ret = 0; - - pagevec_release(&pvec); - cond_resched(); - } - - if (!scanned && !done) { - /* - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - scanned = 1; - index = 0; - goto retry; - } - - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; - return ret; -} - - -/** - * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk - * @mapping: The mapping to write - * @wbc: The writeback control - * - */ - -static int gfs2_jdata_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - int ret; - - ret = gfs2_write_cache_jdata(mapping, wbc); - if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { - gfs2_log_flush(sdp, ip->i_gl); - ret = gfs2_write_cache_jdata(mapping, wbc); - } - return ret; -} - -/** - * stuffed_readpage - Fill in a Linux page with stuffed file data - * @ip: the inode - * @page: the page - * - * Returns: errno - */ - -static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) -{ - struct buffer_head *dibh; - void *kaddr; - int error; - - /* - * Due to the order of unstuffing files and ->fault(), we can be - * asked for a zero page in the case of a stuffed file being extended, - * so we need to supply one here. It doesn't happen often. - */ - if (unlikely(page->index)) { - zero_user(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); - return 0; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - kaddr = kmap_atomic(page, KM_USER0); - memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_disksize); - memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); - kunmap_atomic(kaddr, KM_USER0); - flush_dcache_page(page); - brelse(dibh); - SetPageUptodate(page); - - return 0; -} - - -/** - * __gfs2_readpage - readpage - * @file: The file to read a page for - * @page: The page to read - * - * This is the core of gfs2's readpage. Its used by the internal file - * reading code as in that case we already hold the glock. Also its - * called by gfs2_readpage() once the required lock has been granted. - * - */ - -static int __gfs2_readpage(void *file, struct page *page) -{ - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - int error; - - if (gfs2_is_stuffed(ip)) { - error = stuffed_readpage(ip, page); - unlock_page(page); - } else { - error = mpage_readpage(page, gfs2_block_map); - } - - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - return -EIO; - - return error; -} - -/** - * gfs2_readpage - read a page of a file - * @file: The file to read - * @page: The page of the file - * - * This deals with the locking required. We have to unlock and - * relock the page in order to get the locking in the right - * order. - */ - -static int gfs2_readpage(struct file *file, struct page *page) -{ - struct address_space *mapping = page->mapping; - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_holder gh; - int error; - - unlock_page(page); - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - error = gfs2_glock_nq(&gh); - if (unlikely(error)) - goto out; - error = AOP_TRUNCATED_PAGE; - lock_page(page); - if (page->mapping == mapping && !PageUptodate(page)) - error = __gfs2_readpage(file, page); - else - unlock_page(page); - gfs2_glock_dq(&gh); -out: - gfs2_holder_uninit(&gh); - if (error && error != AOP_TRUNCATED_PAGE) - lock_page(page); - return error; -} - -/** - * gfs2_internal_read - read an internal file - * @ip: The gfs2 inode - * @ra_state: The readahead state (or NULL for no readahead) - * @buf: The buffer to fill - * @pos: The file position - * @size: The amount to read - * - */ - -int gfs2_internal_read(struct gfs2_inode *ip, struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size) -{ - struct address_space *mapping = ip->i_inode.i_mapping; - unsigned long index = *pos / PAGE_CACHE_SIZE; - unsigned offset = *pos & (PAGE_CACHE_SIZE - 1); - unsigned copied = 0; - unsigned amt; - struct page *page; - void *p; - - do { - amt = size - copied; - if (offset + size > PAGE_CACHE_SIZE) - amt = PAGE_CACHE_SIZE - offset; - page = read_cache_page(mapping, index, __gfs2_readpage, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - p = kmap_atomic(page, KM_USER0); - memcpy(buf + copied, p + offset, amt); - kunmap_atomic(p, KM_USER0); - mark_page_accessed(page); - page_cache_release(page); - copied += amt; - index++; - offset = 0; - } while(copied < size); - (*pos) += size; - return size; -} - -/** - * gfs2_readpages - Read a bunch of pages at once - * - * Some notes: - * 1. This is only for readahead, so we can simply ignore any things - * which are slightly inconvenient (such as locking conflicts between - * the page lock and the glock) and return having done no I/O. Its - * obviously not something we'd want to do on too regular a basis. - * Any I/O we ignore at this time will be done via readpage later. - * 2. We don't handle stuffed files here we let readpage do the honours. - * 3. mpage_readpages() does most of the heavy lifting in the common case. - * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. - */ - -static int gfs2_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - struct inode *inode = mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (unlikely(ret)) - goto out_uninit; - if (!gfs2_is_stuffed(ip)) - ret = mpage_readpages(mapping, pages, nr_pages, gfs2_block_map); - gfs2_glock_dq(&gh); -out_uninit: - gfs2_holder_uninit(&gh); - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - ret = -EIO; - return ret; -} - -/** - * gfs2_write_begin - Begin to write to a file - * @file: The file to write to - * @mapping: The mapping in which to write - * @pos: The file offset at which to start writing - * @len: Length of the write - * @flags: Various flags - * @pagep: Pointer to return the page - * @fsdata: Pointer to return fs data (unused by GFS2) - * - * Returns: errno - */ - -static int gfs2_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - int alloc_required; - int error = 0; - struct gfs2_alloc *al; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned from = pos & (PAGE_CACHE_SIZE - 1); - unsigned to = from + len; - struct page *page; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (unlikely(error)) - goto out_uninit; - - error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); - if (error) - goto out_unlock; - - if (alloc_required || gfs2_is_jdata(ip)) - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - - if (alloc_required) { - al = gfs2_alloc_get(ip); - if (!al) { - error = -ENOMEM; - goto out_unlock; - } - - error = gfs2_quota_lock_check(ip); - if (error) - goto out_alloc_put; - - al->al_requested = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip); - if (error) - goto out_qunlock; - } - - rblocks = RES_DINODE + ind_blocks; - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - if (ind_blocks || data_blocks) - rblocks += RES_STATFS + RES_QUOTA; - - error = gfs2_trans_begin(sdp, rblocks, - PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); - if (error) - goto out_trans_fail; - - error = -ENOMEM; - flags |= AOP_FLAG_NOFS; - page = grab_cache_page_write_begin(mapping, index, flags); - *pagep = page; - if (unlikely(!page)) - goto out_endtrans; - - if (gfs2_is_stuffed(ip)) { - error = 0; - if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { - error = gfs2_unstuff_dinode(ip, page); - if (error == 0) - goto prepare_write; - } else if (!PageUptodate(page)) { - error = stuffed_readpage(ip, page); - } - goto out; - } - -prepare_write: - error = block_prepare_write(page, from, to, gfs2_block_map); -out: - if (error == 0) - return 0; - - page_cache_release(page); - if (pos + len > ip->i_inode.i_size) - vmtruncate(&ip->i_inode, ip->i_inode.i_size); -out_endtrans: - gfs2_trans_end(sdp); -out_trans_fail: - if (alloc_required) { - gfs2_inplace_release(ip); -out_qunlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); - } -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - -/** - * adjust_fs_space - Adjusts the free space available due to gfs2_grow - * @inode: the rindex inode - */ -static void adjust_fs_space(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - u64 fs_total, new_free; - - /* Total up the file system space, according to the latest rindex. */ - fs_total = gfs2_ri_total(sdp); - - spin_lock(&sdp->sd_statfs_spin); - if (fs_total > (m_sc->sc_total + l_sc->sc_total)) - new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); - else - new_free = 0; - spin_unlock(&sdp->sd_statfs_spin); - fs_warn(sdp, "File system extended by %llu blocks.\n", - (unsigned long long)new_free); - gfs2_statfs_change(sdp, new_free, new_free, 0); -} - -/** - * gfs2_stuffed_write_end - Write end for stuffed files - * @inode: The inode - * @dibh: The buffer_head containing the on-disk inode - * @pos: The file position - * @len: The length of the write - * @copied: How much was actually copied by the VFS - * @page: The page - * - * This copies the data from the page into the inode block after - * the inode data structure itself. - * - * Returns: errno - */ -static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, - loff_t pos, unsigned len, unsigned copied, - struct page *page) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - u64 to = pos + copied; - void *kaddr; - unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - - BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buf + pos, kaddr + pos, copied); - memset(kaddr + pos + copied, 0, len - copied); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); - page_cache_release(page); - - if (copied) { - if (inode->i_size < to) { - i_size_write(inode, to); - ip->i_disksize = inode->i_size; - } - gfs2_dinode_out(ip, di); - mark_inode_dirty(inode); - } - - if (inode == sdp->sd_rindex) - adjust_fs_space(inode); - - brelse(dibh); - gfs2_trans_end(sdp); - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); - return copied; -} - -/** - * gfs2_write_end - * @file: The file to write to - * @mapping: The address space to write to - * @pos: The file position - * @len: The length of the data - * @copied: - * @page: The page that has been written - * @fsdata: The fsdata (unused in GFS2) - * - * The main write_end function for GFS2. We have a separate one for - * stuffed files as they are slightly different, otherwise we just - * put our locking around the VFS provided functions. - * - * Returns: errno - */ - -static int gfs2_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *dibh; - struct gfs2_alloc *al = ip->i_alloc; - unsigned int from = pos & (PAGE_CACHE_SIZE - 1); - unsigned int to = from + len; - int ret; - - BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL); - - ret = gfs2_meta_inode_buffer(ip, &dibh); - if (unlikely(ret)) { - unlock_page(page); - page_cache_release(page); - goto failed; - } - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - - if (gfs2_is_stuffed(ip)) - return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); - - if (!gfs2_is_writeback(ip)) - gfs2_page_add_databufs(ip, page, from, to); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (ret > 0) { - if (inode->i_size > ip->i_disksize) - ip->i_disksize = inode->i_size; - gfs2_dinode_out(ip, dibh->b_data); - mark_inode_dirty(inode); - } - - if (inode == sdp->sd_rindex) - adjust_fs_space(inode); - - brelse(dibh); - gfs2_trans_end(sdp); -failed: - if (al) { - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - gfs2_glock_dq(&ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); - return ret; -} - -/** - * gfs2_set_page_dirty - Page dirtying function - * @page: The page to dirty - * - * Returns: 1 if it dirtyed the page, or 0 otherwise - */ - -static int gfs2_set_page_dirty(struct page *page) -{ - SetPageChecked(page); - return __set_page_dirty_buffers(page); -} - -/** - * gfs2_bmap - Block map function - * @mapping: Address space info - * @lblock: The block to map - * - * Returns: The disk address for the block or 0 on hole or error - */ - -static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) -{ - struct gfs2_inode *ip = GFS2_I(mapping->host); - struct gfs2_holder i_gh; - sector_t dblock = 0; - int error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return 0; - - if (!gfs2_is_stuffed(ip)) - dblock = generic_block_bmap(mapping, lblock, gfs2_block_map); - - gfs2_glock_dq_uninit(&i_gh); - - return dblock; -} - -static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) -{ - struct gfs2_bufdata *bd; - - lock_buffer(bh); - gfs2_log_lock(sdp); - clear_buffer_dirty(bh); - bd = bh->b_private; - if (bd) { - if (!list_empty(&bd->bd_le.le_list) && !buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); - else - gfs2_remove_from_journal(bh, current->journal_info, 0); - } - bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - gfs2_log_unlock(sdp); - unlock_buffer(bh); -} - -static void gfs2_invalidatepage(struct page *page, unsigned long offset) -{ - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); - struct buffer_head *bh, *head; - unsigned long pos = 0; - - BUG_ON(!PageLocked(page)); - if (offset == 0) - ClearPageChecked(page); - if (!page_has_buffers(page)) - goto out; - - bh = head = page_buffers(page); - do { - if (offset <= pos) - gfs2_discard(sdp, bh); - pos += bh->b_size; - bh = bh->b_this_page; - } while (bh != head); -out: - if (offset == 0) - try_to_release_page(page, 0); -} - -/** - * gfs2_ok_for_dio - check that dio is valid on this file - * @ip: The inode - * @rw: READ or WRITE - * @offset: The offset at which we are reading or writing - * - * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) - * 1 (to accept the i/o request) - */ -static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) -{ - /* - * Should we return an error here? I can't see that O_DIRECT for - * a stuffed file makes any sense. For now we'll silently fall - * back to buffered I/O - */ - if (gfs2_is_stuffed(ip)) - return 0; - - if (offset >= i_size_read(&ip->i_inode)) - return 0; - return 1; -} - - - -static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int rv; - - /* - * Deferred lock, even if its a write, since we do no allocation - * on this path. All we need change is atime, and this lock mode - * ensures that other nodes have flushed their buffered read caches - * (i.e. their page cache entries for this inode). We do not, - * unfortunately have the option of only flushing a range like - * the VFS does. - */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); - rv = gfs2_glock_nq(&gh); - if (rv) - return rv; - rv = gfs2_ok_for_dio(ip, rw, offset); - if (rv != 1) - goto out; /* dio not valid, fall back to buffered i/o */ - - rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, - iov, offset, nr_segs, - gfs2_get_block_direct, NULL); -out: - gfs2_glock_dq_m(1, &gh); - gfs2_holder_uninit(&gh); - return rv; -} - -/** - * gfs2_releasepage - free the metadata associated with a page - * @page: the page that's being released - * @gfp_mask: passed from Linux VFS, ignored by us - * - * Call try_to_free_buffers() if the buffers in this page can be - * released. - * - * Returns: 0 - */ - -int gfs2_releasepage(struct page *page, gfp_t gfp_mask) -{ - struct inode *aspace = page->mapping->host; - struct gfs2_sbd *sdp = aspace->i_sb->s_fs_info; - struct buffer_head *bh, *head; - struct gfs2_bufdata *bd; - - if (!page_has_buffers(page)) - return 0; - - gfs2_log_lock(sdp); - head = bh = page_buffers(page); - do { - if (atomic_read(&bh->b_count)) - goto cannot_release; - bd = bh->b_private; - if (bd && bd->bd_ail) - goto cannot_release; - gfs2_assert_warn(sdp, !buffer_pinned(bh)); - gfs2_assert_warn(sdp, !buffer_dirty(bh)); - bh = bh->b_this_page; - } while(bh != head); - gfs2_log_unlock(sdp); - - head = bh = page_buffers(page); - do { - gfs2_log_lock(sdp); - bd = bh->b_private; - if (bd) { - gfs2_assert_warn(sdp, bd->bd_bh == bh); - gfs2_assert_warn(sdp, list_empty(&bd->bd_list_tr)); - if (!list_empty(&bd->bd_le.le_list)) { - if (!buffer_pinned(bh)) - list_del_init(&bd->bd_le.le_list); - else - bd = NULL; - } - if (bd) - bd->bd_bh = NULL; - bh->b_private = NULL; - } - gfs2_log_unlock(sdp); - if (bd) - kmem_cache_free(gfs2_bufdata_cachep, bd); - - bh = bh->b_this_page; - } while (bh != head); - - return try_to_free_buffers(page); -cannot_release: - gfs2_log_unlock(sdp); - return 0; -} - -static const struct address_space_operations gfs2_writeback_aops = { - .writepage = gfs2_writeback_writepage, - .writepages = gfs2_writeback_writepages, - .readpage = gfs2_readpage, - .readpages = gfs2_readpages, - .sync_page = block_sync_page, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, - .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, - .direct_IO = gfs2_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -static const struct address_space_operations gfs2_ordered_aops = { - .writepage = gfs2_ordered_writepage, - .readpage = gfs2_readpage, - .readpages = gfs2_readpages, - .sync_page = block_sync_page, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, - .set_page_dirty = gfs2_set_page_dirty, - .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, - .direct_IO = gfs2_direct_IO, - .migratepage = buffer_migrate_page, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -static const struct address_space_operations gfs2_jdata_aops = { - .writepage = gfs2_jdata_writepage, - .writepages = gfs2_jdata_writepages, - .readpage = gfs2_readpage, - .readpages = gfs2_readpages, - .sync_page = block_sync_page, - .write_begin = gfs2_write_begin, - .write_end = gfs2_write_end, - .set_page_dirty = gfs2_set_page_dirty, - .bmap = gfs2_bmap, - .invalidatepage = gfs2_invalidatepage, - .releasepage = gfs2_releasepage, - .is_partially_uptodate = block_is_partially_uptodate, -}; - -void gfs2_set_aops(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - if (gfs2_is_writeback(ip)) - inode->i_mapping->a_ops = &gfs2_writeback_aops; - else if (gfs2_is_ordered(ip)) - inode->i_mapping->a_ops = &gfs2_ordered_aops; - else if (gfs2_is_jdata(ip)) - inode->i_mapping->a_ops = &gfs2_jdata_aops; - else - BUG(); -} - diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h deleted file mode 100644 index 5da2128..0000000 --- a/fs/gfs2/ops_address.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_ADDRESS_DOT_H__ -#define __OPS_ADDRESS_DOT_H__ - -#include -#include -#include - -extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); -extern int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_aops(struct inode *inode); - -#endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c deleted file mode 100644 index 022c66c..0000000 --- a/fs/gfs2/ops_dentry.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "dir.h" -#include "glock.h" -#include "super.h" -#include "util.h" -#include "inode.h" - -/** - * gfs2_drevalidate - Check directory lookup consistency - * @dentry: the mapping to check - * @nd: - * - * Check to make sure the lookup necessary to arrive at this inode from its - * parent is still good. - * - * Returns: 1 if the dentry is ok, 0 if it isn't - */ - -static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *parent = dget_parent(dentry); - struct gfs2_sbd *sdp = GFS2_SB(parent->d_inode); - struct gfs2_inode *dip = GFS2_I(parent->d_inode); - struct inode *inode = dentry->d_inode; - struct gfs2_holder d_gh; - struct gfs2_inode *ip = NULL; - int error; - int had_lock = 0; - - if (inode) { - if (is_bad_inode(inode)) - goto invalid; - ip = GFS2_I(inode); - } - - if (sdp->sd_args.ar_localcaching) - goto valid; - - had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); - if (!had_lock) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - if (error) - goto fail; - } - - error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); - switch (error) { - case 0: - if (!inode) - goto invalid_gunlock; - break; - case -ENOENT: - if (!inode) - goto valid_gunlock; - goto invalid_gunlock; - default: - goto fail_gunlock; - } - -valid_gunlock: - if (!had_lock) - gfs2_glock_dq_uninit(&d_gh); -valid: - dput(parent); - return 1; - -invalid_gunlock: - if (!had_lock) - gfs2_glock_dq_uninit(&d_gh); -invalid: - if (inode && S_ISDIR(inode->i_mode)) { - if (have_submounts(dentry)) - goto valid; - shrink_dcache_parent(dentry); - } - d_drop(dentry); - dput(parent); - return 0; - -fail_gunlock: - gfs2_glock_dq_uninit(&d_gh); -fail: - dput(parent); - return 0; -} - -static int gfs2_dhash(struct dentry *dentry, struct qstr *str) -{ - str->hash = gfs2_disk_hash(str->name, str->len); - return 0; -} - -const struct dentry_operations gfs2_dops = { - .d_revalidate = gfs2_drevalidate, - .d_hash = gfs2_dhash, -}; - diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c deleted file mode 100644 index 9200ef2..0000000 --- a/fs/gfs2/ops_export.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "dir.h" -#include "glock.h" -#include "glops.h" -#include "inode.h" -#include "super.h" -#include "rgrp.h" -#include "util.h" - -#define GFS2_SMALL_FH_SIZE 4 -#define GFS2_LARGE_FH_SIZE 8 -#define GFS2_OLD_FH_SIZE 10 - -static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, - int connectable) -{ - __be32 *fh = (__force __be32 *)p; - struct inode *inode = dentry->d_inode; - struct super_block *sb = inode->i_sb; - struct gfs2_inode *ip = GFS2_I(inode); - - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) - return 255; - - fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); - fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); - fh[2] = cpu_to_be32(ip->i_no_addr >> 32); - fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); - *len = GFS2_SMALL_FH_SIZE; - - if (!connectable || inode == sb->s_root->d_inode) - return *len; - - spin_lock(&dentry->d_lock); - inode = dentry->d_parent->d_inode; - ip = GFS2_I(inode); - igrab(inode); - spin_unlock(&dentry->d_lock); - - fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); - fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); - fh[6] = cpu_to_be32(ip->i_no_addr >> 32); - fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); - *len = GFS2_LARGE_FH_SIZE; - - iput(inode); - - return *len; -} - -struct get_name_filldir { - struct gfs2_inum_host inum; - char *name; -}; - -static int get_name_filldir(void *opaque, const char *name, int length, - loff_t offset, u64 inum, unsigned int type) -{ - struct get_name_filldir *gnfd = opaque; - - if (inum != gnfd->inum.no_addr) - return 0; - - memcpy(gnfd->name, name, length); - gnfd->name[length] = 0; - - return 1; -} - -static int gfs2_get_name(struct dentry *parent, char *name, - struct dentry *child) -{ - struct inode *dir = parent->d_inode; - struct inode *inode = child->d_inode; - struct gfs2_inode *dip, *ip; - struct get_name_filldir gnfd; - struct gfs2_holder gh; - u64 offset = 0; - int error; - - if (!dir) - return -EINVAL; - - if (!S_ISDIR(dir->i_mode) || !inode) - return -EINVAL; - - dip = GFS2_I(dir); - ip = GFS2_I(inode); - - *name = 0; - gnfd.inum.no_addr = ip->i_no_addr; - gnfd.inum.no_formal_ino = ip->i_no_formal_ino; - gnfd.name = name; - - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); - if (error) - return error; - - error = gfs2_dir_read(dir, &offset, &gnfd, get_name_filldir); - - gfs2_glock_dq_uninit(&gh); - - if (!error && !*name) - error = -ENOENT; - - return error; -} - -static struct dentry *gfs2_get_parent(struct dentry *child) -{ - struct qstr dotdot; - struct dentry *dentry; - - /* - * XXX(hch): it would be a good idea to keep this around as a - * static variable. - */ - gfs2_str2qstr(&dotdot, ".."); - - dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1)); - if (!IS_ERR(dentry)) - dentry->d_op = &gfs2_dops; - return dentry; -} - -static struct dentry *gfs2_get_dentry(struct super_block *sb, - struct gfs2_inum_host *inum) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh, ri_gh, rgd_gh; - struct gfs2_rgrpd *rgd; - struct inode *inode; - struct dentry *dentry; - int error; - - /* System files? */ - - inode = gfs2_ilookup(sb, inum->no_addr); - if (inode) { - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - return ERR_PTR(-ESTALE); - } - goto out_inode; - } - - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto fail; - - error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); - if (!rgd) - goto fail_rindex; - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); - if (error) - goto fail_rindex; - - error = -ESTALE; - if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) - goto fail_rgd; - - gfs2_glock_dq_uninit(&rgd_gh); - gfs2_glock_dq_uninit(&ri_gh); - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, - inum->no_addr, - 0, 0); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); - -out_inode: - dentry = d_obtain_alias(inode); - if (!IS_ERR(dentry)) - dentry->d_op = &gfs2_dops; - return dentry; - -fail_rgd: - gfs2_glock_dq_uninit(&rgd_gh); - -fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); - -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); -} - -static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct gfs2_inum_host this; - __be32 *fh = (__force __be32 *)fid->raw; - - switch (fh_type) { - case GFS2_SMALL_FH_SIZE: - case GFS2_LARGE_FH_SIZE: - case GFS2_OLD_FH_SIZE: - this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; - this.no_formal_ino |= be32_to_cpu(fh[1]); - this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; - this.no_addr |= be32_to_cpu(fh[3]); - return gfs2_get_dentry(sb, &this); - default: - return NULL; - } -} - -static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct gfs2_inum_host parent; - __be32 *fh = (__force __be32 *)fid->raw; - - switch (fh_type) { - case GFS2_LARGE_FH_SIZE: - case GFS2_OLD_FH_SIZE: - parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; - parent.no_formal_ino |= be32_to_cpu(fh[5]); - parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; - parent.no_addr |= be32_to_cpu(fh[7]); - return gfs2_get_dentry(sb, &parent); - default: - return NULL; - } -} - -const struct export_operations gfs2_export_ops = { - .encode_fh = gfs2_encode_fh, - .fh_to_dentry = gfs2_fh_to_dentry, - .fh_to_parent = gfs2_fh_to_parent, - .get_name = gfs2_get_name, - .get_parent = gfs2_get_parent, -}; - diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c deleted file mode 100644 index 0ee7bd2..0000000 --- a/fs/gfs2/ops_file.c +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "bmap.h" -#include "dir.h" -#include "glock.h" -#include "glops.h" -#include "inode.h" -#include "log.h" -#include "meta_io.h" -#include "quota.h" -#include "rgrp.h" -#include "trans.h" -#include "util.h" -#include "eaops.h" -#include "ops_address.h" - -/** - * gfs2_llseek - seek to a location in a file - * @file: the file - * @offset: the offset - * @origin: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) - * - * SEEK_END requires the glock for the file because it references the - * file's size. - * - * Returns: The new offset, or errno - */ - -static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - loff_t error; - - if (origin == 2) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, - &i_gh); - if (!error) { - error = generic_file_llseek_unlocked(file, offset, origin); - gfs2_glock_dq_uninit(&i_gh); - } - } else - error = generic_file_llseek_unlocked(file, offset, origin); - - return error; -} - -/** - * gfs2_readdir - Read directory entries from a directory - * @file: The directory to read from - * @dirent: Buffer for dirents - * @filldir: Function used to do the copying - * - * Returns: errno - */ - -static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) -{ - struct inode *dir = file->f_mapping->host; - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_holder d_gh; - u64 offset = file->f_pos; - int error; - - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); - error = gfs2_glock_nq(&d_gh); - if (error) { - gfs2_holder_uninit(&d_gh); - return error; - } - - error = gfs2_dir_read(dir, &offset, dirent, filldir); - - gfs2_glock_dq_uninit(&d_gh); - - file->f_pos = offset; - - return error; -} - -/** - * fsflags_cvt - * @table: A table of 32 u32 flags - * @val: a 32 bit value to convert - * - * This function can be used to convert between fsflags values and - * GFS2's own flags values. - * - * Returns: the converted flags - */ -static u32 fsflags_cvt(const u32 *table, u32 val) -{ - u32 res = 0; - while(val) { - if (val & 1) - res |= *table; - table++; - val >>= 1; - } - return res; -} - -static const u32 fsflags_to_gfs2[32] = { - [3] = GFS2_DIF_SYNC, - [4] = GFS2_DIF_IMMUTABLE, - [5] = GFS2_DIF_APPENDONLY, - [7] = GFS2_DIF_NOATIME, - [12] = GFS2_DIF_EXHASH, - [14] = GFS2_DIF_INHERIT_JDATA, -}; - -static const u32 gfs2_to_fsflags[32] = { - [gfs2fl_Sync] = FS_SYNC_FL, - [gfs2fl_Immutable] = FS_IMMUTABLE_FL, - [gfs2fl_AppendOnly] = FS_APPEND_FL, - [gfs2fl_NoAtime] = FS_NOATIME_FL, - [gfs2fl_ExHash] = FS_INDEX_FL, - [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, -}; - -static int gfs2_get_flags(struct file *filp, u32 __user *ptr) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - u32 fsflags; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - error = gfs2_glock_nq(&gh); - if (error) - return error; - - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); - if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) - fsflags |= FS_JOURNAL_DATA_FL; - if (put_user(fsflags, ptr)) - error = -EFAULT; - - gfs2_glock_dq(&gh); - gfs2_holder_uninit(&gh); - return error; -} - -void gfs2_set_inode_flags(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - unsigned int flags = inode->i_flags; - - flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) - flags |= S_IMMUTABLE; - if (ip->i_diskflags & GFS2_DIF_APPENDONLY) - flags |= S_APPEND; - if (ip->i_diskflags & GFS2_DIF_NOATIME) - flags |= S_NOATIME; - if (ip->i_diskflags & GFS2_DIF_SYNC) - flags |= S_SYNC; - inode->i_flags = flags; -} - -/* Flags that can be set by user space */ -#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ - GFS2_DIF_IMMUTABLE| \ - GFS2_DIF_APPENDONLY| \ - GFS2_DIF_NOATIME| \ - GFS2_DIF_SYNC| \ - GFS2_DIF_SYSTEM| \ - GFS2_DIF_INHERIT_JDATA) - -/** - * gfs2_set_flags - set flags on an inode - * @inode: The inode - * @flags: The flags to set - * @mask: Indicates which flags are valid - * - */ -static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct buffer_head *bh; - struct gfs2_holder gh; - int error; - u32 new_flags, flags; - - error = mnt_want_write(filp->f_path.mnt); - if (error) - return error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - goto out_drop_write; - - flags = ip->i_diskflags; - new_flags = (flags & ~mask) | (reqflags & mask); - if ((new_flags ^ flags) == 0) - goto out; - - error = -EINVAL; - if ((new_flags ^ flags) & ~GFS2_FLAGS_USER_SET) - goto out; - - error = -EPERM; - if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) - goto out; - if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) - goto out; - if (((new_flags ^ flags) & GFS2_DIF_IMMUTABLE) && - !capable(CAP_LINUX_IMMUTABLE)) - goto out; - if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); - if (error) - goto out; - } - if ((flags ^ new_flags) & GFS2_DIF_JDATA) { - if (flags & GFS2_DIF_JDATA) - gfs2_log_flush(sdp, ip->i_gl); - error = filemap_fdatawrite(inode->i_mapping); - if (error) - goto out; - error = filemap_fdatawait(inode->i_mapping); - if (error) - goto out; - } - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - goto out; - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_trans_end; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_diskflags = new_flags; - gfs2_dinode_out(ip, bh->b_data); - brelse(bh); - gfs2_set_inode_flags(inode); - gfs2_set_aops(inode); -out_trans_end: - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); -out_drop_write: - mnt_drop_write(filp->f_path.mnt); - return error; -} - -static int gfs2_set_flags(struct file *filp, u32 __user *ptr) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - u32 fsflags, gfsflags; - if (get_user(fsflags, ptr)) - return -EFAULT; - gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); - if (!S_ISDIR(inode->i_mode)) { - if (gfsflags & GFS2_DIF_INHERIT_JDATA) - gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~0); - } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); -} - -static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) -{ - switch(cmd) { - case FS_IOC_GETFLAGS: - return gfs2_get_flags(filp, (u32 __user *)arg); - case FS_IOC_SETFLAGS: - return gfs2_set_flags(filp, (u32 __user *)arg); - } - return -ENOTTY; -} - -/** - * gfs2_allocate_page_backing - Use bmap to allocate blocks - * @page: The (locked) page to allocate backing for - * - * We try to allocate all the blocks required for the page in - * one go. This might fail for various reasons, so we keep - * trying until all the blocks to back this page are allocated. - * If some of the blocks are already allocated, thats ok too. - */ - -static int gfs2_allocate_page_backing(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct buffer_head bh; - unsigned long size = PAGE_CACHE_SIZE; - u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - - do { - bh.b_state = 0; - bh.b_size = size; - gfs2_block_map(inode, lblock, &bh, 1); - if (!buffer_mapped(&bh)) - return -EIO; - size -= bh.b_size; - lblock += (bh.b_size >> inode->i_blkbits); - } while(size > 0); - return 0; -} - -/** - * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable - * @vma: The virtual memory area - * @page: The page which is about to become writable - * - * When the page becomes writable, we need to ensure that we have - * blocks allocated on disk to back that page. - */ - -static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct page *page = vmf->page; - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - unsigned long last_index; - u64 pos = page->index << PAGE_CACHE_SHIFT; - unsigned int data_blocks, ind_blocks, rblocks; - int alloc_required = 0; - struct gfs2_holder gh; - struct gfs2_alloc *al; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret) - goto out; - - set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); - set_bit(GIF_SW_PAGED, &ip->i_flags); - - ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); - if (ret || !alloc_required) - goto out_unlock; - ret = -ENOMEM; - al = gfs2_alloc_get(ip); - if (al == NULL) - goto out_unlock; - - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_alloc_put; - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); - al->al_requested = data_blocks + ind_blocks; - ret = gfs2_inplace_reserve(ip); - if (ret) - goto out_quota_unlock; - - rblocks = RES_DINODE + ind_blocks; - if (gfs2_is_jdata(ip)) - rblocks += data_blocks ? data_blocks : 1; - if (ind_blocks || data_blocks) - rblocks += RES_STATFS + RES_QUOTA; - ret = gfs2_trans_begin(sdp, rblocks, 0); - if (ret) - goto out_trans_fail; - - lock_page(page); - ret = -EINVAL; - last_index = ip->i_inode.i_size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) - goto out_unlock_page; - ret = 0; - if (!PageUptodate(page) || page->mapping != ip->i_inode.i_mapping) - goto out_unlock_page; - if (gfs2_is_stuffed(ip)) { - ret = gfs2_unstuff_dinode(ip, page); - if (ret) - goto out_unlock_page; - } - ret = gfs2_allocate_page_backing(page); - -out_unlock_page: - unlock_page(page); - gfs2_trans_end(sdp); -out_trans_fail: - gfs2_inplace_release(ip); -out_quota_unlock: - gfs2_quota_unlock(ip); -out_alloc_put: - gfs2_alloc_put(ip); -out_unlock: - gfs2_glock_dq(&gh); -out: - gfs2_holder_uninit(&gh); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else if (ret) - ret = VM_FAULT_SIGBUS; - return ret; -} - -static struct vm_operations_struct gfs2_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = gfs2_page_mkwrite, -}; - -/** - * gfs2_mmap - - * @file: The file to map - * @vma: The VMA which described the mapping - * - * There is no need to get a lock here unless we should be updating - * atime. We ignore any locking errors since the only consequence is - * a missed atime update (which will just be deferred until later). - * - * Returns: 0 - */ - -static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - - if (!(file->f_flags & O_NOATIME)) { - struct gfs2_holder i_gh; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - file_accessed(file); - if (error == 0) - gfs2_glock_dq_uninit(&i_gh); - } - vma->vm_ops = &gfs2_vm_ops; - vma->vm_flags |= VM_CAN_NONLINEAR; - - return 0; -} - -/** - * gfs2_open - open a file - * @inode: the inode to open - * @file: the struct file for this opening - * - * Returns: errno - */ - -static int gfs2_open(struct inode *inode, struct file *file) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; - struct gfs2_file *fp; - int error; - - fp = kzalloc(sizeof(struct gfs2_file), GFP_KERNEL); - if (!fp) - return -ENOMEM; - - mutex_init(&fp->f_fl_mutex); - - gfs2_assert_warn(GFS2_SB(inode), !file->private_data); - file->private_data = fp; - - if (S_ISREG(ip->i_inode.i_mode)) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, - &i_gh); - if (error) - goto fail; - - if (!(file->f_flags & O_LARGEFILE) && - ip->i_disksize > MAX_NON_LFS) { - error = -EOVERFLOW; - goto fail_gunlock; - } - - gfs2_glock_dq_uninit(&i_gh); - } - - return 0; - -fail_gunlock: - gfs2_glock_dq_uninit(&i_gh); -fail: - file->private_data = NULL; - kfree(fp); - return error; -} - -/** - * gfs2_close - called to close a struct file - * @inode: the inode the struct file belongs to - * @file: the struct file being closed - * - * Returns: errno - */ - -static int gfs2_close(struct inode *inode, struct file *file) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_file *fp; - - fp = file->private_data; - file->private_data = NULL; - - if (gfs2_assert_warn(sdp, fp)) - return -EIO; - - kfree(fp); - - return 0; -} - -/** - * gfs2_fsync - sync the dirty data for a file (across the cluster) - * @file: the file that points to the dentry (we ignore this) - * @dentry: the dentry that points to the inode to sync - * - * The VFS will flush "normal" data for us. We only need to worry - * about metadata here. For journaled data, we just do a log flush - * as we can't avoid it. Otherwise we can just bale out if datasync - * is set. For stuffed inodes we must flush the log in order to - * ensure that all data is on disk. - * - * The call to write_inode_now() is there to write back metadata and - * the inode itself. It does also try and write the data, but thats - * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite() - * for us. - * - * Returns: errno - */ - -static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); - int ret = 0; - - if (gfs2_is_jdata(GFS2_I(inode))) { - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - return 0; - } - - if (sync_state != 0) { - if (!datasync) - ret = write_inode_now(inode, 0); - - if (gfs2_is_stuffed(GFS2_I(inode))) - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - } - - return ret; -} - -#ifdef CONFIG_GFS2_FS_LOCKING_DLM - -/** - * gfs2_setlease - acquire/release a file lease - * @file: the file pointer - * @arg: lease type - * @fl: file lock - * - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - * - * Returns: errno - */ - -static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) -{ - return -EINVAL; -} - -/** - * gfs2_lock - acquire/release a posix lock on a file - * @file: the file pointer - * @cmd: either modify or retrieve lock state, possibly wait - * @fl: type and range of lock - * - * Returns: errno - */ - -static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - - if (!(fl->fl_flags & FL_POSIX)) - return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; - - if (cmd == F_CANCELLK) { - /* Hack: */ - cmd = F_SETLK; - fl->fl_type = F_UNLCK; - } - if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) - return -EIO; - if (IS_GETLK(cmd)) - return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); - else if (fl->fl_type == F_UNLCK) - return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); - else - return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); -} - -static int do_flock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_file *fp = file->private_data; - struct gfs2_holder *fl_gh = &fp->f_fl_gh; - struct gfs2_inode *ip = GFS2_I(file->f_path.dentry->d_inode); - struct gfs2_glock *gl; - unsigned int state; - int flags; - int error = 0; - - state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE; - - mutex_lock(&fp->f_fl_mutex); - - gl = fl_gh->gh_gl; - if (gl) { - if (fl_gh->gh_state == state) - goto out; - flock_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); - gfs2_glock_dq_wait(fl_gh); - gfs2_holder_reinit(state, flags, fl_gh); - } else { - error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, - &gfs2_flock_glops, CREATE, &gl); - if (error) - goto out; - gfs2_holder_init(gl, state, flags, fl_gh); - gfs2_glock_put(gl); - } - error = gfs2_glock_nq(fl_gh); - if (error) { - gfs2_holder_uninit(fl_gh); - if (error == GLR_TRYFAILED) - error = -EAGAIN; - } else { - error = flock_lock_file_wait(file, fl); - gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); - } - -out: - mutex_unlock(&fp->f_fl_mutex); - return error; -} - -static void do_unflock(struct file *file, struct file_lock *fl) -{ - struct gfs2_file *fp = file->private_data; - struct gfs2_holder *fl_gh = &fp->f_fl_gh; - - mutex_lock(&fp->f_fl_mutex); - flock_lock_file_wait(file, fl); - if (fl_gh->gh_gl) - gfs2_glock_dq_uninit(fl_gh); - mutex_unlock(&fp->f_fl_mutex); -} - -/** - * gfs2_flock - acquire/release a flock lock on a file - * @file: the file pointer - * @cmd: either modify or retrieve lock state, possibly wait - * @fl: type and range of lock - * - * Returns: errno - */ - -static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) -{ - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - - if (!(fl->fl_flags & FL_FLOCK)) - return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; - - if (fl->fl_type == F_UNLCK) { - do_unflock(file, fl); - return 0; - } else { - return do_flock(file, cmd, fl); - } -} - -const struct file_operations gfs2_file_fops = { - .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .unlocked_ioctl = gfs2_ioctl, - .mmap = gfs2_mmap, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .lock = gfs2_lock, - .flock = gfs2_flock, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .setlease = gfs2_setlease, -}; - -const struct file_operations gfs2_dir_fops = { - .readdir = gfs2_readdir, - .unlocked_ioctl = gfs2_ioctl, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .lock = gfs2_lock, - .flock = gfs2_flock, -}; - -#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ - -const struct file_operations gfs2_file_fops_nolock = { - .llseek = gfs2_llseek, - .read = do_sync_read, - .aio_read = generic_file_aio_read, - .write = do_sync_write, - .aio_write = generic_file_aio_write, - .unlocked_ioctl = gfs2_ioctl, - .mmap = gfs2_mmap, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, - .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, - .setlease = generic_setlease, -}; - -const struct file_operations gfs2_dir_fops_nolock = { - .readdir = gfs2_readdir, - .unlocked_ioctl = gfs2_ioctl, - .open = gfs2_open, - .release = gfs2_close, - .fsync = gfs2_fsync, -}; - diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 152e6c4..2e9b932 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -60,7 +60,6 @@ #include "super.h" #include "trans.h" #include "inode.h" -#include "ops_address.h" #include "util.h" #define QUOTA_USER 1 diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ee3d5c1..6122c7e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -29,7 +29,6 @@ #include "util.h" #include "log.h" #include "inode.h" -#include "ops_address.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) -- cgit v1.1 From 9e6e0a128bca0a151d8d3fbd9459b22fc21cfebb Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:36:01 +0100 Subject: GFS2: Merge mount.c and ops_super.c into super.c mount.c only contained a single function, so is not really worth retaining on its own. All of the super related code is now either in super.c or ops_fstype.c Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 4 +- fs/gfs2/mount.c | 195 ------------ fs/gfs2/ops_super.c | 757 ------------------------------------------- fs/gfs2/super.c | 903 +++++++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 903 insertions(+), 956 deletions(-) delete mode 100644 fs/gfs2/mount.c delete mode 100644 fs/gfs2/ops_super.c (limited to 'fs') diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 4f7332c..d53a9be 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,8 +1,8 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ - mount.o aops.o dentry.o export.o file.o \ - ops_fstype.o ops_inode.o ops_super.o quota.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o ops_inode.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c deleted file mode 100644 index 947af15..0000000 --- a/fs/gfs2/mount.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "super.h" -#include "sys.h" -#include "util.h" - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @sdp: - * @data: - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - args->ar_ignore_local_fs = 1; - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - args->ar_localcaching = 1; - break; - case Opt_debug: - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - args->ar_upgrade = 1; - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - fs_info(sdp, "commit mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err: - default: - fs_info(sdp, "invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c deleted file mode 100644 index 2fd1dcb..0000000 --- a/fs/gfs2/ops_super.c +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "glock.h" -#include "inode.h" -#include "log.h" -#include "quota.h" -#include "recovery.h" -#include "rgrp.h" -#include "super.h" -#include "sys.h" -#include "util.h" -#include "trans.h" -#include "dir.h" -#include "eattr.h" -#include "bmap.h" -#include "meta_io.h" - -#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) - -/** - * gfs2_write_inode - Make sure the inode is stable on the disk - * @inode: The inode - * @sync: synchronous write flag - * - * Returns: errno - */ - -static int gfs2_write_inode(struct inode *inode, int sync) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = 0; - - /* Check this is a "normal" inode, etc */ - if (!test_bit(GIF_USER, &ip->i_flags) || - (current->flags & PF_MEMALLOC)) - return 0; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; - ret = gfs2_meta_inode_buffer(ip, &bh); - if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - } - brelse(bh); - } - gfs2_trans_end(sdp); -do_unlock: - gfs2_glock_dq_uninit(&gh); -do_flush: - if (sync != 0) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - return ret; -} - -/** - * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one - * @sdp: the filesystem - * - * Returns: errno - */ - -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) -{ - struct gfs2_holder t_gh; - int error; - - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); - - gfs2_quota_cleanup(sdp); - - return error; -} - -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * gfs2_put_super - Unmount the filesystem - * @sb: The VFS superblock - * - */ - -static void gfs2_put_super(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - struct gfs2_jdesc *jd; - - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - - /* No more recovery requests */ - set_bit(SDF_NORECOVERY, &sdp->sd_flags); - smp_mb(); - - /* Wait on outstanding recovery */ -restart: - spin_lock(&sdp->sd_jindex_spin); - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) - continue; - spin_unlock(&sdp->sd_jindex_spin); - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); - goto restart; - } - spin_unlock(&sdp->sd_jindex_spin); - - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - - if (!(sb->s_flags & MS_RDONLY)) { - error = gfs2_make_fs_ro(sdp); - if (error) - gfs2_io_error(sdp); - } - /* At this point, we're through modifying the disk */ - - /* Release stuff */ - - iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); - iput(sdp->sd_statfs_inode); - iput(sdp->sd_rindex); - iput(sdp->sd_quota_inode); - - gfs2_glock_put(sdp->sd_rename_gl); - gfs2_glock_put(sdp->sd_trans_gl); - - if (!sdp->sd_args.ar_spectator) { - gfs2_glock_dq_uninit(&sdp->sd_journal_gh); - gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); - gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); - iput(sdp->sd_sc_inode); - iput(sdp->sd_qc_inode); - } - - gfs2_glock_dq_uninit(&sdp->sd_live_gh); - gfs2_clear_rgrpd(sdp); - gfs2_jindex_free(sdp); - /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp); - /* Unmount the locking protocol */ - gfs2_lm_unmount(sdp); - - /* At this point, we're through participating in the lockspace */ - gfs2_sys_fs_del(sdp); -} - -/** - * gfs2_write_super - * @sb: the superblock - * - */ - -static void gfs2_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - -/** - * gfs2_sync_fs - sync the filesystem - * @sb: the superblock - * - * Flushes the log to disk. - */ - -static int gfs2_sync_fs(struct super_block *sb, int wait) -{ - sb->s_dirt = 0; - if (wait && sb->s_fs_info) - gfs2_log_flush(sb->s_fs_info, NULL); - return 0; -} - -/** - * gfs2_freeze - prevent further writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static int gfs2_freeze(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - - if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return -EINVAL; - - for (;;) { - error = gfs2_freeze_fs(sdp); - if (!error) - break; - - switch (error) { - case -EBUSY: - fs_err(sdp, "waiting for recovery before freeze\n"); - break; - - default: - fs_err(sdp, "error freezing FS: %d\n", error); - break; - } - - fs_err(sdp, "retrying...\n"); - msleep(1000); - } - return 0; -} - -/** - * gfs2_unfreeze - reallow writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static int gfs2_unfreeze(struct super_block *sb) -{ - gfs2_unfreeze_fs(sb->s_fs_info); - return 0; -} - -/** - * statfs_fill - fill in the sg for a given RG - * @rgd: the RG - * @sc: the sc structure - * - * Returns: 0 on success, -ESTALE if the LVB is invalid - */ - -static int statfs_slow_fill(struct gfs2_rgrpd *rgd, - struct gfs2_statfs_change_host *sc) -{ - gfs2_rgrp_verify(rgd); - sc->sc_total += rgd->rd_data; - sc->sc_free += rgd->rd_free; - sc->sc_dinodes += rgd->rd_dinodes; - return 0; -} - -/** - * gfs2_statfs_slow - Stat a filesystem using asynchronous locking - * @sdp: the filesystem - * @sc: the sc info that will be returned - * - * Any error (other than a signal) will cause this routine to fall back - * to the synchronous version. - * - * FIXME: This really shouldn't busy wait like this. - * - * Returns: errno - */ - -static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_holder ri_gh; - struct gfs2_rgrpd *rgd_next; - struct gfs2_holder *gha, *gh; - unsigned int slots = 64; - unsigned int x; - int done; - int error = 0, err; - - memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); - if (!gha) - return -ENOMEM; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - - rgd_next = gfs2_rgrpd_get_first(sdp); - - for (;;) { - done = 1; - - for (x = 0; x < slots; x++) { - gh = gha + x; - - if (gh->gh_gl && gfs2_glock_poll(gh)) { - err = gfs2_glock_wait(gh); - if (err) { - gfs2_holder_uninit(gh); - error = err; - } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); - gfs2_glock_dq_uninit(gh); - } - } - - if (gh->gh_gl) - done = 0; - else if (rgd_next && !error) { - error = gfs2_glock_nq_init(rgd_next->rd_gl, - LM_ST_SHARED, - GL_ASYNC, - gh); - rgd_next = gfs2_rgrpd_get_next(rgd_next); - done = 0; - } - - if (signal_pending(current)) - error = -ERESTARTSYS; - } - - if (done) - break; - - yield(); - } - - gfs2_glock_dq_uninit(&ri_gh); - -out: - kfree(gha); - return error; -} - -/** - * gfs2_statfs_i - Do a statfs - * @sdp: the filesystem - * @sg: the sg structure - * - * Returns: errno - */ - -static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - - spin_lock(&sdp->sd_statfs_spin); - - *sc = *m_sc; - sc->sc_total += l_sc->sc_total; - sc->sc_free += l_sc->sc_free; - sc->sc_dinodes += l_sc->sc_dinodes; - - spin_unlock(&sdp->sd_statfs_spin); - - if (sc->sc_free < 0) - sc->sc_free = 0; - if (sc->sc_free > sc->sc_total) - sc->sc_free = sc->sc_total; - if (sc->sc_dinodes < 0) - sc->sc_dinodes = 0; - - return 0; -} - -/** - * gfs2_statfs - Gather and return stats about the filesystem - * @sb: The superblock - * @statfsbuf: The buffer - * - * Returns: 0 on success or error code - */ - -static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_inode->i_sb; - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_statfs_change_host sc; - int error; - - if (gfs2_tune_get(sdp, gt_statfs_slow)) - error = gfs2_statfs_slow(sdp, &sc); - else - error = gfs2_statfs_i(sdp, &sc); - - if (error) - return error; - - buf->f_type = GFS2_MAGIC; - buf->f_bsize = sdp->sd_sb.sb_bsize; - buf->f_blocks = sc.sc_total; - buf->f_bfree = sc.sc_free; - buf->f_bavail = sc.sc_free; - buf->f_files = sc.sc_dinodes + sc.sc_free; - buf->f_ffree = sc.sc_free; - buf->f_namelen = GFS2_FNAMESIZE; - - return 0; -} - -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - spin_lock(>->gt_spin); - args.ar_commit = gt->gt_log_flush_secs; - spin_unlock(>->gt_spin); - error = gfs2_mount_args(sdp, &args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, ignore_local_fs) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, localcaching) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; - - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - if (error) - return error; - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; - else - sb->s_flags &= ~MS_POSIXACL; - spin_lock(>->gt_spin); - gt->gt_log_flush_secs = args.ar_commit; - spin_unlock(>->gt_spin); - - return 0; -} - -/** - * gfs2_drop_inode - Drop an inode (test for remote unlink) - * @inode: The inode to drop - * - * If we've received a callback on an iopen lock then its because a - * remote node tried to deallocate the inode but failed due to this node - * still having the inode open. Here we mark the link count zero - * since we know that it must have reached zero if the GLF_DEMOTE flag - * is set on the iopen glock. If we didn't do a disk read since the - * remote node removed the final link then we might otherwise miss - * this event. This check ensures that this node will deallocate the - * inode's blocks, or alternatively pass the baton on to another - * node for later deallocation. - */ - -static void gfs2_drop_inode(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { - struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) - clear_nlink(inode); - } - generic_drop_inode(inode); -} - -/** - * gfs2_clear_inode - Deallocate an inode when VFS is done with it - * @inode: The VFS inode - * - */ - -static void gfs2_clear_inode(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - /* This tells us its a "real" inode and not one which only - * serves to contain an address space (see rgrp.c, meta_io.c) - * which therefore doesn't have its own glocks. - */ - if (test_bit(GIF_USER, &ip->i_flags)) { - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); - ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - } - } -} - -static int is_ancestor(const struct dentry *d1, const struct dentry *d2) -{ - do { - if (d1 == d2) - return 1; - d1 = d1->d_parent; - } while (!IS_ROOT(d1)); - return 0; -} - -/** - * gfs2_show_options - Show mount options for /proc/mounts - * @s: seq_file structure - * @mnt: vfsmount - * - * Returns: 0 on success or error code - */ - -static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) -{ - struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; - struct gfs2_args *args = &sdp->sd_args; - int lfsecs; - - if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) - seq_printf(s, ",meta"); - if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); - if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); - if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); - if (args->ar_spectator) - seq_printf(s, ",spectator"); - if (args->ar_ignore_local_fs) - seq_printf(s, ",ignore_local_fs"); - if (args->ar_localflocks) - seq_printf(s, ",localflocks"); - if (args->ar_localcaching) - seq_printf(s, ",localcaching"); - if (args->ar_debug) - seq_printf(s, ",debug"); - if (args->ar_upgrade) - seq_printf(s, ",upgrade"); - if (args->ar_posix_acl) - seq_printf(s, ",acl"); - if (args->ar_quota != GFS2_QUOTA_DEFAULT) { - char *state; - switch (args->ar_quota) { - case GFS2_QUOTA_OFF: - state = "off"; - break; - case GFS2_QUOTA_ACCOUNT: - state = "account"; - break; - case GFS2_QUOTA_ON: - state = "on"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",quota=%s", state); - } - if (args->ar_suiddir) - seq_printf(s, ",suiddir"); - if (args->ar_data != GFS2_DATA_DEFAULT) { - char *state; - switch (args->ar_data) { - case GFS2_DATA_WRITEBACK: - state = "writeback"; - break; - case GFS2_DATA_ORDERED: - state = "ordered"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",data=%s", state); - } - if (args->ar_discard) - seq_printf(s, ",discard"); - lfsecs = sdp->sd_tune.gt_log_flush_secs; - if (lfsecs != 60) - seq_printf(s, ",commit=%d", lfsecs); - return 0; -} - -/* - * We have to (at the moment) hold the inodes main lock to cover - * the gap between unlocking the shared lock on the iopen lock and - * taking the exclusive lock. I'd rather do a shared -> exclusive - * conversion on the iopen lock, but we can change that later. This - * is safe, just less efficient. - */ - -static void gfs2_delete_inode(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - - if (!test_bit(GIF_USER, &ip->i_flags)) - goto out; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) { - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; - } - - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) - goto out_truncate; - - if (S_ISDIR(inode->i_mode) && - (ip->i_diskflags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) - goto out_unlock; - } - - if (ip->i_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) - goto out_unlock; - } - - if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) - goto out_unlock; - } - - error = gfs2_dinode_dealloc(ip); - if (error) - goto out_unlock; - -out_truncate: - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - goto out_unlock; - /* Needs to be done before glock release & also in a transaction */ - truncate_inode_pages(&inode->i_data, 0); - gfs2_trans_end(sdp); - -out_unlock: - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) - gfs2_glock_dq(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); - gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED && error != -EROFS) - fs_warn(sdp, "gfs2_delete_inode: %d\n", error); -out: - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); -} - -static struct inode *gfs2_alloc_inode(struct super_block *sb) -{ - struct gfs2_inode *ip; - - ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); - if (ip) { - ip->i_flags = 0; - ip->i_gl = NULL; - } - return &ip->i_inode; -} - -static void gfs2_destroy_inode(struct inode *inode) -{ - kmem_cache_free(gfs2_inode_cachep, inode); -} - -const struct super_operations gfs2_super_ops = { - .alloc_inode = gfs2_alloc_inode, - .destroy_inode = gfs2_destroy_inode, - .write_inode = gfs2_write_inode, - .delete_inode = gfs2_delete_inode, - .put_super = gfs2_put_super, - .write_super = gfs2_write_super, - .sync_fs = gfs2_sync_fs, - .freeze_fs = gfs2_freeze, - .unfreeze_fs = gfs2_unfreeze, - .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, - .clear_inode = gfs2_clear_inode, - .drop_inode = gfs2_drop_inode, - .show_options = gfs2_show_options, -}; - diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 601913e..40bcc37 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -7,14 +7,20 @@ * of the GNU General Public License version 2. */ +#include #include #include #include #include #include -#include +#include +#include +#include +#include +#include #include -#include +#include +#include #include "gfs2.h" #include "incore.h" @@ -31,6 +37,183 @@ #include "super.h" #include "trans.h" #include "util.h" +#include "sys.h" +#include "eattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_error, +}; + +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_error, NULL} +}; + +/** + * gfs2_mount_args - Parse mount options + * @sdp: + * @data: + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) +{ + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; + + /* Split the options into tokens with the "," character and + process them */ + + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + args->ar_ignore_local_fs = 1; + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + args->ar_localcaching = 1; + break; + case Opt_debug: + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + args->ar_upgrade = 1; + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_error: + default: + fs_info(sdp, "invalid mount option: %s\n", o); + return -EINVAL; + } + } + + return 0; +} /** * gfs2_jindex_free - Clear all the journal index information @@ -436,3 +619,719 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) mutex_unlock(&sdp->sd_freeze_lock); } + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @sync: synchronous write flag + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, int sync) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = 0; + + /* Check this is a "normal" inode, etc */ + if (!test_bit(GIF_USER, &ip->i_flags) || + (current->flags & PF_MEMALLOC)) + return 0; + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); + } + gfs2_trans_end(sdp); +do_unlock: + gfs2_glock_dq_uninit(&gh); +do_flush: + if (sync != 0) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + return ret; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_inum_inode); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_ir_inode); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_write_super + * @sb: the superblock + * + */ + +static void gfs2_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. + */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + sb->s_dirt = 0; + if (wait && sb->s_fs_info) + gfs2_log_flush(sb->s_fs_info, NULL); + return 0; +} + +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return -EINVAL; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + return 0; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + gfs2_unfreeze_fs(sb->s_fs_info); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; + int error; + + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_log_flush_secs; + spin_unlock(>->gt_spin); + error = gfs2_mount_args(sdp, &args, data); + if (error) + return error; + + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, ignore_local_fs) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, localcaching) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); + if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + spin_lock(>->gt_spin); + gt->gt_log_flush_secs = args.ar_commit; + spin_unlock(>->gt_spin); + + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then its because a + * remote node tried to deallocate the inode but failed due to this node + * still having the inode open. Here we mark the link count zero + * since we know that it must have reached zero if the GLF_DEMOTE flag + * is set on the iopen glock. If we didn't do a disk read since the + * remote node removed the final link then we might otherwise miss + * this event. This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static void gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + generic_drop_inode(inode); +} + +/** + * gfs2_clear_inode - Deallocate an inode when VFS is done with it + * @inode: The VFS inode + * + */ + +static void gfs2_clear_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + /* This tells us its a "real" inode and not one which only + * serves to contain an address space (see rgrp.c, meta_io.c) + * which therefore doesn't have its own glocks. + */ + if (test_bit(GIF_USER, &ip->i_flags)) { + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } + } +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int lfsecs; + + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_ignore_local_fs) + seq_printf(s, ",ignore_local_fs"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_localcaching) + seq_printf(s, ",localcaching"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_upgrade) + seq_printf(s, ",upgrade"); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_printf(s, ",discard"); + lfsecs = sdp->sd_tune.gt_log_flush_secs; + if (lfsecs != 60) + seq_printf(s, ",commit=%d", lfsecs); + return 0; +} + +/* + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ + +static void gfs2_delete_inode(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (!test_bit(GIF_USER, &ip->i_flags)) + goto out; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + if (error) + goto out_unlock; + +out_truncate: + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_delete_inode: %d\n", error); +out: + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + } + return &ip->i_inode; +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(gfs2_inode_cachep, inode); +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .delete_inode = gfs2_delete_inode, + .put_super = gfs2_put_super, + .write_super = gfs2_write_super, + .sync_fs = gfs2_sync_fs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .clear_inode = gfs2_clear_inode, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + -- cgit v1.1 From 2286dbfad1fb622ee2691537e5caaedee4618860 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:45:09 +0100 Subject: GFS2: Move gfs2_rmdiri into ops_inode.c Move gfs2_rmdiri() into ops_inode.c and make it static. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 52 ---------------------------------------------------- fs/gfs2/inode.h | 2 -- fs/gfs2/ops_inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c03a1a3..9b17447 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1046,58 +1046,6 @@ fail: return ERR_PTR(error); } -/** - * gfs2_rmdiri - Remove a directory - * @dip: The parent directory of the directory to be removed - * @name: The name of the directory to be removed - * @ip: The GFS2 inode of the directory to be removed - * - * Assumes Glocks on dip and ip are held - * - * Returns: errno - */ - -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) -{ - struct qstr dotname; - int error; - - if (ip->i_entries != 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - error = gfs2_dir_del(dip, name); - if (error) - return error; - - error = gfs2_change_nlink(dip, -1); - if (error) - return error; - - gfs2_str2qstr(&dotname, "."); - error = gfs2_dir_del(ip, &dotname); - if (error) - return error; - - gfs2_str2qstr(&dotname, ".."); - error = gfs2_dir_del(ip, &dotname); - if (error) - return error; - - /* It looks odd, but it really should be done twice */ - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - return error; -} /* * gfs2_unlink_ok - check to see that a inode is still in a directory diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 2c3ec07..6cd3928 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1c70fa5..5dacd64 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -473,6 +473,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) } /** + * gfs2_rmdiri - Remove a directory + * @dip: The parent directory of the directory to be removed + * @name: The name of the directory to be removed + * @ip: The GFS2 inode of the directory to be removed + * + * Assumes Glocks on dip and ip are held + * + * Returns: errno + */ + +static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct qstr dotname; + int error; + + if (ip->i_entries != 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(ip); + return -EIO; + } + + error = gfs2_dir_del(dip, name); + if (error) + return error; + + error = gfs2_change_nlink(dip, -1); + if (error) + return error; + + gfs2_str2qstr(&dotname, "."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + gfs2_str2qstr(&dotname, ".."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + /* It looks odd, but it really should be done twice */ + error = gfs2_change_nlink(ip, -1); + if (error) + return error; + + error = gfs2_change_nlink(ip, -1); + if (error) + return error; + + return error; +} + +/** * gfs2_rmdir - Remove a directory * @dir: The parent directory of the directory to be removed * @dentry: The dentry of the directory to remove -- cgit v1.1 From 536baf02f650f4547f105386878b4736fbc181e8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:48:59 +0100 Subject: GFS2: Move gfs2_readlinki into ops_inode.c Move gfs2_readlinki into ops_inode.c and make it static Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 58 +---------------------------------------------------- fs/gfs2/inode.h | 1 - fs/gfs2/ops_inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 58 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9b17447..676e750 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1085,63 +1085,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -/** - * gfs2_readlinki - return the contents of a symlink - * @ip: the symlink's inode - * @buf: a pointer to the buffer to be filled - * @len: a pointer to the length of @buf - * - * If @buf is too small, a piece of memory is kmalloc()ed and needs - * to be freed by the caller. - * - * Returns: errno - */ - -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) -{ - struct gfs2_holder i_gh; - struct buffer_head *dibh; - unsigned int x; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; - } - - if (!ip->i_disksize) { - gfs2_consist_inode(ip); - error = -EIO; - goto out; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out; - - x = ip->i_disksize + 1; - if (x > *len) { - *buf = kmalloc(x, GFP_NOFS); - if (!*buf) { - error = -ENOMEM; - goto out_brelse; - } - } - - memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); - *len = x; - -out_brelse: - brelse(dibh); -out: - gfs2_glock_dq_uninit(&i_gh); - return error; -} - -static int -__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { struct buffer_head *dibh; int error; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6cd3928..fc9a08f 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,6 @@ extern struct inode *gfs2_createi(struct gfs2_holder *ghs, extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); -extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 5dacd64..f607f09 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -938,6 +938,61 @@ out: } /** + * gfs2_readlinki - return the contents of a symlink + * @ip: the symlink's inode + * @buf: a pointer to the buffer to be filled + * @len: a pointer to the length of @buf + * + * If @buf is too small, a piece of memory is kmalloc()ed and needs + * to be freed by the caller. + * + * Returns: errno + */ + +static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +{ + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int x; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + if (!ip->i_disksize) { + gfs2_consist_inode(ip); + error = -EIO; + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + x = ip->i_disksize + 1; + if (x > *len) { + *buf = kmalloc(x, GFP_NOFS); + if (!*buf) { + error = -ENOMEM; + goto out_brelse; + } + } + + memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); + *len = x; + +out_brelse: + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + return error; +} + +/** * gfs2_readlink - Read the value of a symlink * @dentry: the symlink * @buf: the buffer to read the symlink data into -- cgit v1.1 From 87ec21741138bb42e7f943bb142b1d8567c10925 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:54:50 +0100 Subject: GFS2: Move gfs2_unlink_ok into ops_inode.c Another function which is only called from one ops_inode.c so we move it and make it static. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 39 --------------------------------------- fs/gfs2/inode.h | 2 -- fs/gfs2/ops_inode.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 676e750..2f94bd7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1046,45 +1046,6 @@ fail: return ERR_PTR(error); } - -/* - * gfs2_unlink_ok - check to see that a inode is still in a directory - * @dip: the directory - * @name: the name of the file - * @ip: the inode - * - * Assumes that the lock on (at least) @dip is held. - * - * Returns: 0 if the parent/child relationship is correct, errno if it isn't - */ - -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip) -{ - int error; - - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - return -EPERM; - - if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current_fsuid() && - ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) - return -EPERM; - - if (IS_APPEND(&dip->i_inode)) - return -EPERM; - - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); - if (error) - return error; - - error = gfs2_dir_check(&dip->i_inode, name, ip); - if (error) - return error; - - return 0; -} - static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { struct buffer_head *dibh; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index fc9a08f..c341aaf 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f607f09..f8bd20b 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -262,6 +262,44 @@ out_parent: return error; } +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + dip->i_inode.i_uid != current_fsuid() && + ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + error = gfs2_dir_check(&dip->i_inode, name, ip); + if (error) + return error; + + return 0; +} + /** * gfs2_unlink - Unlink a file * @dir: The inode of the directory containing the file to unlink -- cgit v1.1 From d5046853634a8d73f28bad3cf68d182c4a99035d Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 22 May 2009 20:36:21 +0900 Subject: nilfs2: fix memory leak in nilfs_ioctl_clean_segments This fixes a new memory leak problem in garbage collection. The problem was brought by the bugfix patch ("nilfs2: fix lock order reversal in nilfs_clean_segments ioctl"). Thanks to Kentaro Suzuki for finding this problem. Reported-by: Kentaro Suzuki Signed-off-by: Ryusuke Konishi --- fs/nilfs2/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 50ff3f2..d6759b9 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -576,7 +576,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); out_free: - while (--n > 0) + while (--n >= 0) vfree(kbufs[n]); kfree(kbufs[4]); return ret; -- cgit v1.1 From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:49 -0400 Subject: block: Do away with the notion of hardsect_size Until now we have had a 1:1 mapping between storage device physical block size and the logical block sized used when addressing the device. With SATA 4KB drives coming out that will no longer be the case. The sector size will be 4KB but the logical block size will remain 512-bytes. Hence we need to distinguish between the physical block size and the logical ditto. This patch renames hardsect_size to logical_block_size. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio.c | 3 ++- fs/block_dev.c | 6 +++--- fs/buffer.c | 6 +++--- fs/direct-io.c | 2 +- fs/ext3/super.c | 4 ++-- fs/ext4/super.c | 2 +- fs/gfs2/ops_fstype.c | 4 ++-- fs/gfs2/rgrp.c | 2 +- fs/nilfs2/the_nilfs.c | 2 +- fs/ntfs/super.c | 6 +++--- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/super.c | 2 +- fs/partitions/ibm.c | 2 +- fs/partitions/msdos.c | 4 ++-- fs/udf/super.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 16 files changed, 26 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 81dc93e..4445c38 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) sector_t bio_sector_offset(struct bio *bio, unsigned short index, unsigned int offset) { - unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + unsigned int sector_sz; struct bio_vec *bv; sector_t sectors; int i; + sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); sectors = 0; if (index >= bio->bi_idx) diff --git a/fs/block_dev.c b/fs/block_dev.c index a85fe31..a29b4dc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_hardsect_size(bdev)) + if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ @@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { - int minsize = bdev_hardsect_size(sb->s_bdev); + int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); @@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_hardsect_size(bdev); + unsigned bsize = bdev_logical_block_size(bdev); bdev->bd_inode->i_size = size; while (bsize < PAGE_CACHE_SIZE) { diff --git a/fs/buffer.c b/fs/buffer.c index aed2977..36e2bbc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1085,12 +1085,12 @@ static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, int size) { /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_hardsect_size(bdev)-1) || + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { printk(KERN_ERR "getblk(): invalid block size %d requested\n", size); - printk(KERN_ERR "hardsect size: %d\n", - bdev_hardsect_size(bdev)); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); dump_stack(); return NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index 05763bb..8b10b87 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, rw = WRITE_ODIRECT; if (bdev) - bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); + bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); if (offset & blocksize_mask) { if (bdev) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe..acbb94f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - hblock = bdev_hardsect_size(sb->s_bdev); + hblock = bdev_logical_block_size(sb->s_bdev); if (sb->s_blocksize != blocksize) { /* * Make sure the blocksize for the filesystem is larger @@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT3-fs: blocksize too small for journal device.\n"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e..a30549f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT4-fs: blocksize too small for journal device.\n"); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473..a3b2ac9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) } /* Set up the buffer cache and SB for real */ - if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", - sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5650382..a971d24 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_hardsect_size(sb->s_bdev); + bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; sector_t nr_sects = 0; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 7f65b3b..a91f15b 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize) { - int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { printk(KERN_ERR diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index f76951d..6aa7c47 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -25,7 +25,7 @@ #include #include #include -#include /* For bdev_hardsect_size(). */ +#include /* For bdev_logical_block_size(). */ #include #include #include @@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; /* We support sector sizes up to the PAGE_CACHE_SIZE. */ - if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { if (!silent) ntfs_error(sb, "Device has unsupported sector size " "(%i). The maximum supported sector " "size on this architecture is %lu " "bytes.", - bdev_hardsect_size(sb->s_bdev), + bdev_logical_block_size(sb->s_bdev), PAGE_CACHE_SIZE); goto err_out_now; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4f85ece..09cc25d 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_hardsect_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg->hr_bdev); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 79ff8d9..5c6163f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb, *bh = NULL; /* may be > 512 */ - *sector_size = bdev_hardsect_size(sb->s_bdev); + *sector_size = bdev_logical_block_size(sb->s_bdev); if (*sector_size > OCFS2_MAX_BLOCKSIZE) { mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", *sector_size, OCFS2_MAX_BLOCKSIZE); diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 4629768..fc71aab 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) Sector sect; res = 0; - blocksize = bdev_hardsect_size(bdev); + blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_exit; i_size = i_size_read(bdev->bd_inode); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 7965118..0028d2e 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, Sector sect; unsigned char *data; u32 this_sector, this_size; - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -415,7 +415,7 @@ static struct { int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) { - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; Sector sect; unsigned char *data; struct partition *p; diff --git a/fs/udf/super.c b/fs/udf/super.c index 72348cc..0ba4410 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { ret = udf_load_vrs(sb, &uopt, silent, &fileset); } else { - uopt.blocksize = bdev_hardsect_size(sb->s_bdev); + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e28800a..1418b91 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early( struct block_device *bdev) { return xfs_setsize_buftarg_flags(btp, - PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); + PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); } int -- cgit v1.1 From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:50 -0400 Subject: block: Use accessor functions for queue limits Convert all external users of queue limits to using wrapper functions instead of poking the request queue variables directly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 4445c38..ab423a1 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); int nr_pages; - nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > q->max_phys_segments) - nr_pages = q->max_phys_segments; - if (nr_pages > q->max_hw_segments) - nr_pages = q->max_hw_segments; + nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > queue_max_phys_segments(q)) + nr_pages = queue_max_phys_segments(q); + if (nr_pages > queue_max_hw_segments(q)) + nr_pages = queue_max_hw_segments(q); return nr_pages; } @@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * make this too complex. */ - while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_phys_segments >= q->max_hw_segments) { + while (bio->bi_phys_segments >= queue_max_phys_segments(q) + || bio->bi_phys_segments >= queue_max_hw_segments(q)) { if (retried_segments) return 0; @@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); } /** @@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, q->max_sectors); + return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } struct bio_map_data { -- cgit v1.1 From c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:53 -0400 Subject: block: Export I/O topology for block devices and partitions To support devices with physical block sizes bigger than 512 bytes we need to ensure proper alignment. This patch adds support for exposing I/O topology characteristics as devices are stacked. logical_block_size is the smallest unit the device can address. physical_block_size indicates the smallest I/O the device can write without incurring a read-modify-write penalty. The io_min parameter is the smallest preferred I/O size reported by the device. In many cases this is the same as the physical block size. However, the io_min parameter can be scaled up when stacking (RAID5 chunk size > physical block size). The io_opt characteristic indicates the optimal I/O size reported by the device. This is usually the stripe width for arrays. The alignment_offset parameter indicates the number of bytes the start of the device/partition is offset from the device's natural alignment. Partition tools and MD/DM utilities can use this to pad their offsets so filesystems start on proper boundaries. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/partitions/check.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 99e33ef..0af3608 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; + p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); -- cgit v1.1 From 8db14ca12569fe885694bd3d5ff84c2d973d3cb0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 23 May 2009 18:57:25 +0000 Subject: [CIFS] Avoid open on possible directories since Samba now rejects them Small change (mostly formatting) to limit lookup based open calls to file create only. After discussion yesteday on samba-technical about the posix lookup regression, and looking at a problem with cifs posix open to one particular Samba version, Jeff and JRA realized that Samba server's behavior changed in this area (posix open behavior on files vs. directories). To make this behavior consistent, JRA just made a fix to Samba server to alter how it handles open of directories (now returning the equivalent of EISDIR instead of success). Since we don't know at lookup time whether the inode is a directory or file (and thus whether posix open will succeed with most current Samba server), this change avoids the posix open code on lookup open (just issues posix open on creates). This gets the semantic benefits we want (atomicity, posix byte range locks, improved write semantics on newly created files) and file create still is fast, and we avoid the problem that Jeff noticed yesterday with "openat" (and some open directory calls) of non-cached directories to one version of Samba server, and will work with future Samba versions (which include the fix jra just pushed into Samba server). I confirmed this approach with jra yesterday and with Shirish today. Posix open is only called (at lookup time) for file create now. For opens (rather than creates), because we do not know if it is a file or directory yet, and current Samba no longer allows us to do posix open on dirs, we could end up wasting an open call on what turns out to be a dir. For file opens, we wait to call posix open till cifs_open. It could be added here (lookup) in the future but the performance tradeoff of the extra network request when EISDIR or EACCES is returned would have to be weighed against the 50% reduction in network traffic in the other paths. Reviewed-by: Shirish Pargaonkar Tested-by: Jeff Layton CC: Jeremy Allison Signed-off-by: Steve French --- fs/cifs/dir.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f49d684..3758965 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -657,31 +657,36 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } cFYI(1, ("Full path: %s inode = 0x%p", full_path, direntry->d_inode)); + /* Posix open is only called (at lookup time) for file create now. + * For opens (rather than creates), because we do not know if it + * is a file or directory yet, and current Samba no longer allows + * us to do posix open on dirs, we could end up wasting an open call + * on what turns out to be a dir. For file opens, we wait to call posix + * open till cifs_open. It could be added here (lookup) in the future + * but the performance tradeoff of the extra network request when EISDIR + * or EACCES is returned would have to be weighed against the 50% + * reduction in network traffic in the other paths. + */ if (pTcon->unix_ext) { if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY)) && - (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open) { - if (!((nd->intent.open.flags & O_CREAT) && - (nd->intent.open.flags & O_EXCL))) { - rc = cifs_posix_open(full_path, &newInode, + (nd->flags & LOOKUP_OPEN) && !pTcon->broken_posix_open && + (nd->intent.open.flags & O_CREAT)) { + rc = cifs_posix_open(full_path, &newInode, parent_dir_inode->i_sb, nd->intent.open.create_mode, nd->intent.open.flags, &oplock, &fileHandle, xid); - /* - * This code works around a bug in - * samba posix open in samba versions 3.3.1 - * and earlier where create works - * but open fails with invalid parameter. - * If either of these error codes are - * returned, follow the normal lookup. - * Otherwise, the error during posix open - * is handled. - */ - if ((rc != -EINVAL) && (rc != -EOPNOTSUPP)) - posix_open = true; - else - pTcon->broken_posix_open = true; - } + /* + * The check below works around a bug in POSIX + * open in samba versions 3.3.1 and earlier where + * open could incorrectly fail with invalid parameter. + * If either that or op not supported returned, follow + * the normal lookup. + */ + if ((rc == 0) || (rc == -ENOENT)) + posix_open = true; + else if ((rc == -EINVAL) || (rc != -EOPNOTSUPP)) + pTcon->broken_posix_open = true; } if (!posix_open) rc = cifs_get_inode_info_unix(&newInode, full_path, -- cgit v1.1 From 79f52b77b89e8b7aa9fbe62135eea198a2ecbd5b Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Sat, 23 May 2009 20:28:41 -0500 Subject: jfs: Add missing mutex_unlock call to error path Jan Kucera found an missing call to mutex_unlock() with his static code checker. It's an unlikely error path to hit in the real world, but it should be fixed. Signed-off-by: Dave Kleikamp Reported-by: Jan Kucera --- fs/jfs/jfs_imap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 3460572..0fc3040 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) txAbort(tid, 0); txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); /* release the inode map lock */ IWRITE_UNLOCK(ipimap); -- cgit v1.1 From 759d427aa5a9d88a81afd11817cdeb40aea85234 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:51:00 -0400 Subject: ext4: remove unused function __ext4_write_dirty_metadata The __ext4_write_dirty_metadata() function was introduced by commit 0390131b, "ext4: Allow ext4 to run without a journal", but nothing ever used the function, either then or since. So let's remove it and save a bit of space. Cc: Frank Mayhar Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dadd3f9..14c00ff 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4790,25 +4790,6 @@ int ext4_write_inode(struct inode *inode, int wait) return ext4_force_commit(inode->i_sb); } -int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) -{ - int err = 0; - - mark_buffer_dirty(bh); - if (inode && inode_needs_sync(inode)) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long)bh->b_blocknr); - err = -EIO; - } - } - return err; -} - /* * ext4_setattr() * -- cgit v1.1 From 88b6edd17c62b7d346d21f4087893ce7d4ef828a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:50:39 -0400 Subject: ext4: Clean up calls to ext4_get_group_desc() If the caller isn't planning on modifying the block group descriptors, there's no need to pass in a pointer to a struct buffer_head. Nuking this saves a tiny amount of CPU time and stack space usage. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 3 +-- fs/ext4/super.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 82f7d1d..3743bd8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -347,7 +347,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *desc; - struct buffer_head *bh; struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); @@ -402,7 +401,7 @@ find_close_to_parent: found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, &bh); + desc = ext4_get_group_desc(sb, i, NULL); if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eca6c05..91b98b5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1660,7 +1660,6 @@ static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - struct buffer_head *bh; ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; @@ -1693,7 +1692,7 @@ static int ext4_fill_flex_info(struct super_block *sb) } for (i = 0; i < sbi->s_groups_count; i++) { - gdp = ext4_get_group_desc(sb, i, &bh); + gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, -- cgit v1.1 From e1b28aab5804aa477c33d19855d6747607a885fd Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 May 2009 15:41:27 +0100 Subject: GFS2: Remove lockstruct subdir from gfs2 sysfs files The lockstruct sub directory contained two entries, both of which are duplicated elsewhere in the gfs2 sysfs files as well as being available via /proc/mounts. There is no userland program using either of them, so this patch removes them. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 9f6d48b..94bd59e 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -276,25 +276,6 @@ static struct kobj_type gfs2_ktype = { .sysfs_ops = &gfs2_attr_ops, }; -/* - * display struct lm_lockstruct fields - */ - -#define LOCKSTRUCT_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ -} \ -static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name) - -LOCKSTRUCT_ATTR(jid, "%u\n"); -LOCKSTRUCT_ATTR(first, "%u\n"); - -static struct attribute *lockstruct_attrs[] = { - &lockstruct_attr_jid.attr, - &lockstruct_attr_first.attr, - NULL, -}; /* * lock_module. Originally from lock_dlm @@ -397,6 +378,11 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_recover_jid_status); } +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); +} + #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) @@ -404,6 +390,7 @@ GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(jid, 0444, jid_show, NULL); GDLM_ATTR(first, 0444, lkfirst_show, NULL); GDLM_ATTR(first_done, 0444, first_done_show, NULL); GDLM_ATTR(recover, 0200, NULL, recover_store); @@ -415,7 +402,7 @@ static struct attribute *lock_module_attrs[] = { &gdlm_attr_block.attr, &gdlm_attr_withdraw.attr, &gdlm_attr_id.attr, - &lockstruct_attr_jid.attr, + &gdlm_attr_jid.attr, &gdlm_attr_first.attr, &gdlm_attr_first_done.attr, &gdlm_attr_recover.attr, @@ -558,11 +545,6 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct attribute_group lockstruct_group = { - .name = "lockstruct", - .attrs = lockstruct_attrs, -}; - static struct attribute_group args_group = { .name = "args", .attrs = args_attrs, @@ -588,13 +570,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail; - error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group); - if (error) - goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); if (error) - goto fail_lockstruct; + goto fail_reg; error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) @@ -611,8 +589,6 @@ fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_args: sysfs_remove_group(&sdp->sd_kobj, &args_group); -fail_lockstruct: - sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); fail_reg: kobject_put(&sdp->sd_kobj); fail: @@ -624,7 +600,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &args_group); - sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } -- cgit v1.1 From f6eb53498ee8f725832f3a0fffca90566bb118a6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 26 May 2009 15:50:25 +0100 Subject: GFS2: Remove args subdir from gfs2 sysfs files Since we can cat /proc/mounts there is no need to have this subdirectory in the gfs2 sysfs files. In fact this does not reflect the full range of possible mount argumenmts, where as /proc/mounts does. There was only one userland user of this set of sysfs files and it will function perfectly well without these files being present (in fact that subcommand of gfs2_tool is obsolete anyway). The tune/* subdirectory is also considered mostly obsolete, but there are a few uses of this until mount arguments can be added for the last few functions for which there are no equivalents currently. However the tune/* directory is still in my sights and new code should avoid using it. Only the gfs2_quota and gfs2_tool programs are know to use tune/* at the moment. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 52 +--------------------------------------------------- 1 file changed, 1 insertion(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 94bd59e..23419dc 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -411,44 +411,6 @@ static struct attribute *lock_module_attrs[] = { NULL, }; -#define ARGS_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ -} \ -static struct gfs2_attr args_attr_##name = __ATTR_RO(name) - -ARGS_ATTR(lockproto, "%s\n"); -ARGS_ATTR(locktable, "%s\n"); -ARGS_ATTR(hostdata, "%s\n"); -ARGS_ATTR(spectator, "%d\n"); -ARGS_ATTR(ignore_local_fs, "%d\n"); -ARGS_ATTR(localcaching, "%d\n"); -ARGS_ATTR(localflocks, "%d\n"); -ARGS_ATTR(debug, "%d\n"); -ARGS_ATTR(upgrade, "%d\n"); -ARGS_ATTR(posix_acl, "%d\n"); -ARGS_ATTR(quota, "%u\n"); -ARGS_ATTR(suiddir, "%d\n"); -ARGS_ATTR(data, "%d\n"); - -static struct attribute *args_attrs[] = { - &args_attr_lockproto.attr, - &args_attr_locktable.attr, - &args_attr_hostdata.attr, - &args_attr_spectator.attr, - &args_attr_ignore_local_fs.attr, - &args_attr_localcaching.attr, - &args_attr_localflocks.attr, - &args_attr_debug.attr, - &args_attr_upgrade.attr, - &args_attr_posix_acl.attr, - &args_attr_quota.attr, - &args_attr_suiddir.attr, - &args_attr_data.attr, - NULL, -}; - /* * get and set struct gfs2_tune fields */ @@ -545,11 +507,6 @@ static struct attribute *tune_attrs[] = { NULL, }; -static struct attribute_group args_group = { - .name = "args", - .attrs = args_attrs, -}; - static struct attribute_group tune_group = { .name = "tune", .attrs = tune_attrs, @@ -570,13 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); - if (error) - goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) - goto fail_args; + goto fail_reg; error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); if (error) @@ -587,8 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); -fail_args: - sysfs_remove_group(&sdp->sd_kobj, &args_group); fail_reg: kobject_put(&sdp->sd_kobj); fail: @@ -599,7 +550,6 @@ fail: void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { sysfs_remove_group(&sdp->sd_kobj, &tune_group); - sysfs_remove_group(&sdp->sd_kobj, &args_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } -- cgit v1.1 From d0367a508af9cf97beb202935bb9ad8883d30cd1 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Tue, 26 May 2009 14:51:00 -0400 Subject: nfs: fix build error in nfsroot with initconst fix build error with latest kbuild adjustments to initconst. The commit a447c0932445f92ce6f4c1bd020f62c5097a7842 ("vfs: Use const for kernel parser table") changed: static match_table_t __initdata tokens = { to static match_table_t __initconst tokens = { But the missing const causes popwerpc to fail with latest updates to __initconst like this: fs/nfs/nfsroot.c:400: error: __setup_str_nfs_root_setup causes a section type conflict fs/nfs/nfsroot.c:400: error: __setup_str_nfs_root_setup causes a section type conflict The bug is only present with kbuild-next. Following patch has been build tested. Signed-off-by: Sam Ravnborg Cc: Steven Whitehouse Cc: Stephen Rothwell Acked-by: Jan Beulich Signed-off-by: Trond Myklebust --- fs/nfs/nfsroot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index d9ef602..e3ed590 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -129,7 +129,7 @@ enum { Opt_err }; -static match_table_t __initconst tokens = { +static const match_table_t tokens __initconst = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, -- cgit v1.1 From 95baa25c7321eb8613246acbf61b97911cc748d3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 26 May 2009 14:51:00 -0400 Subject: NFSv4: Fix the case where NFSv4 renewal fails If the asynchronous lease renewal fails (usually due to a soft timeout), then we _must_ schedule state recovery in order to ensure that we don't lose the lease unnecessarily or, if the lease is already lost, that we recover the locking state promptly... Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a4d2426..4674f80 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2594,12 +2594,9 @@ static void nfs4_renew_done(struct rpc_task *task, void *data) unsigned long timestamp = (unsigned long)data; if (task->tk_status < 0) { - switch (task->tk_status) { - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_CB_PATH_DOWN: - nfs4_schedule_state_recovery(clp); - } + /* Unless we're shutting down, schedule state recovery! */ + if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) + nfs4_schedule_state_recovery(clp); return; } spin_lock(&clp->cl_lock); -- cgit v1.1 From 46a7574caf5bc533c24b315800ed323c187614f5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 24 May 2009 18:45:17 -0400 Subject: cifs: fix artificial limit on reading symlinks There's no reason to limit the size of a symlink that we can read to 4000 bytes. That may be nowhere near PATH_MAX if the server is sending UCS2 strings. CIFS should be able to read in a symlink up to the size of the buffer. The size of the header has already been accounted for when creating the slabcache, so CIFSMaxBufSize should be the correct size to pass in. Fixes samba bug #6384. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d062602..aece2a8 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2427,8 +2427,7 @@ querySymLinkRetry: params = 2 /* level */ + 4 /* rsrvd */ + name_len /* incl null */ ; pSMB->TotalDataCount = 0; pSMB->MaxParameterCount = cpu_to_le16(2); - /* BB find exact max data count below from sess structure BB */ - pSMB->MaxDataCount = cpu_to_le16(4000); + pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize); pSMB->MaxSetupCount = 0; pSMB->Reserved = 0; pSMB->Flags = 0; -- cgit v1.1 From f55ed1a83d099f275c9560ad7d4c4700d1e54bdd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 May 2009 16:28:11 -0400 Subject: cifs: tighten up default file_mode/dir_mode The current default file mode is 02767 and dir mode is 0777. This is extremely "loose". Given that CIFS is a single-user protocol, these permissions allow anyone to use the mount -- in effect, giving anyone on the machine access to the credentials used to mount the share. Change this by making the default permissions restrict write access to the default owner of the mount. Give read and execute permissions to everyone else. These are the same permissions that VFAT mounts get by default so there is some precedent here. Note that this patch also removes the mandatory locking flags from the default file_mode. After having looked at how these flags are used by the kernel, I don't think that keeping them as the default offers any real benefit. That flag combination makes it so that the kernel enforces mandatory locking. Since the server is going to do that for us anyway, I don't think we want the client to enforce this by default on applications that just want advisory locks. Anyone that does want this behavior can always enable it by setting the file_mode appropriately. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4aa81a5..f32c903 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -827,9 +827,9 @@ cifs_parse_mount_options(char *options, const char *devname, vol->target_rfc1001_name[0] = 0; vol->linux_uid = current_uid(); /* use current_euid() instead? */ vol->linux_gid = current_gid(); - vol->dir_mode = S_IRWXUGO; - /* 2767 perms indicate mandatory locking support */ - vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP); + + /* default to only allowing write access to owner of the mount */ + vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ vol->rw = true; -- cgit v1.1 From 348ca1029e8bae6e0c49097ad25439b17c5326f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2009 15:46:50 +0100 Subject: FS-Cache: Fixup renamed filenames in comments in internal.h Fix up renamed filenames in comments in fs/fscache/internal.h. Originally, the files were all called fsc-xxx.c, but they got renamed to just xxx.c. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/fscache/internal.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index e0cbd16..1c34130 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -28,7 +28,7 @@ #define FSCACHE_MAX_THREADS 32 /* - * fsc-cache.c + * cache.c */ extern struct list_head fscache_cache_list; extern struct rw_semaphore fscache_addremove_sem; @@ -37,7 +37,7 @@ extern struct fscache_cache *fscache_select_cache_for_object( struct fscache_cookie *); /* - * fsc-cookie.c + * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; @@ -45,13 +45,13 @@ extern void fscache_cookie_init_once(void *); extern void __fscache_cookie_put(struct fscache_cookie *); /* - * fsc-fsdef.c + * fsdef.c */ extern struct fscache_cookie fscache_fsdef_index; extern struct fscache_cookie_def fscache_fsdef_netfs_def; /* - * fsc-histogram.c + * histogram.c */ #ifdef CONFIG_FSCACHE_HISTOGRAM extern atomic_t fscache_obj_instantiate_histogram[HZ]; @@ -75,7 +75,7 @@ extern const struct file_operations fscache_histogram_fops; #endif /* - * fsc-main.c + * main.c */ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; @@ -86,14 +86,14 @@ extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); /* - * fsc-object.c + * object.c */ extern void fscache_withdrawing_object(struct fscache_cache *, struct fscache_object *); extern void fscache_enqueue_object(struct fscache_object *); /* - * fsc-operation.c + * operation.c */ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); @@ -104,7 +104,7 @@ extern void fscache_start_operations(struct fscache_object *); extern void fscache_operation_gc(struct work_struct *); /* - * fsc-proc.c + * proc.c */ #ifdef CONFIG_PROC_FS extern int __init fscache_proc_init(void); @@ -115,7 +115,7 @@ extern void fscache_proc_cleanup(void); #endif /* - * fsc-stats.c + * stats.c */ #ifdef CONFIG_FSCACHE_STATS extern atomic_t fscache_n_ops_processed[FSCACHE_MAX_THREADS]; -- cgit v1.1 From 911e690e70540f009125bacd16c017eb1a7b1916 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 May 2009 15:46:55 +0100 Subject: CacheFiles: Fixup renamed filenames in comments in internal.h Fix up renamed filenames in comments in fs/cachefiles/internal.h. Originally, the files were all called cf-xxx.c, but they got renamed to just xxx.c. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/cachefiles/internal.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 19218e1..f7c255f 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -122,13 +122,13 @@ static inline void cachefiles_state_changed(struct cachefiles_cache *cache) } /* - * cf-bind.c + * bind.c */ extern int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args); extern void cachefiles_daemon_unbind(struct cachefiles_cache *cache); /* - * cf-daemon.c + * daemon.c */ extern const struct file_operations cachefiles_daemon_fops; @@ -136,17 +136,17 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, unsigned fnr, unsigned bnr); /* - * cf-interface.c + * interface.c */ extern const struct fscache_cache_ops cachefiles_cache_ops; /* - * cf-key.c + * key.c */ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type); /* - * cf-namei.c + * namei.c */ extern int cachefiles_delete_object(struct cachefiles_cache *cache, struct cachefiles_object *object); @@ -165,7 +165,7 @@ extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); /* - * cf-proc.c + * proc.c */ #ifdef CONFIG_CACHEFILES_HISTOGRAM extern atomic_t cachefiles_lookup_histogram[HZ]; @@ -190,7 +190,7 @@ void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) #endif /* - * cf-rdwr.c + * rdwr.c */ extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, struct page *, gfp_t); @@ -205,7 +205,7 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *); extern void cachefiles_uncache_page(struct fscache_object *, struct page *); /* - * cf-security.c + * security.c */ extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); extern int cachefiles_determine_cache_security(struct cachefiles_cache *cache, @@ -225,7 +225,7 @@ static inline void cachefiles_end_secure(struct cachefiles_cache *cache, } /* - * cf-xattr.c + * xattr.c */ extern int cachefiles_check_object_type(struct cachefiles_object *object); extern int cachefiles_set_object_xattr(struct cachefiles_object *object, -- cgit v1.1 From a0d24b295aed7a9daf4ca36bd4784e4d40f82303 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 19 May 2009 12:03:15 +0800 Subject: nfsd: fix hung up of nfs client while sync write data to nfs server Commit 'Short write in nfsd becomes a full write to the client' (31dec2538e45e9fff2007ea1f4c6bae9f78db724) broken the sync write. With the following commands to reproduce: $ mount -t nfs -o sync 192.168.0.21:/nfsroot /mnt $ cd /mnt $ echo aaaa > temp.txt Then nfs client is hung up. In SYNC mode the server alaways return the write count 0 to the client. This is because the value of host_err in nfsd_vfs_write() will be overwrite in SYNC mode by 'host_err=nfsd_sync(file);', and then we return host_err(which is now 0) as write count. This patch fixed the problem. Signed-off-by: Wei Yongjun Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6c68ffd..b660435 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1015,6 +1015,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { + *cnt = host_err; nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@ -1060,10 +1061,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) { + if (host_err >= 0) err = 0; - *cnt = host_err; - } else + else err = nfserrno(host_err); out: return err; -- cgit v1.1 From 14dba5331b90c20588ae6504fea8049c7283028d Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Wed, 27 May 2009 09:31:52 -0400 Subject: integrity: nfsd imbalance bug fix An nfsd exported file is opened/closed by the kernel causing the integrity imbalance message. Before a file is opened, there normally is permission checking, which is done in inode_permission(). However, as integrity checking requires a dentry and mount point, which is not available in inode_permission(), the integrity (permission) checking must be called separately. In order to detect any missing integrity checking calls, we keep track of file open/closes. ima_path_check() increments these counts and does the integrity (permission) checking. As a result, the number of calls to ima_path_check()/ima_file_free() should be balanced. An extra call to fput(), indicates the file could have been accessed without first calling ima_path_check(). In nfsv3 permission checking is done once, followed by multiple reads, which do an open/close for each read. The integrity (permission) checking call should be in nfsd_permission() after the inode_permission() call, but as there is no correlation between the number of permission checking and open calls, the integrity checking call should not increment the counters, but defer it to when the file is actually opened. This patch adds: - integrity (permission) checking for nfsd exported files in nfsd_permission(). - a call to increment counts for files opened by nfsd. This patch has been updated to return the nfs error types. Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/nfsd/vfs.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6c68ffd..81ff0f4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -55,6 +55,7 @@ #include #endif /* CONFIG_NFSD_V4 */ #include +#include #include @@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, cred); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); + else + ima_counts_get(*filp); out_nfserr: err = nfserrno(host_err); out: @@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { struct inode *inode = dentry->d_inode; + struct path path; int err; if (acc == NFSD_MAY_NOP) @@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) err = inode_permission(inode, MAY_EXEC); + if (err) + goto nfsd_out; + /* Do integrity (permission) checking now, but defer incrementing + * IMA counts to the actual file open. + */ + path.mnt = exp->ex_path.mnt; + path.dentry = dentry; + err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_LEAVE); +nfsd_out: return err? nfserrno(err) : 0; } -- cgit v1.1 From 07119a4df8c8c77d888f2f46964ea9512ea84ff8 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: have cifs_NTtimeToUnix take a little-endian arg ...and just have the function call le64_to_cpu. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 +- fs/cifs/file.c | 2 +- fs/cifs/inode.c | 15 ++++++--------- fs/cifs/netmisc.c | 12 ++++++------ fs/cifs/readdir.c | 12 ++++++------ 5 files changed, 20 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index fae0839..8831f64 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -90,7 +90,7 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16, struct cifsTconInfo *); extern void DeleteOplockQEntry(struct oplock_q_entry *); extern void DeleteTconOplockQEntries(struct cifsTconInfo *); -extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601); +extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 302ea15..0686684 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* BB need same check in cifs_create too? */ /* if not oplocked, invalidate inode pages if mtime or file size changed */ - temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime)); + temp = cifs_NTtimeToUnix(buf->LastWriteTime); if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) && (file->f_path.dentry->d_inode->i_size == (loff_t)le64_to_cpu(buf->EndOfFile))) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9c869a6..42d6e0f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode, __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes); __u64 end_of_file = le64_to_cpu(info->EndOfFile); - inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime)); + inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime); inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime)); - inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange)); + cifs_NTtimeToUnix(info->LastModificationTime); + inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange); inode->i_mode = le64_to_cpu(info->Permissions); /* @@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode, /* Linux can not store file creation time so ignore it */ if (pfindData->LastAccessTime) - inode->i_atime = cifs_NTtimeToUnix - (le64_to_cpu(pfindData->LastAccessTime)); + inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime); else /* do not need to use current_fs_time - time not stored */ inode->i_atime = CURRENT_TIME; - inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); - inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime); + inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime); cFYI(DBG2, ("Attributes came in as 0x%x", attr)); if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) { inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index e2fe998..d3ba75e 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr) #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) - /* - * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) - * into Unix UTC (based 1970-01-01, in seconds). - */ +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ struct timespec -cifs_NTtimeToUnix(u64 ntutc) +cifs_NTtimeToUnix(__le64 ntutc) { struct timespec ts; /* BB what about the timezone? BB */ @@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc) /* Subtract the NTFS time offset, then convert to 1s intervals. */ u64 t; - t = ntutc - NTFS_TIME_OFFSET; + t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; ts.tv_nsec = do_div(t, 10000000) * 100; ts.tv_sec = t; return ts; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 964e097..79c46c2 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -150,11 +150,11 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, allocation_size = le64_to_cpu(pfindData->AllocationSize); end_of_file = le64_to_cpu(pfindData->EndOfFile); tmp_inode->i_atime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); + cifs_NTtimeToUnix(pfindData->LastAccessTime); tmp_inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); + cifs_NTtimeToUnix(pfindData->LastWriteTime); tmp_inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + cifs_NTtimeToUnix(pfindData->ChangeTime); } else { /* legacy, OS2 and DOS style */ /* struct timespec ts;*/ FIND_FILE_STANDARD_INFO *pfindData = @@ -331,11 +331,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode, local_size = tmp_inode->i_size; tmp_inode->i_atime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); + cifs_NTtimeToUnix(pfindData->LastAccessTime); tmp_inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime)); + cifs_NTtimeToUnix(pfindData->LastModificationTime); tmp_inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange)); + cifs_NTtimeToUnix(pfindData->LastStatusChange); tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions); /* since we set the inode type below we need to mask off type -- cgit v1.1 From c4a2c08db7d976c2e23a97da5d69ec7c9701034d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: make cnvrtDosUnixTm take a little-endian args and an offset The callers primarily end up converting the args from le anyway. Also, most of the callers end up needing to add an offset to the result. The exception to these rules is cnvrtDosCifsTm, but there are no callers of that function, so we might as well remove it. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 4 ++-- fs/cifs/cifssmb.c | 4 ++-- fs/cifs/netmisc.c | 12 ++++-------- fs/cifs/readdir.c | 32 ++++++++++---------------------- 4 files changed, 18 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8831f64..d542cf1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,8 +92,8 @@ extern void DeleteOplockQEntry(struct oplock_q_entry *); extern void DeleteTconOplockQEntries(struct cifsTconInfo *); extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); -extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); -extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, + int offset); extern int cifs_posix_open(char *full_path, struct inode **pinode, struct super_block *sb, int mode, int oflags, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index aece2a8..b84c61d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) int val, seconds, remain, result; struct timespec ts, utc; utc = CURRENT_TIME; - ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date), - le16_to_cpu(rsp->SrvTime.Time)); + ts = cnvrtDosUnixTm(rsp->SrvTime.Date, + rsp->SrvTime.Time, 0); cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d", (int)ts.tv_sec, (int)utc.tv_sec, (int)(utc.tv_sec - ts.tv_sec))); diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d3ba75e..32d6baa 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t) static int total_days_of_prev_months[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; - -__le64 cnvrtDosCifsTm(__u16 date, __u16 time) -{ - return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time))); -} - -struct timespec cnvrtDosUnixTm(__u16 date, __u16 time) +struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset) { struct timespec ts; int sec, min, days, month, year; + u16 date = le16_to_cpu(le_date); + u16 time = le16_to_cpu(le_time); SMB_TIME *st = (SMB_TIME *)&time; SMB_DATE *sd = (SMB_DATE *)&date; @@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time) days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0); sec += 24 * 60 * 60 * days; - ts.tv_sec = sec; + ts.tv_sec = sec + offset; /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 79c46c2..86d0055 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file, return rc; } -static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode) -{ - if ((tcon) && (tcon->ses) && (tcon->ses->server)) { - inode->i_ctime.tv_sec += tcon->ses->server->timeAdj; - inode->i_mtime.tv_sec += tcon->ses->server->timeAdj; - inode->i_atime.tv_sec += tcon->ses->server->timeAdj; - } - return; -} - - static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, char *buf, unsigned int *pobject_type, int isNewInode) { @@ -156,20 +145,19 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, tmp_inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime); } else { /* legacy, OS2 and DOS style */ -/* struct timespec ts;*/ + int offset = cifs_sb->tcon->ses->server->timeAdj; FIND_FILE_STANDARD_INFO *pfindData = (FIND_FILE_STANDARD_INFO *)buf; - tmp_inode->i_mtime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastWriteDate), - le16_to_cpu(pfindData->LastWriteTime)); - tmp_inode->i_atime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastAccessDate), - le16_to_cpu(pfindData->LastAccessTime)); - tmp_inode->i_ctime = cnvrtDosUnixTm( - le16_to_cpu(pfindData->LastWriteDate), - le16_to_cpu(pfindData->LastWriteTime)); - AdjustForTZ(cifs_sb->tcon, tmp_inode); + tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate, + pfindData->LastWriteTime, + offset); + tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate, + pfindData->LastAccessTime, + offset); + tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate, + pfindData->LastWriteTime, + offset); attr = le16_to_cpu(pfindData->Attributes); allocation_size = le32_to_cpu(pfindData->AllocationSize); end_of_file = le32_to_cpu(pfindData->DataSize); -- cgit v1.1 From bd433d4cf4d8593a5f1764776b91f1794fce5a77 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 09:37:34 -0400 Subject: cifs: rename cifs_iget to cifs_root_iget The current cifs_iget isn't suitable for anything but the root inode. Rename it with a more appropriate name. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/inode.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5e6d358..0a10a59 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data, #endif sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ - inode = cifs_iget(sb, ROOT_I); + inode = cifs_root_iget(sb, ROOT_I); if (IS_ERR(inode)) { rc = PTR_ERR(inode); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 051b71c..3b6a85c 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; -extern struct inode *cifs_iget(struct super_block *, unsigned long); +extern struct inode *cifs_root_iget(struct super_block *, unsigned long); extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 42d6e0f..84b7bea 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -696,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) } /* gets root inode */ -struct inode *cifs_iget(struct super_block *sb, unsigned long ino) +struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) { int xid; struct cifs_sb_info *cifs_sb; -- cgit v1.1 From a0c9217f64ee3cd1e534966da8c5f05768e1ab09 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 27 May 2009 15:40:47 -0400 Subject: cifs: make serverino the default when mounting Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f32c903..8ae563f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -835,6 +835,8 @@ cifs_parse_mount_options(char *options, const char *devname, vol->rw = true; /* default is always to request posix paths. */ vol->posix_paths = 1; + /* default to using server inode numbers where available */ + vol->server_ino = 1; if (!options) return 1; -- cgit v1.1 From c5077ec42303e07c2c685b0f6cb8eee0f2c7751c Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 28 May 2009 15:09:04 +0000 Subject: [CIFS] Update readme to indicate change to default mount (serverino) Signed-off-by: Steve French --- fs/cifs/CHANGES | 8 ++++++++ fs/cifs/README | 7 ++++++- fs/cifs/cifsfs.h | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f20c406..227c681 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,9 @@ +Version 1.59 +------------ +Client uses server inode numbers (which are persistent) rather than +client generated ones by default (mount option "serverino" turned +on by default if server supports it). + Version 1.58 ------------ Guard against buffer overruns in various UCS-2 to UTF-8 string conversions @@ -10,6 +16,8 @@ we converted from). Fix endianness of the vcnum field used during session setup to distinguish multiple mounts to same server from different userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental flag to be set to 2, and mount must enable krb5 to turn on extended security). +Performance of file create to Samba improved (posix create on lookup +removes 1 of 2 network requests sent on file create) Version 1.57 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index db208ddb..6d1608f 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -388,8 +388,13 @@ A partial list of the supported mount options follows: or the CIFS Unix Extensions equivalent and for those this mount option will have no effect. Exporting cifs mounts under nfsd requires this mount option on the cifs mount. + This is now the default if server supports the + required network operation. noserverino Client generates inode numbers (rather than using the actual one - from the server) by default. + from the server). These inode numbers will vary after + unmount or reboot which can confuse some applications, + but not all server filesystems support unique inode + numbers. setuids If the CIFS Unix extensions are negotiated with the server the client will attempt to set the effective uid and gid of the local process on newly created files, directories, and diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 3b6a85c..9570a0e 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.58" +#define CIFS_VERSION "1.59" #endif /* _CIFSFS_H */ -- cgit v1.1 From 1bf4072da67c14d6b02cfeef02212aa5a6211df2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: reorganize get_cifs_acl Thus spake Christoph: "But this whole set_cifs_acl function is a real mess anyway and needs some splitting up." With this change too, it's possible to call acl_to_uid_mode() with a NULL inode pointer. That (or something close to it) will eventually be necessary when cifs_get_inode_info is reorganized. Signed-off-by: Christoph Hellwig Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 100 +++++++++++++++++++++++++++------------------------- fs/cifs/cifsproto.h | 4 +-- fs/cifs/inode.c | 2 +- 3 files changed, 55 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 57ecdc8..7f8e6c4 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -552,67 +552,66 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, return rc; } - -/* Retrieve an ACL from the server */ -static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode, - const char *path, const __u16 *pfid) +static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, + __u16 fid, u32 *pacllen) { - struct cifsFileInfo *open_file = NULL; - bool unlock_file = false; - int xid; - int rc = -EIO; - __u16 fid; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; struct cifs_ntsd *pntsd = NULL; + int xid, rc; - cFYI(1, ("get mode from ACL for %s", path)); + xid = GetXid(); + rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); + FreeXid(xid); - if (inode == NULL) - return NULL; - xid = GetXid(); - if (pfid == NULL) - open_file = find_readable_file(CIFS_I(inode)); - else - fid = *pfid; + cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); + return pntsd; +} - sb = inode->i_sb; - if (sb == NULL) { - FreeXid(xid); - return NULL; - } - cifs_sb = CIFS_SB(sb); +static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, + const char *path, u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + int oplock = 0; + int xid, rc; + __u16 fid; - if (open_file) { - unlock_file = true; - fid = open_file->netfid; - } else if (pfid == NULL) { - int oplock = 0; - /* open file */ - rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, - READ_CONTROL, 0, &fid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cERROR(1, ("Unable to open file to get ACL")); - FreeXid(xid); - return NULL; - } + xid = GetXid(); + + rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cERROR(1, ("Unable to open file to get ACL")); + goto out; } rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen); cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen)); - if (unlock_file == true) /* find_readable_file increments ref count */ - atomic_dec(&open_file->wrtPending); - else if (pfid == NULL) /* if opened above we have to close the handle */ - CIFSSMBClose(xid, cifs_sb->tcon, fid); - /* else handle was passed in by caller */ + CIFSSMBClose(xid, cifs_sb->tcon, fid); + out: FreeXid(xid); return pntsd; } +/* Retrieve an ACL from the server */ +static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, + struct inode *inode, const char *path, + u32 *pacllen) +{ + struct cifs_ntsd *pntsd = NULL; + struct cifsFileInfo *open_file = NULL; + + if (inode) + open_file = find_readable_file(CIFS_I(inode)); + if (!open_file) + return get_cifs_acl_by_path(cifs_sb, path, pacllen); + + pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); + atomic_dec(&open_file->wrtPending); + return pntsd; +} + /* Set an ACL on the server */ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path) @@ -668,14 +667,19 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ -void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) +void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, + const char *path, const __u16 *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; int rc = 0; cFYI(DBG2, ("converting ACL to mode for %s", path)); - pntsd = get_cifs_acl(&acllen, inode, path, pfid); + + if (pfid) + pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen); + else + pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (pntsd) @@ -698,7 +702,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) cFYI(DBG2, ("set ACL from mode for %s", path)); /* Get the security descriptor */ - pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); + pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); /* Add three ACEs for owner, group, everyone getting rid of other ACEs as chmod disables ACEs and set the security descriptor */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index d542cf1..f945232 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode, extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, int xid); -extern void acl_to_uid_mode(struct inode *inode, const char *path, - const __u16 *pfid); +extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode, + const char *path, const __u16 *pfid); extern int mode_to_acl(struct inode *inode, const char *path, __u64); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 84b7bea..fad882b 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -626,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode, /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { cFYI(1, ("Getting mode bits from ACL")); - acl_to_uid_mode(inode, full_path, pfid); + acl_to_uid_mode(cifs_sb, inode, full_path, pfid); } #endif if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { -- cgit v1.1 From b96d31a62f714566fa6420851b3bb3615c796322 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 27 May 2009 09:37:33 -0400 Subject: cifs: clean up set_cifs_acl interfaces Signed-off-by: Christoph Hellwig Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsacl.c | 78 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 7f8e6c4..1403b5d 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -612,57 +612,61 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return pntsd; } -/* Set an ACL on the server */ -static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, - struct inode *inode, const char *path) +static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid, + struct cifs_ntsd *pnntsd, u32 acllen) { - struct cifsFileInfo *open_file; - bool unlock_file = false; - int xid; - int rc = -EIO; - __u16 fid; - struct super_block *sb; - struct cifs_sb_info *cifs_sb; + int xid, rc; - cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); + xid = GetXid(); + rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); + FreeXid(xid); - if (!inode) - return rc; + cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); + return rc; +} - sb = inode->i_sb; - if (sb == NULL) - return rc; +static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, + struct cifs_ntsd *pnntsd, u32 acllen) +{ + int oplock = 0; + int xid, rc; + __u16 fid; - cifs_sb = CIFS_SB(sb); xid = GetXid(); - open_file = find_readable_file(CIFS_I(inode)); - if (open_file) { - unlock_file = true; - fid = open_file->netfid; - } else { - int oplock = 0; - /* open file */ - rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, - WRITE_DAC, 0, &fid, &oplock, NULL, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != 0) { - cERROR(1, ("Unable to open file to set ACL")); - FreeXid(xid); - return rc; - } + rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0, + &fid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) { + cERROR(1, ("Unable to open file to set ACL")); + goto out; } rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen); cFYI(DBG2, ("SetCIFSACL rc = %d", rc)); - if (unlock_file) - atomic_dec(&open_file->wrtPending); - else - CIFSSMBClose(xid, cifs_sb->tcon, fid); + CIFSSMBClose(xid, cifs_sb->tcon, fid); + out: FreeXid(xid); + return rc; +} +/* Set an ACL on the server */ +static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, + struct inode *inode, const char *path) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *open_file; + int rc; + + cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode)); + + open_file = find_readable_file(CIFS_I(inode)); + if (!open_file) + return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); + + rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); + atomic_dec(&open_file->wrtPending); return rc; } -- cgit v1.1 From 086a377edc969aea6c761176a7e4ff68f264d6fe Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 7 May 2009 12:36:53 -0700 Subject: sysfs: file.c: use create_singlethread_workqueue() We don't need a kernel thread per CPU for this application. Acked-by: Alex Chiang Cc: Lai Jiangshan Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index b1606e0..561a9c0 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -723,7 +723,7 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), mutex_unlock(&sysfs_workq_mutex); if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_workqueue("sysfsd"); + sysfs_workqueue = create_singlethread_workqueue("sysfsd"); if (sysfs_workqueue == NULL) { module_put(owner); return -ENOMEM; -- cgit v1.1 From 81e2962801bbb4e740c501ca687d5cb857929c04 Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Thu, 28 May 2009 17:43:59 +0200 Subject: jffs2: Fix corruption when flash erase/write failure Erase errors such as: "Newly-erased block contained word 0xa4ef223e at offset 0x0296a014" and failure to write the clean marker, moves the offending erase block to erasing list before calling jffs2_erase_failed(). This is bad as jffs2_erase_failed() will also move the block to the bad_list, but is now moving the wrong block, causing FS corruption. Signed-off-by: Joakim Tjernlund Signed-off-by: David Woodhouse --- fs/jffs2/erase.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index c32b4a1..a024474 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -480,13 +480,6 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb return; filebad: - mutex_lock(&c->erase_free_sem); - spin_lock(&c->erase_completion_lock); - /* Stick it on a list (any list) so erase_failed can take it - right off again. Silly, but shouldn't happen often. */ - list_move(&jeb->list, &c->erasing_list); - spin_unlock(&c->erase_completion_lock); - mutex_unlock(&c->erase_free_sem); jffs2_erase_failed(c, jeb, bad_offset); return; -- cgit v1.1 From bd6daba909d8484bd2ccf6017db4028d7a420927 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 28 May 2009 14:34:21 -0700 Subject: procfs: make errno values consistent when open pident vs exit(2) race occurs proc_pident_instantiate() has following call flow. proc_pident_lookup() proc_pident_instantiate() proc_pid_make_inode() And, proc_pident_lookup() has following error handling. const struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); if (!task) goto out_no_task; Then, proc_pident_instantiate should return ENOENT too when racing against exit(2) occur. EINAL has two bad reason. - it implies caller is wrong. bad the race isn't caller's mistake. - man 2 open don't explain EINVAL. user often don't handle it. Note: Other proc_pid_make_inode() caller already use ENOENT properly. Acked-by: Eric W. Biederman Cc: Alexey Dobriyan Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index fb45615..3326bbf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1956,7 +1956,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir, const struct pid_entry *p = ptr; struct inode *inode; struct proc_inode *ei; - struct dentry *error = ERR_PTR(-EINVAL); + struct dentry *error = ERR_PTR(-ENOENT); inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) -- cgit v1.1 From c3dc5bec05a2ae03a72ef82e321d77fb549d951c Mon Sep 17 00:00:00 2001 From: Oskar Schirmer Date: Thu, 28 May 2009 14:34:31 -0700 Subject: flat: fix data sections alignment The flat loader uses an architecture's flat_stack_align() to align the stack but assumes word-alignment is enough for the data sections. However, on the Xtensa S6000 we have registers up to 128bit width which can be used from userspace and therefor need userspace stack and data-section alignment of at least this size. This patch drops flat_stack_align() and uses the same alignment that is required for slab caches, ARCH_SLAB_MINALIGN, or wordsize if it's not defined by the architecture. It also fixes m32r which was obviously kaput, aligning an uninitialized stack entry instead of the stack pointer. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Oskar Schirmer Cc: David Howells Cc: Russell King Cc: Bryan Wu Cc: Geert Uytterhoeven Acked-by: Paul Mundt Cc: Greg Ungerer Signed-off-by: Johannes Weiner Acked-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_flat.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5cebf0b..697f6b5 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -41,6 +41,7 @@ #include #include #include +#include /****************************************************************************/ @@ -54,6 +55,18 @@ #define DBG_FLT(a...) #endif +/* + * User data (stack, data section and bss) needs to be aligned + * for the same reasons as SLAB memory is, and to the same amount. + * Avoid duplicating architecture specific code by using the same + * macro as with SLAB allocation: + */ +#ifdef ARCH_SLAB_MINALIGN +#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) +#else +#define FLAT_DATA_ALIGN (sizeof(void *)) +#endif + #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ @@ -114,20 +127,18 @@ static unsigned long create_flat_tables( int envc = bprm->envc; char uninitialized_var(dummy); - sp = (unsigned long *) ((-(unsigned long)sizeof(char *))&(unsigned long) p); + sp = (unsigned long *)p; + sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); + argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); + envp = argv + (argc + 1); - sp -= envc+1; - envp = sp; - sp -= argc+1; - argv = sp; - - flat_stack_align(sp); if (flat_argvp_envp_on_stack()) { - --sp; put_user((unsigned long) envp, sp); - --sp; put_user((unsigned long) argv, sp); + put_user((unsigned long) envp, sp + 2); + put_user((unsigned long) argv, sp + 1); } - put_user(argc,--sp); + put_user(argc, sp); current->mm->arg_start = (unsigned long) p; while (argc-->0) { put_user((unsigned long) p, argv++); @@ -558,7 +569,9 @@ static int load_flat_file(struct linux_binprm * bprm, ret = realdatastart; goto err; } - datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n", (int)(data_len + bss_len + stack_len), (int)datapos); @@ -604,9 +617,12 @@ static int load_flat_file(struct linux_binprm * bprm, } realdatastart = textpos + ntohl(hdr->data_start); - datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); - reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + - MAX_SHARED_LIBS * sizeof(unsigned long)); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); + + reloc = (unsigned long *) + (datapos + (ntohl(hdr->reloc_start) - text_len)); memp = textpos; memp_size = len; #ifdef CONFIG_BINFMT_ZFLAT @@ -854,7 +870,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs) stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ - + stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ res = load_flat_file(bprm, &libinfo, 0, &stack_len); if (res > (unsigned long)-4096) -- cgit v1.1 From 62013ab5d5df297a01ae5863b5c26d758ec0af7f Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sat, 30 May 2009 21:50:58 +0900 Subject: nilfs2: fix bh leak in nilfs_cpfile_delete_checkpoints function The nilfs_cpfile_delete_checkpoints() wrongly skips brelse() for the header block of checkpoint file in case of errors. This fixes the leak bug. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/cpfile.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index e90b60d..300f1cd 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -311,7 +311,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh); if (ret < 0) { if (ret != -ENOENT) - goto out_sem; + goto out_header; /* skip hole */ ret = 0; continue; @@ -344,7 +344,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, continue; printk(KERN_ERR "%s: cannot delete block\n", __func__); - goto out_sem; + goto out_header; } } @@ -361,6 +361,8 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, nilfs_mdt_mark_dirty(cpfile); kunmap_atomic(kaddr, KM_USER0); } + + out_header: brelse(header_bh); out_sem: -- cgit v1.1 From 1f23920dbf1377fa9e4aef4f3d20c34a06a71a35 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Thu, 7 May 2009 19:49:45 -0500 Subject: xfs: fix double unlock in xfs_swap_extents() Regreesion from commit ef8f7fc, which rearranged the code in xfs_swap_extents() leading to double unlock of xfs inode ilock. That resulted in xfs_fsr deadlocking itself on platforms, which don't handle double unlock of rw_semaphore nicely. It caused the count go negative, which represents the write holder, without really having one. ia64 is one of the platforms where deadlock was easily reproduced and the fix was tested. Signed-off-by: Eric Sandeen Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dfrag.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index e6d839b..7465f9e 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -347,13 +347,15 @@ xfs_swap_extents( error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT); -out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); out: kmem_free(tempifp); return error; +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + out_trans_cancel: xfs_trans_cancel(tp, 0); goto out_unlock; -- cgit v1.1 From e6da7c9fed111ba1243297ee6eda8e24ae11c384 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 23 May 2009 14:30:12 -0500 Subject: xfs: fix overflow in xfs_growfs_data_private In the case where growing a filesystem would leave the last AG too small, the fixup code has an overflow in the calculation of the new size with one fewer ag, because "nagcount" is a 32 bit number. If the new filesystem has > 2^32 blocks in it this causes a problem resulting in an EINVAL return from growfs: # xfs_io -f -c "truncate 19998630180864" fsfile # mkfs.xfs -f -bsize=4096 -dagsize=76288719b,size=3905982455b fsfile # mount -o loop fsfile /mnt # xfs_growfs /mnt meta-data=/dev/loop0 isize=256 agcount=52, agsize=76288719 blks = sectsz=512 attr=2 data = bsize=4096 blocks=3905982455, imaxpct=5 = sunit=0 swidth=0 blks naming =version 2 bsize=4096 ascii-ci=0 log =internal bsize=4096 blocks=32768, version=2 = sectsz=512 sunit=0 blks, lazy-count=0 realtime =none extsz=4096 blocks=0, rtextents=0 xfs_growfs: XFS_IOC_FSGROWFSDATA xfsctl failed: Invalid argument Reported-by: richard.ems@cape-horn-eng.com Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_fsops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 8379e3b..cbd451b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -160,7 +160,7 @@ xfs_growfs_data_private( nagcount = new + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; - nb = nagcount * mp->m_sb.sb_agblocks; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return XFS_ERROR(EINVAL); } -- cgit v1.1 From 1b17d766463d51904cb242f194a780737e5f73ef Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Mon, 1 Jun 2009 13:13:24 -0500 Subject: xfs: prevent deadlock in xfs_qm_shake() It's possible to recurse into filesystem from the memory allocation, which deadlocks in xfs_qm_shake(). Add check for __GFP_FS, and bail out if it is not set. Signed-off-by: Felix Blyakher Signed-off-by: Hedi Berriche Reviewed-by: Christoph Hellwig Reviewed-by: Andi Kleen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/kmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index af6843c..179cbd6 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -103,7 +103,7 @@ extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); static inline int kmem_shake_allow(gfp_t gfp_mask) { - return (gfp_mask & __GFP_WAIT) != 0; + return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)); } #endif /* __XFS_SUPPORT_KMEM_H__ */ -- cgit v1.1 From a12af1ebe675e85831fde3c4d0908fc3b0908b7a Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Mon, 1 Jun 2009 12:30:03 -0500 Subject: GFS2: smbd proccess hangs with flock() call. GFS2 currently does not support mandatory flocks. An flock() call with LOCK_MAND triggers unexpected behavior because gfs2 is not checking for this lock type. This patch corrects that. Signed-off-by: Abhi Das Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 73b6f55..841ddc9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -698,8 +698,8 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (__mandatory_lock(&ip->i_inode)) - return -ENOLCK; + if (fl->fl_type & LOCK_MAND) + return -EOPNOTSUPP; if (fl->fl_type == F_UNLCK) { do_unflock(file, fl); -- cgit v1.1 From 50b64e3b77d569c217a48e078cd565dbd6462ad0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 2 Jun 2009 06:55:20 -0400 Subject: cifs: fix IPv6 address length check For IPv6 the userspace mount helper sends an address in the "ip=" option. This check fails if the length is > 35 characters. I have no idea where the magic 35 character limit came from, but it's clearly not enough for IPv6. Fix it by making it use the INET6_ADDRSTRLEN #define. While we're at it, use the same #define for the address length in SPNEGO upcalls. Reported-by: Charles R. Anderson Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_spnego.c | 6 ++---- fs/cifs/connect.c | 4 +++- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 67bf93a..4a4581c 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "cifsglob.h" #include "cifs_spnego.h" #include "cifs_debug.h" @@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = { * strlen(";sec=ntlmsspi") */ #define MAX_MECH_STR_LEN 13 -/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */ -#define MAX_IPV6_ADDR_LEN 43 - /* strlen of "host=" */ #define HOST_KEY_LEN 5 @@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) host=hostname sec=mechanism uid=0xFF user=username */ desc_len = MAX_VER_STR_LEN + HOST_KEY_LEN + strlen(hostname) + - IP_KEY_LEN + MAX_IPV6_ADDR_LEN + + IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + USER_KEY_LEN + strlen(sesInfo->userName) + 1; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8ae563f..74b5a87 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "cifspdu.h" #include "cifsglob.h" @@ -960,7 +961,8 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (strnicmp(data, "ip", 2) == 0) { if (!value || !*value) { vol->UNCip = NULL; - } else if (strnlen(value, 35) < 35) { + } else if (strnlen(value, INET6_ADDRSTRLEN) < + INET6_ADDRSTRLEN) { vol->UNCip = value; } else { printk(KERN_WARNING "CIFS: ip address " -- cgit v1.1 From e09f9446b94ac64b27d37e98c1110f29d712cdad Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Jun 2009 10:07:44 +0100 Subject: GFS2: Remove unused variable Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 841ddc9..73318a3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -694,8 +694,6 @@ static void do_unflock(struct file *file, struct file_lock *fl) static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { - struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; if (fl->fl_type & LOCK_MAND) -- cgit v1.1 From bfcd3555af478dbf04c87adc9bb1a739d0a6ccff Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Tue, 9 Jun 2009 00:06:20 -0400 Subject: jbd2: Fix minor typos in comments in fs/jbd2/journal.c Signed-off-by: Alberto Bertogli Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5814410..62be7d2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) * Journal abort has very specific semantics, which we describe * for journal abort. * - * Two internal function, which provide abort to te jbd layer + * Two internal functions, which provide abort to the jbd layer * itself are here. */ @@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno) * int jbd2_journal_errno () - returns the journal's error state. * @journal: journal to examine. * - * This is the errno numbet set with jbd2_journal_abort(), the last + * This is the errno number set with jbd2_journal_abort(), the last * time the journal was mounted - if the journal was stopped * without calling abort this will be 0. * @@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal) * int jbd2_journal_clear_err () - clears the journal's error state * @journal: journal to act on. * - * An error must be cleared or Acked to take a FS out of readonly + * An error must be cleared or acked to take a FS out of readonly * mode. */ int jbd2_journal_clear_err(journal_t *journal) @@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal) * void jbd2_journal_ack_err() - Ack journal err. * @journal: journal to act on. * - * An error must be cleared or Acked to take a FS out of readonly + * An error must be cleared or acked to take a FS out of readonly * mode. */ void jbd2_journal_ack_err(journal_t *journal) -- cgit v1.1 From 0b8e58a140cae2ba1c4a21ccae7c6c3c939c51f9 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 3 Jun 2009 17:59:28 -0400 Subject: ext4: super.c whitespace cleanup Cleanup of whitespace and formatting. Initially driven by confusing indents for the ext4_{block,inode}_bitmap() et. al. helper routines, but figured I'd cleanup some other 80-column wrapping and other indenting problems at the same time. Signed-off-by: Andreas Dilger Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 117 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 91b98b5..0a97b1a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, { return le32_to_cpu(bg->bg_block_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, @@ -87,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, { return le32_to_cpu(bg->bg_inode_bitmap_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0); } ext4_fsblk_t ext4_inode_table(struct super_block *sb, @@ -95,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, { return le32_to_cpu(bg->bg_inode_table_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); + (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } __u32 ext4_free_blks_count(struct super_block *sb, @@ -103,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb, { return le16_to_cpu(bg->bg_free_blocks_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); } __u32 ext4_free_inodes_count(struct super_block *sb, @@ -111,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb, { return le16_to_cpu(bg->bg_free_inodes_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); } __u32 ext4_used_dirs_count(struct super_block *sb, @@ -119,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb, { return le16_to_cpu(bg->bg_used_dirs_count_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); } __u32 ext4_itable_unused_count(struct super_block *sb, @@ -127,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb, { return le16_to_cpu(bg->bg_itable_unused_lo) | (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? - (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); } void ext4_block_bitmap_set(struct super_block *sb, @@ -207,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) journal = EXT4_SB(sb)->s_journal; if (journal) { if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, - "Detected aborted journal"); + ext4_abort(sb, __func__, "Detected aborted journal"); return ERR_PTR(-EROFS); } return jbd2_journal_start(journal, nblocks); @@ -436,7 +435,7 @@ void ext4_warning(struct super_block *sb, const char *function, } void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, - const char *function, const char *fmt, ...) + const char *function, const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { @@ -472,7 +471,6 @@ __acquires(bitlock) return; } - void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -638,7 +636,6 @@ static void ext4_put_super(struct super_block *sb) lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); - return; } static struct kmem_cache *ext4_inode_cachep; @@ -653,6 +650,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; + #ifdef CONFIG_EXT4_FS_POSIX_ACL ei->i_acl = EXT4_ACL_NOT_CACHED; ei->i_default_acl = EXT4_ACL_NOT_CACHED; @@ -673,6 +671,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_allocated_meta_blocks = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); + return &ei->vfs_inode; } @@ -879,12 +878,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noauto_da_alloc"); ext4_show_quota_options(seq, sb); + return 0; } - static struct inode *ext4_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) + u64 ino, u32 generation) { struct inode *inode; @@ -913,14 +912,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, } static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) + int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); } static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) + int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ext4_nfs_get_inode); @@ -932,7 +931,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, * which would prevent try_to_free_buffers() from freeing them, we must use * jbd2 layer's try_to_free_buffers() function to release them. */ -static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) { journal_t *journal = EXT4_SB(sb)->s_journal; @@ -1133,8 +1133,9 @@ static ext4_fsblk_t get_sb_block(void **data) if (!options || strncmp(options, "sb=", 3) != 0) return 1; /* Default location */ + options += 3; - /*todo: use simple_strtoll with >32bit ext4 */ + /* TODO: use simple_strtoll with >32bit ext4 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", @@ -1144,6 +1145,7 @@ static ext4_fsblk_t get_sb_block(void **data) if (*options == ',') options++; *data = (void *) options; + return sb_block; } @@ -1626,7 +1628,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, printk(KERN_WARNING "EXT4-fs warning: checktime reached, " "running e2fsck is recommended\n"); - if (!sbi->s_journal) + if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); @@ -1810,7 +1812,7 @@ static int ext4_check_descriptors(struct super_block *sb) } ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); - sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb)); return 1; } @@ -1926,6 +1928,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ } + /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk @@ -1976,19 +1979,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; loff_t upper_limit; - /* This is calculated to be the largest file size for a - * dense, bitmapped file such that the total number of - * sectors in the file, including data and all indirect blocks, - * does not exceed 2^48 -1 - * __u32 i_blocks_lo and _u16 i_blocks_high representing the - * total number of 512 bytes blocks of the file + /* This is calculated to be the largest file size for a dense, block + * mapped file such that the file's total number of 512-byte sectors, + * including data and all indirect blocks, does not exceed (2^48 - 1). + * + * __u32 i_blocks_lo and _u16 i_blocks_high represent the total + * number of 512-byte sectors of the file. */ if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * !has_huge_files or CONFIG_LBD is not enabled - * implies the inode i_block represent total blocks in - * 512 bytes 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LBD not enabled implies that + * the inode i_block field represents total file blocks in + * 2^32 512-byte sectors == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -2030,7 +2033,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) } static ext4_fsblk_t descriptor_loc(struct super_block *sb, - ext4_fsblk_t logical_sb_block, int nr) + ext4_fsblk_t logical_sb_block, int nr) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t bg, first_meta_bg; @@ -2044,6 +2047,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb, bg = sbi->s_desc_per_block * nr; if (ext4_bg_has_super(sb, bg)) has_super = 1; + return (has_super + ext4_group_first_block_no(sb, bg)); } @@ -2148,7 +2152,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, } static ssize_t sbi_ui_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) + struct ext4_sb_info *sbi, char *buf) { unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); @@ -2253,7 +2257,6 @@ static struct kobj_type ext4_ktype = { static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) - { struct buffer_head *bh; struct ext4_super_block *es = NULL; @@ -2379,7 +2382,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, NULL, 0)) goto failed_mount; @@ -2442,7 +2444,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sb->s_blocksize != blocksize) { - /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { printk(KERN_ERR "EXT4-fs: bad block size %d.\n", @@ -2489,6 +2490,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2); } + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) { if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || @@ -2501,10 +2503,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } else sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) goto cantfind_ext4; + sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; @@ -2515,6 +2519,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; @@ -2566,12 +2571,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - /* - * It makes no sense for the first data block to be beyond the end - * of the filesystem. - */ - if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - printk(KERN_WARNING "EXT4-fs: bad geometry: first data" + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" "block %u is beyond end of filesystem (%llu)\n", le32_to_cpu(es->s_first_data_block), ext4_blocks_count(es)); @@ -3082,6 +3087,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, EXT4_SB(sb)->journal_bdev = bdev; ext4_init_journal_params(sb, journal); return journal; + out_journal: jbd2_journal_destroy(journal); out_bdev: @@ -3116,7 +3122,6 @@ static int ext4_load_journal(struct super_block *sb, * crash? For recovery, we need to check in advance whether we * can get read-write access to the device. */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { printk(KERN_INFO "EXT4-fs: INFO: recovery " @@ -3234,7 +3239,6 @@ static int ext4_commit_super(struct super_block *sb, int sync) return error; } - /* * Have we just finished recovery? If so, and if we are mounting (or * remounting) the filesystem readonly, then we will end up with a @@ -3485,8 +3489,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) /* * Make sure the group descriptor checksums - * are sane. If they aren't, refuse to - * remount r/w. + * are sane. If they aren't, refuse to remount r/w. */ for (g = 0; g < sbi->s_groups_count; g++) { struct ext4_group_desc *gdp = @@ -3545,6 +3548,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) kfree(old_opts.s_qf_names[i]); #endif return 0; + restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; @@ -3628,11 +3632,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) le64_to_cpup((void *)es->s_uuid + sizeof(u64)); buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + return 0; } -/* Helper function for writing quotas on sync - we need to start transaction before quota file - * is locked for write. Otherwise the are possible deadlocks: +/* Helper function for writing quotas on sync - we need to start transaction + * before quota file is locked for write. Otherwise the are possible deadlocks: * Process 1 Process 2 * ext4_create() quota_sync() * jbd2_journal_start() write_dquot() @@ -3656,7 +3661,7 @@ static int ext4_write_dquot(struct dquot *dquot) inode = dquot_to_inode(dquot); handle = ext4_journal_start(inode, - EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); + EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit(dquot); @@ -3672,7 +3677,7 @@ static int ext4_acquire_dquot(struct dquot *dquot) handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); + EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_acquire(dquot); @@ -3688,7 +3693,7 @@ static int ext4_release_dquot(struct dquot *dquot) handle_t *handle; handle = ext4_journal_start(dquot_to_inode(dquot), - EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); + EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); @@ -3736,7 +3741,7 @@ static int ext4_write_info(struct super_block *sb, int type) static int ext4_quota_on_mount(struct super_block *sb, int type) { return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], - EXT4_SB(sb)->s_jquota_fmt, type); + EXT4_SB(sb)->s_jquota_fmt, type); } /* @@ -3907,10 +3912,10 @@ out: #endif -static int ext4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static int ext4_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } static struct file_system_type ext4_fs_type = { @@ -3922,14 +3927,14 @@ static struct file_system_type ext4_fs_type = { }; #ifdef CONFIG_EXT4DEV_COMPAT -static int ext4dev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data,struct vfsmount *mnt) { printk(KERN_WARNING "EXT4-fs: Update your userspace programs " "to mount using ext4\n"); printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " "will go away by 2.6.31\n"); - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } static struct file_system_type ext4dev_fs_type = { -- cgit v1.1 From 2cc3c559fb2fe8cecca82a517bc56e88b0c1effd Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 4 Jun 2009 09:23:50 -0400 Subject: Btrfs: set device->total_disk_bytes when adding new device It was not being properly initialized, and so the size saved to disk was not correct. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5f01dad..a6d35b0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1440,6 +1440,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->io_align = root->sectorsize; device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); + device->disk_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; -- cgit v1.1 From 44fb5511638938a2c37c895abc14df648ffc07e9 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 4 Jun 2009 15:34:51 -0400 Subject: Btrfs: Fix oops and use after free during space balancing The btrfs allocator uses list_for_each to walk the available block groups when searching for free blocks. It starts off with a hint to help find the best block group for a given allocation. The hint is resolved into a block group, but we don't properly check to make sure the block group we find isn't in the middle of being freed due to filesystem shrinking or balancing. If it is being freed, the list pointers in it are bogus and can't be trusted. But, the code happily goes along and uses them in the list_for_each loop, leading to all kinds of fun. The fix used here is to check to make sure the block group we find really is on the list before we use it. list_del_init is used when removing it from the list, so we can do a proper check. The allocation clustering code has a similar bug where it will trust the block group in the current free space cluster. If our allocation flags have changed (going from single spindle dup to raid1 for example) because the drives in the FS have changed, we're not allowed to use the old block group any more. The fix used here is to check the current cluster against the current allocation flags. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3e2c7c7..35af933 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2622,7 +2622,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start); if (block_group && block_group_bits(block_group, data)) { down_read(&space_info->groups_sem); - goto have_block_group; + if (list_empty(&block_group->list) || + block_group->ro) { + /* + * someone is removing this block group, + * we can't jump into the have_block_group + * target because our list pointers are not + * valid + */ + btrfs_put_block_group(block_group); + up_read(&space_info->groups_sem); + } else + goto have_block_group; } else if (block_group) { btrfs_put_block_group(block_group); } @@ -2656,6 +2667,13 @@ have_block_group: * people trying to start a new cluster */ spin_lock(&last_ptr->refill_lock); + if (last_ptr->block_group && + (last_ptr->block_group->ro || + !block_group_bits(last_ptr->block_group, data))) { + offset = 0; + goto refill_cluster; + } + offset = btrfs_alloc_from_cluster(block_group, last_ptr, num_bytes, search_start); if (offset) { @@ -2681,10 +2699,17 @@ have_block_group: last_ptr_loop = 1; search_start = block_group->key.objectid; + /* + * we know this block group is properly + * in the list because + * btrfs_remove_block_group, drops the + * cluster before it removes the block + * group from the list + */ goto have_block_group; } spin_unlock(&last_ptr->lock); - +refill_cluster: /* * this cluster didn't work out, free it and * start over @@ -5968,6 +5993,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, { struct btrfs_path *path; struct btrfs_block_group_cache *block_group; + struct btrfs_free_cluster *cluster; struct btrfs_key key; int ret; @@ -5979,6 +6005,21 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); + /* make sure this block group isn't part of an allocation cluster */ + cluster = &root->fs_info->data_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + + /* + * make sure this block group isn't part of a metadata + * allocation cluster + */ + cluster = &root->fs_info->meta_alloc_cluster; + spin_lock(&cluster->refill_lock); + btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&cluster->refill_lock); + path = btrfs_alloc_path(); BUG_ON(!path); @@ -5988,7 +6029,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&root->fs_info->block_group_cache_lock); btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); - list_del(&block_group->list); + /* + * we must use list_del_init so people can check to see if they + * are still on the list after taking the semaphore + */ + list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); spin_lock(&block_group->space_info->lock); -- cgit v1.1 From 172124e220f1854acc99ee394671781b8b5e2120 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Jun 2009 22:34:44 +0200 Subject: Revert "block: implement blkdev_readpages" This reverts commit db2dbb12dc47a50c7a4c5678f526014063e486f6. It apparently causes problems with partition table read-ahead on archs with large page sizes. Until that problem is diagnosed further, just drop the readpages support on block devices. Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index a29b4dc..2dfc6cd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -331,12 +331,6 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } -static int blkdev_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); -} - static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1405,7 +1399,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, - .readpages = blkdev_readpages, .writepage = blkdev_writepage, .sync_page = block_sync_page, .write_begin = blkdev_write_begin, -- cgit v1.1 From 03f5d8bcf094a5e3b501bd2ae1553656efa8d1be Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Jun 2009 00:17:05 -0400 Subject: ext4: Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle() Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle(). This seems to be a relict from some old days and setting disksize in this function does not make much sense. Currently it was set only by ext4_getblk(). Since the parameter has some effect only if create == 1, it is easy to check by grepping through the sources that the three callers which end up calling ext4_getblk() with create == 1 (ext4_append, ext4_quota_write, ext4_mkdir) do the right thing and set disksize themselves. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ++---- fs/ext4/extents.c | 9 --------- fs/ext4/inode.c | 23 +++-------------------- 3 files changed, 5 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4311cc8..59657ff 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -323,15 +323,13 @@ struct ext4_new_group_data { #define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) - /* Update the ext4_inode_info i_disksize field */ -#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 0x0004 /* Caller is from the delayed allocation writeout path, so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ -#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0008 +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* Call ext4_da_update_reserve_space() after successfully allocating the blocks */ -#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0010 +#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 /* diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d4e99e9..9c35a7b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2779,7 +2779,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, int err = 0, depth, ret, cache_type; unsigned int allocated = 0; struct ext4_allocation_request ar; - loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); ext_debug("blocks %u/%u requested for inode %u\n", @@ -2969,14 +2968,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) { - disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = disksize; - } - set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 14c00ff..17ed0d2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -933,11 +933,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, int indirect_blks; int blocks_to_boundary = 0; int depth; - struct ext4_inode_info *ei = EXT4_I(inode); int count = 0; ext4_fsblk_t first_block = 0; - loff_t disksize; - J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); @@ -1003,19 +1000,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, if (!err) err = ext4_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by i_data_sem. Don't forget to - * protect it if you're about to implement concurrent - * ext4_get_block() -bzzz - */ - if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { - disksize = ((loff_t) iblock + count) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > ei->i_disksize) - ei->i_disksize = disksize; - } - if (err) + else goto cleanup; set_buffer_new(bh_result); @@ -1321,7 +1306,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, { struct buffer_head dummy; int fatal = 0, err; - int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; + int flags = 0; J_ASSERT(handle != NULL || create == 0); @@ -2153,9 +2138,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) } /* - * Update on-disk size along with block allocation we don't - * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change - * within already allocated block -bzzz + * Update on-disk size along with block allocation. */ disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; if (disksize > i_size_read(mpd->inode)) -- cgit v1.1 From b31e15527a9bb71b6a11a425d17ce139a62f5af5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 4 Jun 2009 17:36:36 -0400 Subject: ext4: Change all super.c messages to print the device This patch changes ext4 super.c to include the device name with all warning/error messages, by using a new utility function ext4_msg. It's a rather large patch, but very mechanic. I left debug printks alone. This is a straightforward port of a patch which Andi Kleen did for ext3. Cc: Andi Kleen Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 + fs/ext4/super.c | 462 ++++++++++++++++++++++++++++---------------------------- 2 files changed, 235 insertions(+), 229 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 59657ff..cc7d5ed 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1401,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ext4_msg(struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, const char *, const char *, ...) __attribute__ ((format (printf, 4, 5))); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0a97b1a..c191d0f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -306,7 +306,7 @@ static void ext4_handle_error(struct super_block *sb) jbd2_journal_abort(journal, -EIO); } if (test_opt(sb, ERRORS_RO)) { - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } ext4_commit_super(sb, 1); @@ -399,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function, { va_list args; - printk(KERN_CRIT "ext4_abort called.\n"); - va_start(args, fmt); printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); vprintk(fmt, args); @@ -413,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function, if (sb->s_flags & MS_RDONLY) return; - printk(KERN_CRIT "Remounting filesystem read-only\n"); + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; @@ -421,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function, jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } +void ext4_msg (struct super_block * sb, const char *prefix, + const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + printk("%sEXT4-fs (%s): ", prefix, sb->s_id); + vprintk(fmt, args); + printk("\n"); + va_end(args); +} + void ext4_warning(struct super_block *sb, const char *function, const char *fmt, ...) { @@ -499,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) /* * Open the external journal device */ -static struct block_device *ext4_blkdev_get(dev_t dev) +static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; char b[BDEVNAME_SIZE]; @@ -510,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", + ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; } @@ -546,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) { struct list_head *l; - printk(KERN_ERR "sb orphan head is %d\n", - le32_to_cpu(sbi->s_es->s_last_orphan)); + ext4_msg(sb, KERN_ERR, "sb orphan head is %d", + le32_to_cpu(sbi->s_es->s_last_orphan)); printk(KERN_ERR "sb_info orphan list:\n"); list_for_each(l, &sbi->s_orphan) { @@ -678,8 +688,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static void ext4_destroy_inode(struct inode *inode) { if (!list_empty(&(EXT4_I(inode)->i_orphan))) { - printk("EXT4 Inode %p: orphan list check failed!\n", - EXT4_I(inode)); + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): orphan list check failed!", + inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), true); @@ -1239,8 +1250,7 @@ static int parse_options(char *options, struct super_block *sb, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk(KERN_ERR "EXT4 (no)user_xattr options " - "not supported\n"); + ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported"); break; #endif #ifdef CONFIG_EXT4_FS_POSIX_ACL @@ -1253,8 +1263,7 @@ static int parse_options(char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk(KERN_ERR "EXT4 (no)acl options " - "not supported\n"); + ext4_msg(sb, KERN_ERR, "(no)acl options not supported"); break; #endif case Opt_journal_update: @@ -1264,16 +1273,16 @@ static int parse_options(char *options, struct super_block *sb, user to specify an existing inode to be the journal file. */ if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); return 0; } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; case Opt_journal_dev: if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); + ext4_msg(sb, KERN_ERR, + "Cannot specify journal on remount"); return 0; } if (match_int(&args[0], &option)) @@ -1327,9 +1336,8 @@ static int parse_options(char *options, struct super_block *sb, if (is_remount) { if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS) != data_opt) { - printk(KERN_ERR - "EXT4-fs: cannot change data " - "mode on remount\n"); + ext4_msg(sb, KERN_ERR, + "Cannot change data mode on remount"); return 0; } } else { @@ -1359,31 +1367,31 @@ static int parse_options(char *options, struct super_block *sb, set_qf_name: if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { - printk(KERN_ERR - "EXT4-fs: Cannot change journaled " - "quota options when quota turned on.\n"); + ext4_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); return 0; } qname = match_strdup(&args[0]); if (!qname) { - printk(KERN_ERR - "EXT4-fs: not enough memory for " - "storing quotafile name.\n"); + ext4_msg(sb, KERN_ERR, + "Not enough memory for " + "storing quotafile name"); return 0; } if (sbi->s_qf_names[qtype] && strcmp(sbi->s_qf_names[qtype], qname)) { - printk(KERN_ERR - "EXT4-fs: %s quota file already " - "specified.\n", QTYPE2NAME(qtype)); + ext4_msg(sb, KERN_ERR, + "%s quota file already " + "specified", QTYPE2NAME(qtype)); kfree(qname); return 0; } sbi->s_qf_names[qtype] = qname; if (strchr(sbi->s_qf_names[qtype], '/')) { - printk(KERN_ERR - "EXT4-fs: quotafile must be on " - "filesystem root.\n"); + ext4_msg(sb, KERN_ERR, + "quotafile must be on " + "filesystem root"); kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; return 0; @@ -1398,9 +1406,9 @@ set_qf_name: clear_qf_name: if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { - printk(KERN_ERR "EXT4-fs: Cannot change " + ext4_msg(sb, KERN_ERR, "Cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on"); return 0; } /* @@ -1417,9 +1425,9 @@ clear_qf_name: set_qf_format: if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { - printk(KERN_ERR "EXT4-fs: Cannot change " + ext4_msg(sb, KERN_ERR, "Cannot change " "journaled quota options when " - "quota turned on.\n"); + "quota turned on"); return 0; } sbi->s_jquota_fmt = qfmt; @@ -1435,8 +1443,8 @@ set_qf_format: break; case Opt_noquota: if (sb_any_quota_loaded(sb)) { - printk(KERN_ERR "EXT4-fs: Cannot change quota " - "options when quota turned on.\n"); + ext4_msg(sb, KERN_ERR, "Cannot change quota " + "options when quota turned on"); return 0; } clear_opt(sbi->s_mount_opt, QUOTA); @@ -1447,8 +1455,8 @@ set_qf_format: case Opt_quota: case Opt_usrquota: case Opt_grpquota: - printk(KERN_ERR - "EXT4-fs: quota options not supported.\n"); + ext4_msg(sb, KERN_ERR, + "quota options not supported"); break; case Opt_usrjquota: case Opt_grpjquota: @@ -1456,9 +1464,8 @@ set_qf_format: case Opt_offgrpjquota: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: - printk(KERN_ERR - "EXT4-fs: journaled quota options not " - "supported.\n"); + ext4_msg(sb, KERN_ERR, + "journaled quota options not supported"); break; case Opt_noquota: break; @@ -1483,8 +1490,9 @@ set_qf_format: break; case Opt_resize: if (!is_remount) { - printk("EXT4-fs: resize option only available " - "for remount\n"); + ext4_msg(sb, KERN_ERR, + "resize option only available " + "for remount"); return 0; } if (match_int(&args[0], &option) != 0) @@ -1526,8 +1534,9 @@ set_qf_format: if (option < 0 || option > (1 << 30)) return 0; if (!is_power_of_2(option)) { - printk(KERN_ERR "EXT4-fs: inode_readahead_blks" - " must be a power of 2\n"); + ext4_msg(sb, KERN_ERR, + "EXT4-fs: inode_readahead_blks" + " must be a power of 2"); return 0; } sbi->s_inode_readahead_blks = option; @@ -1554,9 +1563,9 @@ set_qf_format: set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); break; default: - printk(KERN_ERR - "EXT4-fs: Unrecognized mount option \"%s\" " - "or missing value\n", p); + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); return 0; } } @@ -1574,21 +1583,21 @@ set_qf_format: (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) || (sbi->s_qf_names[GRPQUOTA] && (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) { - printk(KERN_ERR "EXT4-fs: old and new quota " - "format mixing.\n"); + ext4_msg(sb, KERN_ERR, "old and new quota " + "format mixing"); return 0; } if (!sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journaled quota format " - "not specified.\n"); + ext4_msg(sb, KERN_ERR, "journaled quota format " + "not specified"); return 0; } } else { if (sbi->s_jquota_fmt) { - printk(KERN_ERR "EXT4-fs: journaled quota format " + ext4_msg(sb, KERN_ERR, "journaled quota format " "specified with no journaling " - "enabled.\n"); + "enabled"); return 0; } } @@ -1603,31 +1612,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, int res = 0; if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { - printk(KERN_ERR "EXT4-fs warning: revision level too high, " - "forcing read-only mode\n"); + ext4_msg(sb, KERN_ERR, "revision level too high, " + "forcing read-only mode"); res = MS_RDONLY; } if (read_only) return res; if (!(sbi->s_mount_state & EXT4_VALID_FS)) - printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, " + "running e2fsck is recommended"); else if ((sbi->s_mount_state & EXT4_ERROR_FS)) - printk(KERN_WARNING - "EXT4-fs warning: mounting fs with errors, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: mounting fs with errors, " + "running e2fsck is recommended"); else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && le16_to_cpu(es->s_mnt_count) >= (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) - printk(KERN_WARNING - "EXT4-fs warning: maximal mount count reached, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: maximal mount count reached, " + "running e2fsck is recommended"); else if (le32_to_cpu(es->s_checkinterval) && (le32_to_cpu(es->s_lastcheck) + le32_to_cpu(es->s_checkinterval) <= get_seconds())) - printk(KERN_WARNING - "EXT4-fs warning: checktime reached, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "warning: checktime reached, " + "running e2fsck is recommended"); if (!sbi->s_journal) es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) @@ -1649,11 +1658,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, sbi->s_mount_opt); if (EXT4_SB(sb)->s_journal) { - printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", - sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + ext4_msg(sb, KERN_INFO, "%s journal on %s", + EXT4_SB(sb)->s_journal->j_inode ? "internal" : "external", EXT4_SB(sb)->s_journal->j_devname); } else { - printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); + ext4_msg(sb, KERN_INFO, "no journal"); } return res; } @@ -1688,8 +1697,8 @@ static int ext4_fill_flex_info(struct super_block *sb) memset(sbi->s_flex_groups, 0, size); } if (sbi->s_flex_groups == NULL) { - printk(KERN_ERR "EXT4-fs: not enough memory for " - "%u flex groups\n", flex_group_count); + ext4_msg(sb, KERN_ERR, "not enough memory for " + "%u flex groups", flex_group_count); goto failed; } @@ -1775,32 +1784,32 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Block bitmap for group %u not in group " - "(block %llu)!\n", i, block_bitmap); + "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode bitmap for group %u not in group " - "(block %llu)!\n", i, inode_bitmap); + "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " "Inode table for group %u not in group " - "(block %llu)!\n", i, inode_table); + "(block %llu)!", i, inode_table); return 0; } ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { - printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Checksum for group %u failed (%u!=%u)\n", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, - gdp)), le16_to_cpu(gdp->bg_checksum)); + ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: " + "Checksum for group %u failed (%u!=%u)", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { ext4_unlock_group(sb, i); return 0; @@ -1847,8 +1856,8 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (bdev_read_only(sb->s_bdev)) { - printk(KERN_ERR "EXT4-fs: write access " - "unavailable, skipping orphan cleanup.\n"); + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); return; } @@ -1862,8 +1871,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } if (s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n", - sb->s_id); + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); sb->s_flags &= ~MS_RDONLY; } #ifdef CONFIG_QUOTA @@ -1874,9 +1882,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (ret < 0) - printk(KERN_ERR - "EXT4-fs: Cannot turn on journaled " - "quota: error %d\n", ret); + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); } } #endif @@ -1893,16 +1901,16 @@ static void ext4_orphan_cleanup(struct super_block *sb, list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); vfs_dq_init(inode); if (inode->i_nlink) { - printk(KERN_DEBUG - "%s: truncating inode %lu to %lld bytes\n", + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); ext4_truncate(inode); nr_truncates++; } else { - printk(KERN_DEBUG - "%s: deleting unreferenced inode %lu\n", + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); @@ -1914,11 +1922,11 @@ static void ext4_orphan_cleanup(struct super_block *sb, #define PLURAL(x) (x), ((x) == 1) ? "" : "s" if (nr_orphans) - printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", - sb->s_id, PLURAL(nr_orphans)); + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); if (nr_truncates) - printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n", - sb->s_id, PLURAL(nr_truncates)); + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { @@ -2307,7 +2315,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { - printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); + ext4_msg(sb, KERN_ERR, "unable to set blocksize"); goto out_fail; } @@ -2323,7 +2331,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (!(bh = sb_bread(sb, logical_sb_block))) { - printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); + ext4_msg(sb, KERN_ERR, "unable to read superblock"); goto out_fail; } /* @@ -2393,9 +2401,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) || EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) || EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U))) - printk(KERN_WARNING - "EXT4-fs warning: feature flags set on rev 0 fs, " - "running e2fsck is recommended\n"); + ext4_msg(sb, KERN_WARNING, + "feature flags set on rev 0 fs, " + "running e2fsck is recommended"); /* * Check feature flags regardless of the revision level, since we @@ -2404,16 +2412,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); if (features) { - printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", sb->s_id, + ext4_msg(sb, KERN_ERR, + "Couldn't mount because of " + "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ~EXT4_FEATURE_INCOMPAT_SUPP)); goto failed_mount; } features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { - printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", sb->s_id, + ext4_msg(sb, KERN_ERR, + "Couldn't mount RDWR because of " + "unsupported optional features (%x)", (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); goto failed_mount; @@ -2427,9 +2437,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ if (sizeof(root->i_blocks) < sizeof(u64) && !(sb->s_flags & MS_RDONLY)) { - printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge " + ext4_msg(sb, KERN_ERR, "Filesystem with huge " "files cannot be mounted read-write " - "without CONFIG_LBD.\n", sb->s_id); + "without CONFIG_LBD"); goto failed_mount; } } @@ -2437,16 +2447,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (blocksize < EXT4_MIN_BLOCK_SIZE || blocksize > EXT4_MAX_BLOCK_SIZE) { - printk(KERN_ERR - "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n", - blocksize, sb->s_id); + ext4_msg(sb, KERN_ERR, + "Unsupported filesystem blocksize %d", blocksize); goto failed_mount; } if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { - printk(KERN_ERR "EXT4-fs: bad block size %d.\n", + ext4_msg(sb, KERN_ERR, "bad block size %d", blocksize); goto failed_mount; } @@ -2456,15 +2465,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) offset = do_div(logical_sb_block, blocksize); bh = sb_bread(sb, logical_sb_block); if (!bh) { - printk(KERN_ERR - "EXT4-fs: Can't read superblock on 2nd try.\n"); + ext4_msg(sb, KERN_ERR, + "Can't read superblock on 2nd try"); goto failed_mount; } es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); sbi->s_es = es; if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { - printk(KERN_ERR - "EXT4-fs: Magic mismatch, very weird !\n"); + ext4_msg(sb, KERN_ERR, + "Magic mismatch, very weird!"); goto failed_mount; } } @@ -2482,8 +2491,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || (!is_power_of_2(sbi->s_inode_size)) || (sbi->s_inode_size > blocksize)) { - printk(KERN_ERR - "EXT4-fs: unsupported inode size: %d\n", + ext4_msg(sb, KERN_ERR, + "unsupported inode size: %d", sbi->s_inode_size); goto failed_mount; } @@ -2496,8 +2505,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || sbi->s_desc_size > EXT4_MAX_DESC_SIZE || !is_power_of_2(sbi->s_desc_size)) { - printk(KERN_ERR - "EXT4-fs: unsupported descriptor size %lu\n", + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", sbi->s_desc_size); goto failed_mount; } @@ -2537,25 +2546,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } if (sbi->s_blocks_per_group > blocksize * 8) { - printk(KERN_ERR - "EXT4-fs: #blocks per group too big: %lu\n", + ext4_msg(sb, KERN_ERR, + "#blocks per group too big: %lu", sbi->s_blocks_per_group); goto failed_mount; } if (sbi->s_inodes_per_group > blocksize * 8) { - printk(KERN_ERR - "EXT4-fs: #inodes per group too big: %lu\n", + ext4_msg(sb, KERN_ERR, + "#inodes per group too big: %lu", sbi->s_inodes_per_group); goto failed_mount; } if (ext4_blocks_count(es) > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { - printk(KERN_ERR "EXT4-fs: filesystem on %s:" - " too large to mount safely\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely"); if (sizeof(sector_t) < 8) - printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not " - "enabled\n"); + ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled"); goto failed_mount; } @@ -2565,8 +2573,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* check blocks count against device size */ blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; if (blocks_count && ext4_blocks_count(es) > blocks_count) { - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu " - "exceeds size of device (%llu blocks)\n", + ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " + "exceeds size of device (%llu blocks)", ext4_blocks_count(es), blocks_count); goto failed_mount; } @@ -2576,10 +2584,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * of the filesystem. */ if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { - printk(KERN_WARNING "EXT4-fs: bad geometry: first data" - "block %u is beyond end of filesystem (%llu)\n", - le32_to_cpu(es->s_first_data_block), - ext4_blocks_count(es)); + ext4_msg(sb, KERN_WARNING, "bad geometry: first data" + "block %u is beyond end of filesystem (%llu)", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); goto failed_mount; } blocks_count = (ext4_blocks_count(es) - @@ -2587,9 +2595,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { - printk(KERN_WARNING "EXT4-fs: groups count too large: %u " + ext4_msg(sb, KERN_WARNING, "groups count too large: %u " "(block count %llu, first data block %u, " - "blocks per group %lu)\n", sbi->s_groups_count, + "blocks per group %lu)", sbi->s_groups_count, ext4_blocks_count(es), le32_to_cpu(es->s_first_data_block), EXT4_BLOCKS_PER_GROUP(sb)); @@ -2601,7 +2609,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { - printk(KERN_ERR "EXT4-fs: not enough memory\n"); + ext4_msg(sb, KERN_ERR, "not enough memory"); goto failed_mount; } @@ -2616,21 +2624,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread(sb, block); if (!sbi->s_group_desc[i]) { - printk(KERN_ERR "EXT4-fs: " - "can't read group descriptor %d\n", i); + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); db_count = i; goto failed_mount2; } } if (!ext4_check_descriptors(sb)) { - printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); goto failed_mount2; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { - printk(KERN_ERR - "EXT4-fs: unable to initialize " - "flex_bg meta info!\n"); + ext4_msg(sb, KERN_ERR, + "unable to initialize " + "flex_bg meta info!"); goto failed_mount2; } @@ -2652,7 +2660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); } if (err) { - printk(KERN_ERR "EXT4-fs: insufficient memory\n"); + ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount3; } @@ -2692,13 +2700,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount3; if (!(sb->s_flags & MS_RDONLY) && EXT4_SB(sb)->s_journal->j_failed_commit) { - printk(KERN_CRIT "EXT4-fs error (device %s): " + ext4_msg(sb, KERN_CRIT, "error: " "ext4_fill_super: Journal transaction " - "%u is corrupt\n", sb->s_id, + "%u is corrupt", EXT4_SB(sb)->s_journal->j_failed_commit); if (test_opt(sb, ERRORS_RO)) { - printk(KERN_CRIT - "Mounting filesystem read-only\n"); + ext4_msg(sb, KERN_CRIT, + "Mounting filesystem read-only"); sb->s_flags |= MS_RDONLY; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); @@ -2712,8 +2720,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { - printk(KERN_ERR "EXT4-fs: required journal recovery " - "suppressed and not mounted read-only\n"); + ext4_msg(sb, KERN_ERR, "required journal recovery " + "suppressed and not mounted read-only"); goto failed_mount4; } else { clear_opt(sbi->s_mount_opt, DATA_FLAGS); @@ -2726,7 +2734,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (ext4_blocks_count(es) > 0xffffffffULL && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { - printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); + ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature"); goto failed_mount4; } @@ -2764,8 +2772,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) case EXT4_MOUNT_WRITEBACK_DATA: if (!jbd2_journal_check_available_features (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) { - printk(KERN_ERR "EXT4-fs: Journal does not support " - "requested data journaling mode\n"); + ext4_msg(sb, KERN_ERR, "Journal does not support " + "requested data journaling mode"); goto failed_mount4; } default: @@ -2777,8 +2785,8 @@ no_journal: if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { - printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - " - "its supported only with writeback mode\n"); + ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " + "its supported only with writeback mode"); clear_opt(sbi->s_mount_opt, NOBH); } } @@ -2789,18 +2797,18 @@ no_journal: root = ext4_iget(sb, EXT4_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "EXT4-fs: get root inode failed\n"); + ext4_msg(sb, KERN_ERR, "get root inode failed"); ret = PTR_ERR(root); goto failed_mount4; } if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { iput(root); - printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n"); + ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); goto failed_mount4; } sb->s_root = d_alloc_root(root); if (!sb->s_root) { - printk(KERN_ERR "EXT4-fs: get root dentry failed\n"); + ext4_msg(sb, KERN_ERR, "get root dentry failed"); iput(root); ret = -ENOMEM; goto failed_mount4; @@ -2829,29 +2837,29 @@ no_journal: sbi->s_inode_size) { sbi->s_want_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; - printk(KERN_INFO "EXT4-fs: required extra inode space not" - "available.\n"); + ext4_msg(sb, KERN_INFO, "required extra inode space not" + "available"); } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); + ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - " + "requested data journaling mode"); clear_opt(sbi->s_mount_opt, DELALLOC); } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + ext4_msg(sb, KERN_INFO, "delayed allocation enabled"); err = ext4_setup_system_zone(sb); if (err) { - printk(KERN_ERR "EXT4-fs: failed to initialize system " - "zone (%d)\n", err); + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)\n", err); goto failed_mount4; } ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { - printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", - err); + ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)", + err); goto failed_mount4; } @@ -2869,7 +2877,7 @@ no_journal: ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; if (needs_recovery) { - printk(KERN_INFO "EXT4-fs: recovery complete.\n"); + ext4_msg(sb, KERN_INFO, "recovery complete"); ext4_mark_recovery_complete(sb, es); } if (EXT4_SB(sb)->s_journal) { @@ -2882,20 +2890,18 @@ no_journal: } else descr = "out journal"; - printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", - sb->s_id, descr); + ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); lock_kernel(); return 0; cantfind_ext4: if (!silent) - printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n", - sb->s_id); + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; failed_mount4: - printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "mount failed"); ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); @@ -2973,27 +2979,27 @@ static journal_t *ext4_get_journal(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum); if (IS_ERR(journal_inode)) { - printk(KERN_ERR "EXT4-fs: no journal found.\n"); + ext4_msg(sb, KERN_ERR, "no journal found"); return NULL; } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); - printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n"); + ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return NULL; } jbd_debug(2, "Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { - printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); + ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; } journal = jbd2_journal_init_inode(journal_inode); if (!journal) { - printk(KERN_ERR "EXT4-fs: Could not load journal inode\n"); + ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return NULL; } @@ -3017,13 +3023,13 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); - bdev = ext4_blkdev_get(j_dev); + bdev = ext4_blkdev_get(j_dev, sb); if (bdev == NULL) return NULL; if (bd_claim(bdev, sb)) { - printk(KERN_ERR - "EXT4-fs: failed to claim external journal device.\n"); + ext4_msg(sb, KERN_ERR, + "failed to claim external journal device"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -3031,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, blocksize = sb->s_blocksize; hblock = bdev_hardsect_size(bdev); if (blocksize < hblock) { - printk(KERN_ERR - "EXT4-fs: blocksize too small for journal device.\n"); + ext4_msg(sb, KERN_ERR, + "blocksize too small for journal device"); goto out_bdev; } @@ -3040,8 +3046,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); if (!(bh = __bread(bdev, sb_block, blocksize))) { - printk(KERN_ERR "EXT4-fs: couldn't read superblock of " - "external journal\n"); + ext4_msg(sb, KERN_ERR, "couldn't read superblock of " + "external journal"); goto out_bdev; } @@ -3049,14 +3055,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - printk(KERN_ERR "EXT4-fs: external journal has " - "bad superblock\n"); + ext4_msg(sb, KERN_ERR, "external journal has " + "bad superblock"); brelse(bh); goto out_bdev; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { - printk(KERN_ERR "EXT4-fs: journal UUID does not match\n"); + ext4_msg(sb, KERN_ERR, "journal UUID does not match"); brelse(bh); goto out_bdev; } @@ -3068,19 +3074,19 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, journal = jbd2_journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); if (!journal) { - printk(KERN_ERR "EXT4-fs: failed to create device journal\n"); + ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } journal->j_private = sb; ll_rw_block(READ, 1, &journal->j_sb_buffer); wait_on_buffer(journal->j_sb_buffer); if (!buffer_uptodate(journal->j_sb_buffer)) { - printk(KERN_ERR "EXT4-fs: I/O error on journal device\n"); + ext4_msg(sb, KERN_ERR, "I/O error on journal device"); goto out_journal; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { - printk(KERN_ERR "EXT4-fs: External journal has more than one " - "user (unsupported) - %d\n", + ext4_msg(sb, KERN_ERR, "External journal has more than one " + "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } @@ -3109,8 +3115,8 @@ static int ext4_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { - printk(KERN_INFO "EXT4-fs: external journal device major/minor " - "numbers have changed\n"); + ext4_msg(sb, KERN_INFO, "external journal device major/minor " + "numbers have changed"); journal_dev = new_decode_dev(journal_devnum); } else journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev)); @@ -3124,21 +3130,21 @@ static int ext4_load_journal(struct super_block *sb, */ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { if (sb->s_flags & MS_RDONLY) { - printk(KERN_INFO "EXT4-fs: INFO: recovery " - "required on readonly filesystem.\n"); + ext4_msg(sb, KERN_INFO, "INFO: recovery " + "required on readonly filesystem"); if (really_read_only) { - printk(KERN_ERR "EXT4-fs: write access " - "unavailable, cannot proceed.\n"); + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, cannot proceed"); return -EROFS; } - printk(KERN_INFO "EXT4-fs: write access will " - "be enabled during recovery.\n"); + ext4_msg(sb, KERN_INFO, "write access will " + "be enabled during recovery"); } } if (journal_inum && journal_dev) { - printk(KERN_ERR "EXT4-fs: filesystem has both journal " - "and inode journals!\n"); + ext4_msg(sb, KERN_ERR, "filesystem has both journal " + "and inode journals!"); return -EINVAL; } @@ -3151,14 +3157,14 @@ static int ext4_load_journal(struct super_block *sb, } if (journal->j_flags & JBD2_BARRIER) - printk(KERN_INFO "EXT4-fs: barriers enabled\n"); + ext4_msg(sb, KERN_INFO, "barriers enabled"); else - printk(KERN_INFO "EXT4-fs: barriers disabled\n"); + ext4_msg(sb, KERN_INFO, "barriers disabled"); if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = jbd2_journal_update_format(journal); if (err) { - printk(KERN_ERR "EXT4-fs: error updating journal.\n"); + ext4_msg(sb, KERN_ERR, "error updating journal"); jbd2_journal_destroy(journal); return err; } @@ -3170,7 +3176,7 @@ static int ext4_load_journal(struct super_block *sb, err = jbd2_journal_load(journal); if (err) { - printk(KERN_ERR "EXT4-fs: error loading journal.\n"); + ext4_msg(sb, KERN_ERR, "error loading journal"); jbd2_journal_destroy(journal); return err; } @@ -3206,8 +3212,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) * be remapped. Nothing we can do but to retry the * write and hope for the best. */ - printk(KERN_ERR "EXT4-fs: previous I/O error to " - "superblock detected for %s.\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "previous I/O error to " + "superblock detected"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } @@ -3230,8 +3236,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) error = buffer_write_io_error(sbh); if (error) { - printk(KERN_ERR "EXT4-fs: I/O error while writing " - "superblock for %s.\n", sb->s_id); + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } @@ -3478,9 +3484,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP))) { - printk(KERN_WARNING "EXT4-fs: %s: couldn't " + ext4_msg(sb, KERN_WARNING, "couldn't " "remount RDWR because of unsupported " - "optional features (%x).\n", sb->s_id, + "optional features (%x)", (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & ~EXT4_FEATURE_RO_COMPAT_SUPP)); err = -EROFS; @@ -3496,9 +3502,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_get_group_desc(sb, g, NULL); if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { - printk(KERN_ERR - "EXT4-fs: ext4_remount: " - "Checksum for group %u failed (%u!=%u)\n", + ext4_msg(sb, KERN_ERR, + "ext4_remount: Checksum for group %u failed (%u!=%u)", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EINVAL; @@ -3512,11 +3517,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * require a full umount/remount for now. */ if (es->s_last_orphan) { - printk(KERN_WARNING "EXT4-fs: %s: couldn't " + ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " - "umount/remount instead.\n", - sb->s_id); + "umount/remount instead"); err = -EINVAL; goto restore_opts; } @@ -3772,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (EXT4_SB(sb)->s_qf_names[type]) { /* Quotafile not in fs root? */ if (path.dentry->d_parent != sb->s_root) - printk(KERN_WARNING - "EXT4-fs: Quota file not on filesystem root. " - "Journaled quota will not work.\n"); + ext4_msg(sb, KERN_WARNING, + "Quota file not on filesystem root. " + "Journaled quota will not work"); } /* @@ -3857,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, handle_t *handle = journal_current_handle(); if (EXT4_SB(sb)->s_journal && !handle) { - printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" - " cancelled because transaction is not started.\n", + ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" + " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } @@ -3930,10 +3934,10 @@ static struct file_system_type ext4_fs_type = { static int ext4dev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data,struct vfsmount *mnt) { - printk(KERN_WARNING "EXT4-fs: Update your userspace programs " - "to mount using ext4\n"); - printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " - "will go away by 2.6.31\n"); + printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs " + "to mount using ext4\n", dev_name); + printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility " + "will go away by 2.6.31\n", dev_name); return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); } -- cgit v1.1 From 1938a150c25bf7c2c47182e753a1038945b70b0e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 5 Jun 2009 01:00:26 -0400 Subject: ext4: Avoid leaking blocks after a block allocation failure We should add inode to the orphan list in the same transaction as block allocation. This ensures that if we crash after a failed block allocation and before we do a vmtruncate we don't leak block (ie block marked as used in bitmap but not claimed by the inode). Signed-off-by: Aneesh Kumar K.V CC: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 17ed0d2..8d21588 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1459,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + int ret, needed_blocks; handle_t *handle; int retries = 0; struct page *page; @@ -1470,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, "dev %s ino %lu pos %llu len %u flags %u", inode->i_sb->s_id, inode->i_ino, (unsigned long long) pos, len, flags); + /* + * Reserve one block more for addition to orphan list in case + * we allocate blocks but write fails for some reason + */ + needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1503,15 +1508,30 @@ retry: if (ret) { unlock_page(page); - ext4_journal_stop(handle); page_cache_release(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_mutex. + * + * Add inode to orphan list in case we crash before + * truncate finishes */ if (pos + len > inode->i_size) + ext4_orphan_add(handle, inode); + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might + * still be on the orphan list; we need to + * make sure the inode is removed from the + * orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.1 From f8514083cd61daef12fba5ef883ad9352c450428 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 5 Jun 2009 00:56:49 -0400 Subject: ext4: truncate the file properly if we fail to copy data from userspace In generic_perform_write if we fail to copy the user data we don't update the inode->i_size. We should truncate the file in the above case so that we don't have blocks allocated outside inode->i_size. Add the inode to orphan list in the same transaction as block allocation This ensures that if we crash in between the recovery would do the truncate. Signed-off-by: Aneesh Kumar K.V CC: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 128 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 102 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d21588..2c10d34 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1549,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) return ext4_handle_dirty_metadata(handle, NULL, bh); } +static int ext4_generic_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i_size_changed = 0; + struct inode *inode = mapping->host; + handle_t *handle = ext4_journal_current_handle(); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + * + * But it's important to update i_size while still holding page lock: + * page writeout could otherwise come in and zero beyond i_size. + */ + if (pos + copied > inode->i_size) { + i_size_write(inode, pos + copied); + i_size_changed = 1; + } + + if (pos + copied > EXT4_I(inode)->i_disksize) { + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_update_i_disksize(inode, (pos + copied)); + i_size_changed = 1; + } + unlock_page(page); + page_cache_release(page); + + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (i_size_changed) + ext4_mark_inode_dirty(handle, inode); + + return copied; +} + /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). @@ -1572,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file, ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { - loff_t new_i_size; - - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ext4_mark_inode_dirty(handle, inode); - } - - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; + if (pos + len > inode->i_size) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); if (ret2 < 0) ret = ret2; } @@ -1594,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file, if (!ret) ret = ret2; + if (pos + len > inode->i_size) { + vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + + return ret ? ret : copied; } @@ -1605,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file, handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; int ret = 0, ret2; - loff_t new_i_size; trace_mark(ext4_writeback_write_end, "dev %s ino %lu pos %llu len %u copied %u", inode->i_sb->s_id, inode->i_ino, (unsigned long long) pos, len, copied); - new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ext4_mark_inode_dirty(handle, inode); - } - - ret2 = generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; + if (pos + len > inode->i_size) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + if (ret2 < 0) ret = ret2; @@ -1631,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file, if (!ret) ret = ret2; + if (pos + len > inode->i_size) { + vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } @@ -1675,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file, } unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + /* if we have allocated more blocks and copied + * less. We will have blocks allocated outside + * inode->i_size. So truncate them + */ + ext4_orphan_add(handle, inode); + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; - page_cache_release(page); + if (pos + len > inode->i_size) { + vmtruncate(inode, inode->i_size); + /* + * If vmtruncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } return ret ? ret : copied; } -- cgit v1.1 From f6d03139d745198b434f65a28aabed524f415a4c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 5 Jun 2009 07:18:57 +0100 Subject: GFS2: Fix locking issue mounting gfs2meta fs This patch uses sget() to get a reference to the existing gfs2 sb when mouting the gfs2meta filesystem (in fact thats just another mount of the gfs2 filesystem with a different root and this interface is for backward compatibility). Signed-off-by: Steven Whitehouse Reported-by: Benjamin Marzinski Tested-by: Benjamin Marzinski Cc: Christoph Hellwig --- fs/gfs2/ops_fstype.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 2cd1164..9da161c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1273,9 +1273,20 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); } +static int test_meta_super(struct super_block *s, void *ptr) +{ + struct block_device *bdev = ptr; + return (bdev == s->s_bdev); +} + +static int set_meta_super(struct super_block *s, void *ptr) +{ + return -EINVAL; +} + static struct super_block *get_gfs2_sb(const char *dev_name) { - struct super_block *sb; + struct super_block *s; struct path path; int error; @@ -1283,30 +1294,27 @@ static struct super_block *get_gfs2_sb(const char *dev_name) if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); - return NULL; + return ERR_PTR(-ENOENT); } - sb = path.dentry->d_inode->i_sb; - if (sb && (sb->s_type == &gfs2_fs_type)) - atomic_inc(&sb->s_active); - else - sb = NULL; + s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, + path.dentry->d_inode->i_sb->s_bdev); path_put(&path); - return sb; + return s; } static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - struct super_block *sb = NULL; + struct super_block *s; struct gfs2_sbd *sdp; - sb = get_gfs2_sb(dev_name); - if (!sb) { + s = get_gfs2_sb(dev_name); + if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - return -ENOENT; + return PTR_ERR(s); } - sdp = sb->s_fs_info; - mnt->mnt_sb = sb; + sdp = s->s_fs_info; + mnt->mnt_sb = s; mnt->mnt_root = dget(sdp->sd_master_dir); return 0; } -- cgit v1.1 From 460bcf57b128ce1c0dd553d905fedc097f9955c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 07:37:56 -0400 Subject: Fix nobh_truncate_page() to not pass stack garbage to get_block() The nobh_truncate_page() function is used by ext2, exofs, and jfs. Of these three, only ext2 and jfs's get_block() function pays attention to bh->b_size --- which is normally always the filesystem blocksize except when the get_block() function is called by either mpage_readpage(), mpage_readpages(), or the direct I/O routines in fs/direct_io.c. Unfortunately, nobh_truncate_page() does not initialize map_bh before calling the filesystem-supplied get_block() function. So ext2 and jfs will try to calculate the number of blocks to map by taking stack garbage and shifting it left by inode->i_blkbits. This should be *mostly* harmless (except the filesystem will do some unnneeded work) unless the stack garbage is less than filesystem's blocksize, in which case maxblocks will be zero, and the attempt to find out whether or not the filesystem has a hole at a given logical block will fail, and the page cache entry might not get zero'ed out. Also if the stack garbage in in map_bh->state happens to have the BH_Mapped bit set, there could be an attempt to call readpage() on a non-existent page, which could cause nobh_truncate_page() to return an error when it should not. Fix this by initializing map_bh->state and map_bh->size. Fortunately, it's probably fairly unlikely that ext2 and jfs users mount with nobh these days. Signed-off-by: "Theodore Ts'o" Cc: Dave Kleikamp Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index aed2977..4910612 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2736,6 +2736,8 @@ has_buffers: pos += blocksize; } + map_bh.b_size = blocksize; + map_bh.b_state = 0; err = get_block(inode, iblock, &map_bh, 0); if (err) goto unlock; -- cgit v1.1 From 72a43d63cb51057393edfbcfc4596066205ad15d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 13 May 2009 19:13:40 +0100 Subject: ext3/4 with synchronous writes gets wedged by Postfix OK, that's probably the easiest way to do that, as much as I don't like it... Since iget() et.al. will not accept I_FREEING (will wait to go away and restart), and since we'd better have serialization between new/free on fs data structures anyway, we can afford simply skipping I_FREEING et.al. in insert_inode_locked(). We do that from new_inode, so it won't race with free_inode in any interesting ways and it won't race with iget (of any origin; nfsd or in case of fs corruption a lookup) since both still will wait for I_LOCK. Reviewed-by: "Theodore Ts'o" Acked-by: Jan Kara Tested-by: David Watson Signed-off-by: Al Viro --- fs/inode.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 0571983..a4876e5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1053,13 +1053,22 @@ int insert_inode_locked(struct inode *inode) struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; struct hlist_head *head = inode_hashtable + hash(sb, ino); - struct inode *old; inode->i_state |= I_LOCK|I_NEW; while (1) { + struct hlist_node *node; + struct inode *old = NULL; spin_lock(&inode_lock); - old = find_inode_fast(sb, head, ino); - if (likely(!old)) { + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_ino != ino) + continue; + if (old->i_sb != sb) + continue; + if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + continue; + break; + } + if (likely(!node)) { hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); return 0; @@ -1081,14 +1090,24 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, { struct super_block *sb = inode->i_sb; struct hlist_head *head = inode_hashtable + hash(sb, hashval); - struct inode *old; inode->i_state |= I_LOCK|I_NEW; while (1) { + struct hlist_node *node; + struct inode *old = NULL; + spin_lock(&inode_lock); - old = find_inode(sb, head, test, data); - if (likely(!old)) { + hlist_for_each_entry(old, node, head, i_hash) { + if (old->i_sb != sb) + continue; + if (!test(old, data)) + continue; + if (old->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) + continue; + break; + } + if (likely(!node)) { hlist_add_head(&inode->i_hash, head); spin_unlock(&inode_lock); return 0; -- cgit v1.1 From 4ae1507f6d266d0cc3dd36e474d83aad70fec9e4 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Sun, 24 May 2009 18:45:15 -0400 Subject: cifs: make overriding of ownership conditional on new mount options We have a bit of a problem with the uid= option. The basic issue is that it means too many things and has too many side-effects. It's possible to allow an unprivileged user to mount a filesystem if the user owns the mountpoint, /bin/mount is setuid root, and the mount is set up in /etc/fstab with the "user" option. When doing this though, /bin/mount automatically adds the "uid=" and "gid=" options to the share. This is fortunate since the correct uid= option is needed in order to tell the upcall what user's credcache to use when generating the SPNEGO blob. On a mount without unix extensions this is fine -- you generally will want the files to be owned by the "owner" of the mount. The problem comes in on a mount with unix extensions. With those enabled, the uid/gid options cause the ownership of files to be overriden even though the server is sending along the ownership info. This means that it's not possible to have a mount by an unprivileged user that shows the server's file ownership info. The result is also inode permissions that have no reflection at all on the server. You simply cannot separate ownership from the mode in this fashion. This behavior also makes MultiuserMount option less usable. Once you pass in the uid= option for a mount, then you can't use unix ownership info and allow someone to share the mount. While I'm not thrilled with it, the only solution I can see is to stop making uid=/gid= force the overriding of ownership on mounts, and to add new mount options that turn this behavior on. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 74b5a87..10151f8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1096,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname, return 1; } } else if (strnicmp(data, "uid", 3) == 0) { - if (value && *value) { + if (value && *value) vol->linux_uid = simple_strtoul(value, &value, 0); + } else if (strnicmp(data, "forceuid", 8) == 0) { vol->override_uid = 1; - } } else if (strnicmp(data, "gid", 3) == 0) { - if (value && *value) { + if (value && *value) vol->linux_gid = simple_strtoul(value, &value, 0); + } else if (strnicmp(data, "forcegid", 8) == 0) { vol->override_gid = 1; - } } else if (strnicmp(data, "file_mode", 4) == 0) { if (value && *value) { vol->file_mode = -- cgit v1.1 From f0472d0ec89bef2ea4432828c3daa1b26ef569aa Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 6 Jun 2009 21:09:39 +0000 Subject: [CIFS] Add mention of new mount parm (forceuid) to cifs readme Also update fs/cifs/CHANGES Signed-off-by: Steve French --- fs/cifs/CHANGES | 5 ++++- fs/cifs/README | 9 ++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 227c681..b486898 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -2,7 +2,10 @@ Version 1.59 ------------ Client uses server inode numbers (which are persistent) rather than client generated ones by default (mount option "serverino" turned -on by default if server supports it). +on by default if server supports it). Add forceuid and forcegid +mount options (so that when negotiating unix extensions specifying +which uid mounted does not immediately force the server's reported +uids to be overridden). Version 1.58 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 6d1608f..ad92921 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -262,7 +262,8 @@ A partial list of the supported mount options follows: mount. domain Set the SMB/CIFS workgroup name prepended to the username during CIFS session establishment - uid Set the default uid for inodes. For mounts to servers + forceuid Set the default uid for inodes based on the uid + passed in. For mounts to servers which do support the CIFS Unix extensions, such as a properly configured Samba server, the server provides the uid, gid and mode so this parameter should not be @@ -292,6 +293,12 @@ A partial list of the supported mount options follows: the client. Note that the mount.cifs helper must be at version 1.10 or higher to support specifying the uid (or gid) in non-numeric form. + forcegid (similar to above but for the groupid instead of uid) + uid Set the default uid for inodes, and indicate to the + cifs kernel driver which local user mounted . If the server + supports the unix extensions the default uid is + not used to fill in the owner fields of inodes (files) + unless the "forceuid" parameter is specified. gid Set the default gid for inodes (similar to above). file_mode If CIFS Unix extensions are not supported by the server this overrides the default mode for file inodes. -- cgit v1.1 From f07502dae230a2c3b65381fd1b06e8a18b2c7525 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 6 Jun 2009 21:18:09 +0100 Subject: integrity: fix IMA inode leak CONFIG_IMA=y inode activity leaks iint_cache and radix_tree_node objects until the system runs out of memory. Nowhere is calling ima_inode_free() a.k.a. ima_iint_delete(). Fix that by calling it from destroy_inode(). Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- fs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index a4876e5..bca0c61 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -219,6 +219,7 @@ static struct inode *alloc_inode(struct super_block *sb) void destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + ima_inode_free(inode); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); -- cgit v1.1 From 9aee2286071c23c535fe9928eec1a26e0bcf256d Mon Sep 17 00:00:00 2001 From: Toshiyuki Okajima Date: Mon, 8 Jun 2009 12:41:35 -0400 Subject: ext4: fix dx_map_entry to support 256k directory blocks The dx_map_entry structure doesn't support over 64KB block size by current usage of its member("offs"). Because "offs" treats an offset of copies of the ext4_dir_entry_2 structure as is. This member size is 16 bits. But real offset for over 64KB(256KB) block size needs 18 bits. However, real offset keeps 4 byte boundary, so lower 2 bits is not used. Therefore, we do the following to fix this limitation: For "store": we divide the real offset by 4 and then store this result to "offs" member. For "use": we multiply "offs" member by 4 and then use this result as real offset. Signed-off-by: Toshiyuki Okajima Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f2bc160..07eb664 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -749,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, ext4fs_dirhash(de->name, de->name_len, &h); map_tail--; map_tail->hash = h.hash; - map_tail->offs = (u16) ((char *) de - base); + map_tail->offs = ((char *) de - base)>>2; map_tail->size = le16_to_cpu(de->rec_len); count++; cond_resched(); @@ -1147,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, unsigned rec_len = 0; while (count--) { - struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs); + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); rec_len = EXT4_DIR_REC_LEN(de->name_len); memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = -- cgit v1.1 From 0eab928221bac8895a0b494a16a8810002bd8645 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 9 Jun 2009 09:54:40 -0400 Subject: ext4: Don't treat a truncation of a zero-length file as replace-via-truncate If a non-existent file is opened via O_WRONLY|O_CREAT|O_TRUNC, there's no need to treat this as a true file truncation, so we shouldn't activate the replace-via-truncate hueristic. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2c10d34..875db94 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4127,7 +4127,8 @@ void ext4_truncate(struct inode *inode) if (!ext4_can_truncate(inode)) return; - if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + if (ei->i_disksize && inode->i_size == 0 && + !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { -- cgit v1.1 From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the deivce from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace do the convertion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- fs/bio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 9871164..740699c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -26,10 +26,9 @@ #include #include #include -#include #include /* for struct sg_iovec */ -DEFINE_TRACE(block_split); +#include /* * Test patch to inline a certain number of bi_io_vec's inside the bio -- cgit v1.1 From 463aea1a1c49f1a7d4b50656dfd6c8bb33358b1b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 9 Jun 2009 16:26:24 -0700 Subject: autofs4: remove hashed check in validate_wait() The recent ->lookup() deadlock correction required the directory inode mutex to be dropped while waiting for expire completion. We were concerned about side effects from this change and one has been identified. I saw several error messages. They cause autofs to become quite confused and don't really point to the actual problem. Things like: handle_packet_missing_direct:1376: can't find map entry for (43,1827932) which is usually totally fatal (although in this case it wouldn't be except that I treat is as such because it normally is). do_mount_direct: direct trigger not valid or already mounted /test/nested/g3c/s1/ss1 which is recoverable, however if this problem is at play it can cause autofs to become quite confused as to the dependencies in the mount tree because mount triggers end up mounted multiple times. It's hard to accurately check for this over mounting case and automount shouldn't need to if the kernel module is doing its job. There was one other message, similar in consequence of this last one but I can't locate a log example just now. When checking if a mount has already completed prior to adding a new mount request to the wait queue we check if the dentry is hashed and, if so, if it is a mount point. But, if a mount successfully completed while we slept on the wait queue mutex the dentry must exist for the mount to have completed so the test is not really needed. Mounts can also be done on top of a global root dentry, so for the above case, where a mount request completes and the wait queue entry has already been removed, the hashed test returning false can cause an incorrect callback to the daemon. Also, d_mountpoint() is not sufficient to check if a mount has completed for the multi-mount case when we don't have a real mount at the base of the tree. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/waitq.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index eeb2468..2341375 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -297,20 +297,14 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { /* - * If the dentry isn't hashed just go ahead and try the - * mount again with a new wait (not much else we can do). - */ - if (!d_unhashed(dentry)) { - /* - * But if the dentry is hashed, that means that we - * got here through the revalidate path. Thus, we - * need to check if the dentry has been mounted - * while we waited on the wq_mutex. If it has, - * simply return success. - */ - if (d_mountpoint(dentry)) - return 0; - } + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at it's base) we can + * continue on and create a new request. + */ + if (have_submounts(dentry)) + return 0; } return 1; -- cgit v1.1 From a61d90d75d0f9e86432c45b496b4b0fbf0fd03dc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 9 Jun 2009 16:26:26 -0700 Subject: jbd: fix race in buffer processing in commit code In commit code, we scan buffers attached to a transaction. During this scan, we sometimes have to drop j_list_lock and then we recheck whether the journal buffer head didn't get freed by journal_try_to_free_buffers(). But checking for buffer_jbd(bh) isn't enough because a new journal head could get attached to our buffer head. So add a check whether the journal head remained the same and whether it's still at the same transaction and list. This is a nasty bug and can cause problems like memory corruption (use after free) or trigger various assertions in JBD code (observed). Signed-off-by: Jan Kara Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/commit.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 06560c5..618e21c 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -241,7 +241,7 @@ write_out_data: spin_lock(&journal->j_list_lock); } /* Someone already cleaned up the buffer? */ - if (!buffer_jbd(bh) + if (!buffer_jbd(bh) || bh2jh(bh) != jh || jh->b_transaction != commit_transaction || jh->b_jlist != BJ_SyncData) { jbd_unlock_bh_state(bh); @@ -478,7 +478,9 @@ void journal_commit_transaction(journal_t *journal) spin_lock(&journal->j_list_lock); continue; } - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { + if (buffer_jbd(bh) && bh2jh(bh) == jh && + jh->b_transaction == commit_transaction && + jh->b_jlist == BJ_Locked) { __journal_unfile_buffer(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); -- cgit v1.1 From 40bc9a27e00d6c8c7e4dc2865c02d7402a950472 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 10 Jun 2009 09:09:40 +0100 Subject: GFS2: Fix cache coherency between truncate and O_DIRECT read If a page was partially zeroed as the result of a truncate, then it was not being correctly marked dirty. This resulted in the deleted data reappearing if the file was read back via direct I/O. Reported-by: Eric Sandeen Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1153a07..3297635 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1012,7 +1012,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) gfs2_trans_add_bh(ip->i_gl, bh, 0); zero_user(page, offset, length); - + mark_buffer_dirty(bh); unlock: unlock_page(page); page_cache_release(page); -- cgit v1.1 From 003dec8913d6bebb4ecc989ec04a235cf38f5ea9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 10 Jun 2009 10:31:45 +0100 Subject: GFS2: Merge gfs2_get_sb into gfs2_get_sb_meta These don't need to be separate functions. Reported-by: Christoph Hellwig Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_fstype.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9da161c..f234aba 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1284,9 +1284,11 @@ static int set_meta_super(struct super_block *s, void *ptr) return -EINVAL; } -static struct super_block *get_gfs2_sb(const char *dev_name) +static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { struct super_block *s; + struct gfs2_sbd *sdp; struct path path; int error; @@ -1294,21 +1296,11 @@ static struct super_block *get_gfs2_sb(const char *dev_name) if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); - return ERR_PTR(-ENOENT); + return error; } s = sget(&gfs2_fs_type, test_meta_super, set_meta_super, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); - return s; -} - -static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) -{ - struct super_block *s; - struct gfs2_sbd *sdp; - - s = get_gfs2_sb(dev_name); if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); return PTR_ERR(s); -- cgit v1.1 From 5c939df56c3ea018b58e5aa76181284c2053d699 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Wed, 27 May 2009 09:16:03 -0400 Subject: btrfs: Fix set/clear_extent_bit for 'end == (u64)-1' There are some 'start = state->end + 1;' like code in set_extent_bit and clear_extent_bit. They overflow when end == (u64)-1. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fe9eb99..6826018 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *prealloc = NULL; struct rb_node *node; + u64 last_end; int err; int set = 0; @@ -498,6 +499,7 @@ again: if (state->start > end) goto out; WARN_ON(state->end < start); + last_end = state->end; /* * | ---- desired range ---- | @@ -524,9 +526,11 @@ again: if (err) goto out; if (state->end <= end) { - start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } else { start = state->start; } @@ -552,8 +556,10 @@ again: goto out; } - start = state->end + 1; set |= clear_state_bit(tree, state, bits, wake, delete); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; goto search_again; out: @@ -707,8 +713,10 @@ again: goto out; } set_state_bits(tree, state, bits); - start = state->end + 1; merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; goto search_again; } @@ -742,8 +750,10 @@ again: goto out; if (state->end <= end) { set_state_bits(tree, state, bits); - start = state->end + 1; merge_state(tree, state); + if (last_end == (u64)-1) + goto out; + start = last_end + 1; } else { start = state->start; } -- cgit v1.1 From 5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Wed, 10 Jun 2009 10:45:14 -0400 Subject: Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE) This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/Makefile | 4 +- fs/btrfs/btrfs_inode.h | 3 + fs/btrfs/ctree.c | 685 +++++---- fs/btrfs/ctree.h | 308 ++-- fs/btrfs/delayed-ref.c | 509 +++++-- fs/btrfs/delayed-ref.h | 85 +- fs/btrfs/disk-io.c | 95 +- fs/btrfs/export.c | 4 +- fs/btrfs/extent-tree.c | 2674 ++++++++++++++++++++++------------ fs/btrfs/file.c | 76 +- fs/btrfs/inode.c | 132 +- fs/btrfs/ioctl.c | 21 +- fs/btrfs/print-tree.c | 155 +- fs/btrfs/relocation.c | 3711 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/root-tree.c | 17 +- fs/btrfs/super.c | 26 +- fs/btrfs/transaction.c | 410 ++---- fs/btrfs/transaction.h | 12 +- fs/btrfs/tree-log.c | 102 +- fs/btrfs/volumes.c | 2 - 20 files changed, 6958 insertions(+), 2073 deletions(-) create mode 100644 fs/btrfs/relocation.c (limited to 'fs') diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9421284..a35eb36 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ - compression.o delayed-ref.o + export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o delayed-ref.o relocation.o diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b30986f..ecf5f7d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -72,6 +72,9 @@ struct btrfs_inode { */ struct list_head ordered_operations; + /* node for the red-black tree that links inodes in subvolume root */ + struct rb_node rb_node; + /* the space_info for where this inode's data allocations are done */ struct btrfs_space_info *space_info; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index fedf8b9..2b96027 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, u32 nritems; int ret = 0; int level; - struct btrfs_root *new_root; - - new_root = kmalloc(sizeof(*new_root), GFP_NOFS); - if (!new_root) - return -ENOMEM; - - memcpy(new_root, root, sizeof(*new_root)); - new_root->root_key.objectid = new_root_objectid; + struct btrfs_disk_key disk_key; WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); @@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, level = btrfs_header_level(buf); nritems = btrfs_header_nritems(buf); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, - new_root_objectid, trans->transid, - level, buf->start, 0); - if (IS_ERR(cow)) { - kfree(new_root); + cow = btrfs_alloc_free_block(trans, root, buf->len, 0, + new_root_objectid, &disk_key, level, + buf->start, 0); + if (IS_ERR(cow)) return PTR_ERR(cow); - } copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_owner(cow, new_root_objectid); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, new_root_objectid); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); WARN_ON(btrfs_header_generation(buf) > trans->transid); - ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); - kfree(new_root); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); if (ret) return ret; @@ -244,6 +246,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, } /* + * check if the tree block can be shared by multiple trees + */ +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf) +{ + /* + * Tree blocks not in refernece counted trees and tree roots + * are never shared. If a block was allocated after the last + * snapshot and the block was not allocated by tree relocation, + * we know the block is not shared. + */ + if (root->ref_cows && + buf != root->node && buf != root->commit_root && + (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item) || + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 1; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (root->ref_cows && + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + return 1; +#endif + return 0; +} + +static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *cow) +{ + u64 refs; + u64 owner; + u64 flags; + u64 new_flags = 0; + int ret; + + /* + * Backrefs update rules: + * + * Always use full backrefs for extent pointers in tree block + * allocated by tree relocation. + * + * If a shared tree block is no longer referenced by its owner + * tree (btrfs_header_owner(buf) == root->root_key.objectid), + * use full backrefs for extent pointers in tree block. + * + * If a tree block is been relocating + * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID), + * use full backrefs for extent pointers in tree block. + * The reason for this is some operations (such as drop tree) + * are only allowed for blocks use full backrefs. + */ + + if (btrfs_block_can_be_shared(root, buf)) { + ret = btrfs_lookup_extent_info(trans, root, buf->start, + buf->len, &refs, &flags); + BUG_ON(ret); + BUG_ON(refs == 0); + } else { + refs = 1; + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; + else + flags = 0; + } + + owner = btrfs_header_owner(buf); + BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + + if (refs > 1) { + if ((owner == root->root_key.objectid || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && + !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + ret = btrfs_inc_ref(trans, root, buf, 1); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_dec_ref(trans, root, buf, 0); + BUG_ON(ret); + ret = btrfs_inc_ref(trans, root, cow, 1); + BUG_ON(ret); + } + new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else { + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + } + if (new_flags != 0) { + ret = btrfs_set_disk_extent_flags(trans, root, + buf->start, + buf->len, + new_flags, 0); + BUG_ON(ret); + } + } else { + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_inc_ref(trans, root, cow, 1); + else + ret = btrfs_inc_ref(trans, root, cow, 0); + BUG_ON(ret); + ret = btrfs_dec_ref(trans, root, buf, 1); + BUG_ON(ret); + } + clean_tree_block(trans, root, buf); + } + return 0; +} + +/* * does the dirty work in cow of a single block. The parent block (if * supplied) is updated to point to the new cow copy. The new buffer is marked * dirty and returned locked. If you modify the block it needs to be marked @@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size) { - u64 parent_start; + struct btrfs_disk_key disk_key; struct extent_buffer *cow; - u32 nritems; - int ret = 0; int level; int unlock_orig = 0; + u64 parent_start; if (*cow_ret == buf) unlock_orig = 1; btrfs_assert_tree_locked(buf); - if (parent) - parent_start = parent->start; - else - parent_start = 0; - WARN_ON(root->ref_cows && trans->transid != root->fs_info->running_transaction->transid); WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); - cow = btrfs_alloc_free_block(trans, root, buf->len, - parent_start, root->root_key.objectid, - trans->transid, level, - search_start, empty_size); + if (level == 0) + btrfs_item_key(buf, &disk_key, 0); + else + btrfs_node_key(buf, &disk_key, 0); + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + else + parent_start = 0; + } else + parent_start = 0; + + cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, + root->root_key.objectid, &disk_key, + level, search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, copy_extent_buffer(cow, buf, 0, 0, cow->len); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); - btrfs_set_header_owner(cow, root->root_key.objectid); - btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | + BTRFS_HEADER_FLAG_RELOC); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); + else + btrfs_set_header_owner(cow, root->root_key.objectid); write_extent_buffer(cow, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - WARN_ON(btrfs_header_generation(buf) > trans->transid); - if (btrfs_header_generation(buf) != trans->transid) { - u32 nr_extents; - ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); - if (ret) - return ret; - - ret = btrfs_cache_ref(trans, root, buf, nr_extents); - WARN_ON(ret); - } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { - /* - * There are only two places that can drop reference to - * tree blocks owned by living reloc trees, one is here, - * the other place is btrfs_drop_subtree. In both places, - * we check reference count while tree block is locked. - * Furthermore, if reference count is one, it won't get - * increased by someone else. - */ - u32 refs; - ret = btrfs_lookup_extent_ref(trans, root, buf->start, - buf->len, &refs); - BUG_ON(ret); - if (refs == 1) { - ret = btrfs_update_ref(trans, root, buf, cow, - 0, nritems); - clean_tree_block(trans, root, buf); - } else { - ret = btrfs_inc_ref(trans, root, buf, cow, NULL); - } - BUG_ON(ret); - } else { - ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); - if (ret) - return ret; - clean_tree_block(trans, root, buf); - } - - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); - WARN_ON(ret); - } + update_ref_for_cow(trans, root, buf, cow); if (buf == root->node) { WARN_ON(parent && parent != buf); + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || + btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) + parent_start = buf->start; + else + parent_start = 0; spin_lock(&root->node_lock); root->node = cow; extent_buffer_get(cow); spin_unlock(&root->node_lock); - if (buf != root->commit_root) { - btrfs_free_extent(trans, root, buf->start, - buf->len, buf->start, - root->root_key.objectid, - btrfs_header_generation(buf), - level, 1); - } + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, + level, 0); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_set_node_blockptr(parent, parent_slot, cow->start); - WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - WARN_ON(btrfs_header_generation(parent) != trans->transid); btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, btrfs_header_owner(parent), - btrfs_header_generation(parent), level, 1); + parent_start, root->root_key.objectid, + level, 0); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } +static inline int should_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf) +{ + if (btrfs_header_generation(buf) == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && + !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) + return 0; + return 1; +} + /* * cows a single block, see __btrfs_cow_block for the real work. * This version of it has extra checks so that a block isn't cow'd more than @@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, WARN_ON(1); } - if (btrfs_header_generation(buf) == trans->transid && - btrfs_header_owner(buf) == root->root_key.objectid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } @@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) /* * same as comp_keys only with two btrfs_key's */ -static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) { if (k1->objectid > k2->objectid) return 1; @@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, return -1; } +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + return bin_search(eb, key, level, slot); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, root->node = child; spin_unlock(&root->node_lock); - ret = btrfs_update_extent_ref(trans, root, child->start, - child->len, - mid->start, child->start, - root->root_key.objectid, - trans->transid, level - 1); - BUG_ON(ret); - add_root_to_dirty_list(root); btrfs_tree_unlock(child); @@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); ret = btrfs_free_extent(trans, root, mid->start, mid->len, - mid->start, root->root_key.objectid, - btrfs_header_generation(mid), - level, 1); + 0, root->root_key.objectid, level, 1); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -998,7 +1101,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, ret = wret; if (btrfs_header_nritems(right) == 0) { u64 bytenr = right->start; - u64 generation = btrfs_header_generation(parent); u32 blocksize = right->len; clean_tree_block(trans, root, right); @@ -1010,9 +1112,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - generation, level, 1); + blocksize, 0, + root->root_key.objectid, + level, 0); if (wret) ret = wret; } else { @@ -1047,7 +1149,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } if (btrfs_header_nritems(mid) == 0) { /* we've managed to empty the middle node, drop it */ - u64 root_gen = btrfs_header_generation(parent); u64 bytenr = mid->start; u32 blocksize = mid->len; @@ -1059,9 +1160,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret) ret = wret; wret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, - btrfs_header_owner(parent), - root_gen, level, 1); + 0, root->root_key.objectid, + level, 0); if (wret) ret = wret; } else { @@ -1437,7 +1537,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; - if (path->keep_locks || path->lowest_level) + if (path->keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { @@ -1614,10 +1714,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root lowest_unlock = 2; again: - if (p->skip_locking) - b = btrfs_root_node(root); - else - b = btrfs_lock_root_node(root); + if (p->search_commit_root) { + b = root->commit_root; + extent_buffer_get(b); + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + } while (b) { level = btrfs_header_level(b); @@ -1638,11 +1745,9 @@ again: * then we don't want to set the path blocking, * so we test it here */ - if (btrfs_header_generation(b) == trans->transid && - btrfs_header_owner(b) == root->root_key.objectid && - !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + if (!should_cow_block(trans, root, b)) goto cow_done; - } + btrfs_set_path_blocking(p); wret = btrfs_cow_block(trans, root, b, @@ -1764,138 +1869,6 @@ done: return ret; } -int btrfs_merge_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *node_keys, - u64 *nodes, int lowest_level) -{ - struct extent_buffer *eb; - struct extent_buffer *parent; - struct btrfs_key key; - u64 bytenr; - u64 generation; - u32 blocksize; - int level; - int slot; - int key_match; - int ret; - - eb = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb); - BUG_ON(ret); - - btrfs_set_lock_blocking(eb); - - parent = eb; - while (1) { - level = btrfs_header_level(parent); - if (level == 0 || level <= lowest_level) - break; - - ret = bin_search(parent, &node_keys[lowest_level], level, - &slot); - if (ret && slot > 0) - slot--; - - bytenr = btrfs_node_blockptr(parent, slot); - if (nodes[level - 1] == bytenr) - break; - - blocksize = btrfs_level_size(root, level - 1); - generation = btrfs_node_ptr_generation(parent, slot); - btrfs_node_key_to_cpu(eb, &key, slot); - key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); - - if (generation == trans->transid) { - eb = read_tree_block(root, bytenr, blocksize, - generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - } - - /* - * if node keys match and node pointer hasn't been modified - * in the running transaction, we can merge the path. for - * blocks owened by reloc trees, the node pointer check is - * skipped, this is because these blocks are fully controlled - * by the space balance code, no one else can modify them. - */ - if (!nodes[level - 1] || !key_match || - (generation == trans->transid && - btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { - if (level == 1 || level == lowest_level + 1) { - if (generation == trans->transid) { - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } - break; - } - - if (generation != trans->transid) { - eb = read_tree_block(root, bytenr, blocksize, - generation); - btrfs_tree_lock(eb); - btrfs_set_lock_blocking(eb); - } - - ret = btrfs_cow_block(trans, root, eb, parent, slot, - &eb); - BUG_ON(ret); - - if (root->root_key.objectid == - BTRFS_TREE_RELOC_OBJECTID) { - if (!nodes[level - 1]) { - nodes[level - 1] = eb->start; - memcpy(&node_keys[level - 1], &key, - sizeof(node_keys[0])); - } else { - WARN_ON(1); - } - } - - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - parent = eb; - continue; - } - - btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); - btrfs_set_node_ptr_generation(parent, slot, trans->transid); - btrfs_mark_buffer_dirty(parent); - - ret = btrfs_inc_extent_ref(trans, root, - nodes[level - 1], - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - level - 1); - BUG_ON(ret); - - /* - * If the block was created in the running transaction, - * it's possible this is the last reference to it, so we - * should drop the subtree. - */ - if (generation == trans->transid) { - ret = btrfs_drop_subtree(trans, root, eb, parent); - BUG_ON(ret); - btrfs_tree_unlock(eb); - free_extent_buffer(eb); - } else { - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - level - 1, 1); - BUG_ON(ret); - } - break; - } - btrfs_tree_unlock(parent); - free_extent_buffer(parent); - return 0; -} - /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -2021,9 +1994,6 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); - ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); - BUG_ON(ret); - return ret; } @@ -2083,9 +2053,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(src); btrfs_mark_buffer_dirty(dst); - ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); - BUG_ON(ret); - return ret; } @@ -2105,7 +2072,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct extent_buffer *c; struct extent_buffer *old; struct btrfs_disk_key lower_key; - int ret; BUG_ON(path->nodes[level]); BUG_ON(path->nodes[level-1] != root->node); @@ -2117,16 +2083,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, trans->transid, + root->root_key.objectid, &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); - memset_extent_buffer(c, 0, 0, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); write_extent_buffer(c, root->fs_info->fsid, @@ -2151,12 +2118,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, root->node = c; spin_unlock(&root->node_lock); - ret = btrfs_update_extent_ref(trans, root, lower->start, - lower->len, lower->start, c->start, - root->root_key.objectid, - trans->transid, level - 1); - BUG_ON(ret); - /* the super has an extra ref to root->node */ free_extent_buffer(old); @@ -2244,20 +2205,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans, } c_nritems = btrfs_header_nritems(c); + mid = (c_nritems + 1) / 2; + btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, - path->nodes[level + 1]->start, + split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, root->root_key.objectid, - trans->transid, level, c->start, 0); + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); - btrfs_set_header_flags(split, btrfs_header_flags(c)); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); - btrfs_set_header_flags(split, 0); write_extent_buffer(split, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(split), BTRFS_FSID_SIZE); @@ -2265,7 +2227,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_chunk_tree_uuid(split), BTRFS_UUID_SIZE); - mid = (c_nritems + 1) / 2; copy_extent_buffer(split, c, btrfs_node_key_ptr_offset(0), @@ -2278,16 +2239,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - btrfs_node_key(split, &disk_key, 0); wret = insert_ptr(trans, root, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (wret) ret = wret; - ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); - BUG_ON(ret); - if (path->slots[level] >= mid) { path->slots[level] -= mid; btrfs_tree_unlock(c); @@ -2360,7 +2317,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, u32 right_nritems; u32 data_end; u32 this_item_size; - int ret; if (empty) nr = 0; @@ -2473,9 +2429,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); btrfs_mark_buffer_dirty(right); - ret = btrfs_update_ref(trans, root, left, right, 0, push_items); - BUG_ON(ret); - btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); btrfs_mark_buffer_dirty(upper); @@ -2720,10 +2673,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); - ret = btrfs_update_ref(trans, root, right, left, - old_left_nritems, push_items); - BUG_ON(ret); - btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); if (wret) @@ -2880,9 +2829,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(l); BUG_ON(path->slots[0] != slot); - ret = btrfs_update_ref(trans, root, l, right, 0, nritems); - BUG_ON(ret); - if (mid <= slot) { btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); @@ -2911,6 +2857,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size, int extend) { + struct btrfs_disk_key disk_key; struct extent_buffer *l; u32 nritems; int mid; @@ -2918,7 +2865,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, struct extent_buffer *right; int ret = 0; int wret; - int double_split; + int split; int num_doubles = 0; /* first try to make some room by pushing left and right */ @@ -2945,16 +2892,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return ret; } again: - double_split = 0; + split = 1; l = path->nodes[0]; slot = path->slots[0]; nritems = btrfs_header_nritems(l); mid = (nritems + 1) / 2; - right = btrfs_alloc_free_block(trans, root, root->leafsize, - path->nodes[1]->start, + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + split = 0; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2; + } + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + split = 0; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + split = 2 ; + } + } + } + } + + if (split == 0) + btrfs_cpu_key_to_disk(&disk_key, ins_key); + else + btrfs_item_key(l, &disk_key, mid); + + right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, - trans->transid, 0, l->start, 0); + &disk_key, 0, l->start, 0); if (IS_ERR(right)) { BUG_ON(1); return PTR_ERR(right); @@ -2963,6 +2947,7 @@ again: memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); write_extent_buffer(right, root->fs_info->fsid, @@ -2973,79 +2958,47 @@ again: (unsigned long)btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); - if (mid <= slot) { - if (nritems == 1 || - leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (slot >= nritems) { - struct btrfs_disk_key disk_key; - - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, right->start, - path->slots[1] + 1, 1); - if (wret) - ret = wret; + if (split == 0) { + if (mid <= slot) { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - path->slots[1] += 1; - btrfs_mark_buffer_dirty(right); - return ret; - } - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; - } - } - } else { - if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { - if (!extend && data_size && slot == 0) { - struct btrfs_disk_key disk_key; - - btrfs_cpu_key_to_disk(&disk_key, ins_key); - btrfs_set_header_nritems(right, 0); - wret = insert_ptr(trans, root, path, - &disk_key, - right->start, - path->slots[1], 1); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + } else { + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); if (wret) ret = wret; - btrfs_tree_unlock(path->nodes[0]); - free_extent_buffer(path->nodes[0]); - path->nodes[0] = right; - path->slots[0] = 0; - if (path->slots[1] == 0) { - wret = fixup_low_keys(trans, root, - path, &disk_key, 1); - if (wret) - ret = wret; - } - btrfs_mark_buffer_dirty(right); - return ret; - } else if ((extend || !data_size) && slot == 0) { - mid = 1; - } else { - mid = slot; - if (mid != nritems && - leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { - double_split = 1; - } } } + btrfs_mark_buffer_dirty(right); + return ret; } ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems); BUG_ON(ret); - if (double_split) { + if (split == 2) { BUG_ON(num_doubles != 0); num_doubles++; goto again; @@ -3447,7 +3400,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans, /* figure out how many keys we can insert in here */ total_data = data_size[0]; for (i = 1; i < nr; i++) { - if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0) break; total_data += data_size[i]; } @@ -3745,9 +3698,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, /* * a helper function to delete the leaf pointed to by path->slots[1] and - * path->nodes[1]. bytenr is the node block pointer, but since the callers - * already know it, it is faster to have them pass it down than to - * read it out of the node again. + * path->nodes[1]. * * This deletes the pointer in path->nodes[1] and frees the leaf * block extent. zero is returned if it all worked out, < 0 otherwise. @@ -3755,15 +3706,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, * The path must have already been setup for deleting the leaf, including * all the proper balancing. path->nodes[1] must be locked. */ -noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 bytenr) +static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *leaf) { int ret; - u64 root_gen = btrfs_header_generation(path->nodes[1]); - u64 parent_start = path->nodes[1]->start; - u64 parent_owner = btrfs_header_owner(path->nodes[1]); + WARN_ON(btrfs_header_generation(leaf) != trans->transid); ret = del_ptr(trans, root, path, 1, path->slots[1]); if (ret) return ret; @@ -3774,10 +3724,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, bytenr, - btrfs_level_size(root, 0), - parent_start, parent_owner, - root_gen, 0, 1); + ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0, 0); return ret; } /* @@ -3845,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { - ret = btrfs_del_leaf(trans, root, path, leaf->start); + ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); } } else { @@ -3884,8 +3832,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (btrfs_header_nritems(leaf) == 0) { path->slots[1] = slot; - ret = btrfs_del_leaf(trans, root, path, - leaf->start); + ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); free_extent_buffer(leaf); } else { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4414a5d..ce3ab4e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -45,6 +45,8 @@ struct btrfs_ordered_sum; #define BTRFS_MAX_LEVEL 8 +#define BTRFS_COMPAT_EXTENT_TREE_V0 + /* * files bigger than this get some pre-flushing when they are added * to the ordered operations list. That way we limit the total @@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } #define BTRFS_FSID_SIZE 16 -#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) +#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) +#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) +#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) + +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 /* * every tree block (leaf or node) starts with this header. @@ -296,7 +309,6 @@ struct btrfs_header { sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) -#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) /* * this is a very generous portion of the super block, giving us @@ -355,9 +367,12 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ -#define BTRFS_FEATURE_COMPAT_SUPP 0x0 -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 -#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) + +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF /* * A leaf is full of items. offset and size tell us where to find @@ -421,23 +436,65 @@ struct btrfs_path { unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int leave_spinning:1; + unsigned int search_commit_root:1; }; /* * items in the extent btree are used to record the objectid of the * owner of the block and the number of references */ + struct btrfs_extent_item { + __le64 refs; + __le64 generation; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_extent_item_v0 { __le32 refs; } __attribute__ ((__packed__)); -struct btrfs_extent_ref { +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ + sizeof(struct btrfs_item)) + +#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0) +#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1) + +/* following flags only apply to tree blocks */ + +/* use full backrefs for extent pointers in the block */ +#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) + +struct btrfs_tree_block_info { + struct btrfs_disk_key key; + u8 level; +} __attribute__ ((__packed__)); + +struct btrfs_extent_data_ref { + __le64 root; + __le64 objectid; + __le64 offset; + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_shared_data_ref { + __le32 count; +} __attribute__ ((__packed__)); + +struct btrfs_extent_inline_ref { + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +/* old style backrefs item */ +struct btrfs_extent_ref_v0 { __le64 root; __le64 generation; __le64 objectid; - __le32 num_refs; + __le32 count; } __attribute__ ((__packed__)); + /* dev extents record free space on individual devices. The owner * field points back to the chunk allocation mapping tree that allocated * the extent. The chunk tree uuid field is a way to double check the owner @@ -695,12 +752,7 @@ struct btrfs_block_group_cache { struct list_head cluster_list; }; -struct btrfs_leaf_ref_tree { - struct rb_root root; - struct list_head list; - spinlock_t lock; -}; - +struct reloc_control; struct btrfs_device; struct btrfs_fs_devices; struct btrfs_fs_info { @@ -831,18 +883,11 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - /* tree relocation relocated fields */ - struct list_head dead_reloc_roots; - struct btrfs_leaf_ref_tree reloc_ref_tree; - struct btrfs_leaf_ref_tree shared_ref_tree; - struct kobject super_kobj; struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; - atomic_t throttles; - atomic_t throttle_gen; u64 total_pinned; @@ -861,6 +906,8 @@ struct btrfs_fs_info { */ struct list_head space_info; + struct reloc_control *reloc_ctl; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; @@ -891,7 +938,6 @@ struct btrfs_fs_info { * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ -struct btrfs_dirty_root; struct btrfs_root { struct extent_buffer *node; @@ -899,9 +945,6 @@ struct btrfs_root { spinlock_t node_lock; struct extent_buffer *commit_root; - struct btrfs_leaf_ref_tree *ref_tree; - struct btrfs_leaf_ref_tree ref_tree_struct; - struct btrfs_dirty_root *dirty_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; @@ -952,10 +995,15 @@ struct btrfs_root { /* the dirty list is only used by non-reference counted roots */ struct list_head dirty_list; + struct list_head root_list; + spinlock_t list_lock; - struct list_head dead_list; struct list_head orphan_list; + spinlock_t inode_lock; + /* red-black tree that keeps track of in-memory inodes */ + struct rb_root inode_tree; + /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1017,7 +1065,16 @@ struct btrfs_root { * are used, and how many references there are to each block */ #define BTRFS_EXTENT_ITEM_KEY 168 -#define BTRFS_EXTENT_REF_KEY 180 + +#define BTRFS_TREE_BLOCK_REF_KEY 176 + +#define BTRFS_EXTENT_DATA_REF_KEY 178 + +#define BTRFS_EXTENT_REF_V0_KEY 180 + +#define BTRFS_SHARED_BLOCK_REF_KEY 182 + +#define BTRFS_SHARED_DATA_REF_KEY 184 /* * block groups give us hints into the extent allocation trees. Which @@ -1317,24 +1374,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) return (u8 *)((unsigned long)dev + ptr); } -/* struct btrfs_extent_ref */ -BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); -BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); -BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); -BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, - num_refs, 32); +BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32); + + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} -/* struct btrfs_extent_item */ -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, - refs, 32); +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, + root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, + count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + BUG(); + return 0; +} + +BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64); +BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0, + generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64); +BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32); /* struct btrfs_node */ BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); @@ -1558,6 +1658,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) return (flags & flag) == flag; } +static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, + int rev) +{ + u64 flags = btrfs_header_flags(eb); + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) { unsigned long ptr = offsetof(struct btrfs_header, fsid); @@ -1790,39 +1905,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 bytenr); + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr); int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); struct btrfs_block_group_cache *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr); +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner); struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, u64 parent, - u64 root_objectid, - u64 ref_generation, - int level, - u64 hint, - u64 empty_size); + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, int level); -int btrfs_alloc_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 min_bytes, - u64 root_objectid, u64 ref_generation, - u64 owner, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, u64 data); -int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins); -int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 min_alloc_size, @@ -1830,18 +1938,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, u64 search_end, struct btrfs_key *ins, u64 data); int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, struct extent_buffer *buf, - u32 *nr_extents); -int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, u32 nr_extents); -int btrfs_update_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *orig_buf, - struct extent_buffer *buf, int start_slot, int nr); + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin); + u64 root_objectid, u64 owner, u64 offset); + int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1849,13 +1957,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid); -int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 num_bytes, - u64 orig_parent, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid); + u64 root_objectid, u64 owner, u64 offset); + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); @@ -1867,16 +1970,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); -int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); -int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, u64 orig_start); -int btrfs_add_dead_reloc_root(struct btrfs_root *root); -int btrfs_cleanup_reloc_trees(struct btrfs_root *root); -int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group); + u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -1891,13 +1987,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); /* ctree.c */ +int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot); +int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2); int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); -int btrfs_merge_path(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *node_keys, - u64 *nodes, int lowest_level); int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *new_key); @@ -1918,6 +2013,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_block_can_be_shared(struct btrfs_root *root, + struct extent_buffer *buf); int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u32 data_size); int btrfs_truncate_item(struct btrfs_trans_handle *trans, @@ -1944,9 +2041,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); -int btrfs_del_leaf(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 bytenr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) @@ -2005,8 +2099,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct btrfs_root_item *item, struct btrfs_key *key); int btrfs_search_root(struct btrfs_root *root, u64 search_start, u64 *found_objectid); -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, - struct btrfs_root *latest_root); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); /* dir-item.c */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, @@ -2139,7 +2234,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); void btrfs_put_inode(struct inode *inode); -void btrfs_read_locked_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, int wait); void btrfs_dirty_inode(struct inode *inode); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -2147,12 +2241,8 @@ void btrfs_destroy_inode(struct inode *inode); int btrfs_init_cachep(void); void btrfs_destroy_cachep(void); long btrfs_ioctl_trans_end(struct file *file); -struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, - struct btrfs_root *root, int wait); -struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root); struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *is_new); + struct btrfs_root *root); int btrfs_commit_write(struct file *file, struct page *page, unsigned from, unsigned to); struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, @@ -2209,4 +2299,12 @@ int btrfs_check_acl(struct inode *inode, int mask); int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); +/* relocation.c */ +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index d6c01c0..84e6781 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -29,27 +29,87 @@ * add extents in the middle of btrfs_search_slot, and it allows * us to buffer up frequently modified backrefs in an rb tree instead * of hammering updates on the extent allocation tree. - * - * Right now this code is only used for reference counted trees, but - * the long term goal is to get rid of the similar code for delayed - * extent tree modifications. */ /* - * entries in the rb tree are ordered by the byte number of the extent - * and by the byte number of the parent block. + * compare two delayed tree backrefs with same bytenr and type + */ +static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2, + struct btrfs_delayed_tree_ref *ref1) +{ + if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * compare two delayed data backrefs with same bytenr and type */ -static int comp_entry(struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 parent) +static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, + struct btrfs_delayed_data_ref *ref1) { - if (bytenr < ref->bytenr) + if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) { + if (ref1->root < ref2->root) + return -1; + if (ref1->root > ref2->root) + return 1; + if (ref1->objectid < ref2->objectid) + return -1; + if (ref1->objectid > ref2->objectid) + return 1; + if (ref1->offset < ref2->offset) + return -1; + if (ref1->offset > ref2->offset) + return 1; + } else { + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + } + return 0; +} + +/* + * entries in the rb tree are ordered by the byte number of the extent, + * type of the delayed backrefs and content of delayed backrefs. + */ +static int comp_entry(struct btrfs_delayed_ref_node *ref2, + struct btrfs_delayed_ref_node *ref1) +{ + if (ref1->bytenr < ref2->bytenr) return -1; - if (bytenr > ref->bytenr) + if (ref1->bytenr > ref2->bytenr) return 1; - if (parent < ref->parent) + if (ref1->is_head && ref2->is_head) + return 0; + if (ref2->is_head) return -1; - if (parent > ref->parent) + if (ref1->is_head) return 1; + if (ref1->type < ref2->type) + return -1; + if (ref1->type > ref2->type) + return 1; + if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || + ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { + return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), + btrfs_delayed_node_to_tree_ref(ref1)); + } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || + ref1->type == BTRFS_SHARED_DATA_REF_KEY) { + return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), + btrfs_delayed_node_to_data_ref(ref1)); + } + BUG(); return 0; } @@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref, * inserted. */ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - u64 bytenr, u64 parent, struct rb_node *node) { struct rb_node **p = &root->rb_node; struct rb_node *parent_node = NULL; struct btrfs_delayed_ref_node *entry; + struct btrfs_delayed_ref_node *ins; int cmp; + ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); while (*p) { parent_node = *p; entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, rb_node); - cmp = comp_entry(entry, bytenr, parent); + cmp = comp_entry(entry, ins); if (cmp < 0) p = &(*p)->rb_left; else if (cmp > 0) @@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, return entry; } - entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); rb_link_node(node, parent_node, p); rb_insert_color(node, root); return NULL; } /* - * find an entry based on (bytenr,parent). This returns the delayed - * ref if it was able to find one, or NULL if nothing was in that spot + * find an head entry based on bytenr. This returns the delayed ref + * head if it was able to find one, or NULL if nothing was in that spot */ -static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, - u64 bytenr, u64 parent, +static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, + u64 bytenr, struct btrfs_delayed_ref_node **last) { struct rb_node *n = root->rb_node; @@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root, if (last) *last = entry; - cmp = comp_entry(entry, bytenr, parent); + if (bytenr < entry->bytenr) + cmp = -1; + else if (bytenr > entry->bytenr) + cmp = 1; + else if (!btrfs_delayed_ref_is_head(entry)) + cmp = 1; + else + cmp = 0; + if (cmp < 0) n = n->rb_left; else if (cmp > 0) @@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, node = rb_first(&delayed_refs->root); } else { ref = NULL; - tree_search(&delayed_refs->root, start, (u64)-1, &ref); + find_ref_head(&delayed_refs->root, start, &ref); if (ref) { struct btrfs_delayed_ref_node *tmp; @@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr) delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) { prev_node = rb_prev(&ref->rb_node); if (!prev_node) @@ -250,25 +318,28 @@ out: } /* - * helper function to lookup reference count + * helper function to lookup reference count and flags of extent. * * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. This way you - * can check to see what the reference count would be if all of the - * delayed refs are processed. + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. */ -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs) +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) { struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_path *path; - struct extent_buffer *leaf; struct btrfs_extent_item *ei; + struct extent_buffer *leaf; struct btrfs_key key; - u32 num_refs; + u32 item_size; + u64 num_refs; + u64 extent_flags; int ret; path = btrfs_alloc_path(); @@ -287,37 +358,60 @@ again: if (ret == 0) { leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); } else { num_refs = 0; + extent_flags = 0; ret = 0; } spin_lock(&delayed_refs->lock); - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) { head = btrfs_delayed_node_to_head(ref); - if (mutex_trylock(&head->mutex)) { - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - *refs = num_refs; - goto out; - } + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&ref->refs); + spin_unlock(&delayed_refs->lock); - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); + btrfs_release_path(root->fs_info->extent_root, path); - btrfs_release_path(root->fs_info->extent_root, path); + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(ref); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); - mutex_lock(&head->mutex); + num_refs += ref->ref_mod; mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } else { - *refs = num_refs; } + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; out: spin_unlock(&delayed_refs->lock); btrfs_free_path(path); @@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { - struct btrfs_delayed_ref *existing_ref; - struct btrfs_delayed_ref *ref; - - existing_ref = btrfs_delayed_node_to_ref(existing); - ref = btrfs_delayed_node_to_ref(update); - - if (ref->pin) - existing_ref->pin = 1; - - if (ref->action != existing_ref->action) { + if (update->action != existing->action) { /* * this is effectively undoing either an add or a * drop. We decrement the ref_mod, and if it goes @@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, delayed_refs->num_entries--; if (trans->delayed_ref_updates) trans->delayed_ref_updates--; + } else { + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); } } else { - if (existing_ref->action == BTRFS_ADD_DELAYED_REF) { - /* if we're adding refs, make sure all the - * details match up. The extent could - * have been totally freed and reallocated - * by a different owner before the delayed - * ref entries were removed. - */ - existing_ref->owner_objectid = ref->owner_objectid; - existing_ref->generation = ref->generation; - existing_ref->root = ref->root; - existing->num_bytes = update->num_bytes; - } + WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || + existing->type == BTRFS_SHARED_BLOCK_REF_KEY); /* * the action on the existing ref matches * the action on the ref we're trying to add. @@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, existing_ref = btrfs_delayed_node_to_head(existing); ref = btrfs_delayed_node_to_head(update); + BUG_ON(existing_ref->is_data != ref->is_data); if (ref->must_insert_reserved) { /* if the extent was freed and then @@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } + if (ref->extent_op) { + if (!existing_ref->extent_op) { + existing_ref->extent_op = ref->extent_op; + } else { + if (ref->extent_op->update_key) { + memcpy(&existing_ref->extent_op->key, + &ref->extent_op->key, + sizeof(ref->extent_op->key)); + existing_ref->extent_op->update_key = 1; + } + if (ref->extent_op->update_flags) { + existing_ref->extent_op->flags_to_set |= + ref->extent_op->flags_to_set; + existing_ref->extent_op->update_flags = 1; + } + kfree(ref->extent_op); + } + } /* * update the reference mod on the head to reflect this new operation */ @@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, } /* - * helper function to actually insert a delayed ref into the rbtree. + * helper function to actually insert a head node into the rbtree. * this does all the dirty work in terms of maintaining the correct - * overall modification count in the head node and properly dealing - * with updating existing nodes as new modifications are queued. + * overall modification count. */ -static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) +static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, + int action, int is_data) { struct btrfs_delayed_ref_node *existing; - struct btrfs_delayed_ref *full_ref; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; int count_mod = 1; @@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * the head node stores the sum of all the mods, so dropping a ref * should drop the sum in the head node by one. */ - if (parent == (u64)-1) { - if (action == BTRFS_DROP_DELAYED_REF) - count_mod = -1; - else if (action == BTRFS_UPDATE_DELAYED_HEAD) - count_mod = 0; - } + if (action == BTRFS_UPDATE_DELAYED_HEAD) + count_mod = 0; + else if (action == BTRFS_DROP_DELAYED_REF) + count_mod = -1; /* * BTRFS_ADD_DELAYED_EXTENT means that we need to update @@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * Once we record must_insert_reserved, switch the action to * BTRFS_ADD_DELAYED_REF because other special casing is not required. */ - if (action == BTRFS_ADD_DELAYED_EXTENT) { + if (action == BTRFS_ADD_DELAYED_EXTENT) must_insert_reserved = 1; - action = BTRFS_ADD_DELAYED_REF; - } else { + else must_insert_reserved = 0; - } - delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ atomic_set(&ref->refs, 1); ref->bytenr = bytenr; - ref->parent = parent; + ref->num_bytes = num_bytes; ref->ref_mod = count_mod; + ref->type = 0; + ref->action = 0; + ref->is_head = 1; ref->in_tree = 1; + + head_ref = btrfs_delayed_node_to_head(ref); + head_ref->must_insert_reserved = must_insert_reserved; + head_ref->is_data = is_data; + + INIT_LIST_HEAD(&head_ref->cluster); + mutex_init(&head_ref->mutex); + + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_head_ref(existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_heads++; + delayed_refs->num_heads_ready++; + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed tree ref into the rbtree. + */ +static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_tree_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; - if (btrfs_delayed_ref_is_head(ref)) { - head_ref = btrfs_delayed_node_to_head(ref); - head_ref->must_insert_reserved = must_insert_reserved; - INIT_LIST_HEAD(&head_ref->cluster); - mutex_init(&head_ref->mutex); + full_ref = btrfs_delayed_node_to_tree_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_BLOCK_REF_KEY; } else { - full_ref = btrfs_delayed_node_to_ref(ref); full_ref->root = ref_root; - full_ref->generation = ref_generation; - full_ref->owner_objectid = owner_objectid; - full_ref->pin = pin; - full_ref->action = action; + ref->type = BTRFS_TREE_BLOCK_REF_KEY; } + full_ref->level = level; - existing = tree_insert(&delayed_refs->root, bytenr, - parent, &ref->rb_node); + existing = tree_insert(&delayed_refs->root, &ref->rb_node); if (existing) { - if (btrfs_delayed_ref_is_head(ref)) - update_existing_head_ref(existing, ref); - else - update_existing_ref(trans, delayed_refs, existing, ref); + update_existing_ref(trans, delayed_refs, existing, ref); + /* + * we've updated the existing ref, free the newly + * allocated ref + */ + kfree(ref); + } else { + delayed_refs->num_entries++; + trans->delayed_ref_updates++; + } + return 0; +} + +/* + * helper to insert a delayed data ref into the rbtree. + */ +static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_node *ref, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 owner, u64 offset, + int action) +{ + struct btrfs_delayed_ref_node *existing; + struct btrfs_delayed_data_ref *full_ref; + struct btrfs_delayed_ref_root *delayed_refs; + + if (action == BTRFS_ADD_DELAYED_EXTENT) + action = BTRFS_ADD_DELAYED_REF; + + delayed_refs = &trans->transaction->delayed_refs; + + /* first set the basic ref node struct up */ + atomic_set(&ref->refs, 1); + ref->bytenr = bytenr; + ref->num_bytes = num_bytes; + ref->ref_mod = 1; + ref->action = action; + ref->is_head = 0; + ref->in_tree = 1; + + full_ref = btrfs_delayed_node_to_data_ref(ref); + if (parent) { + full_ref->parent = parent; + ref->type = BTRFS_SHARED_DATA_REF_KEY; + } else { + full_ref->root = ref_root; + ref->type = BTRFS_EXTENT_DATA_REF_KEY; + } + full_ref->objectid = owner; + full_ref->offset = offset; + existing = tree_insert(&delayed_refs->root, &ref->rb_node); + + if (existing) { + update_existing_ref(trans, delayed_refs, existing, ref); /* * we've updated the existing ref, free the newly * allocated ref */ kfree(ref); } else { - if (btrfs_delayed_ref_is_head(ref)) { - delayed_refs->num_heads++; - delayed_refs->num_heads_ready++; - } delayed_refs->num_entries++; trans->delayed_ref_updates++; } @@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, } /* - * add a delayed ref to the tree. This does all of the accounting required + * add a delayed tree ref. This does all of the accounting required * to make sure the delayed ref is eventually processed before this * transaction commits. */ -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin) +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_delayed_ref *ref; + struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; int ret; + BUG_ON(extent_op && extent_op->is_data); ref = kmalloc(sizeof(*ref), GFP_NOFS); if (!ref) return -ENOMEM; + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) { + kfree(ref); + return -ENOMEM; + } + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + /* - * the parent = 0 case comes from cases where we don't actually - * know the parent yet. It will get updated later via a add/drop - * pair. + * insert both the head node and the new ref without dropping + * the spin lock */ - if (parent == 0) - parent = bytenr; + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 0); + BUG_ON(ret); + + ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, level, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +/* + * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. + */ +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_data_ref *ref; + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + BUG_ON(extent_op && !extent_op->is_data); + ref = kmalloc(sizeof(*ref), GFP_NOFS); + if (!ref) + return -ENOMEM; head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); if (!head_ref) { kfree(ref); return -ENOMEM; } + + head_ref->extent_op = extent_op; + delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, * insert both the head node and the new ref without dropping * the spin lock */ - ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes, - (u64)-1, 0, 0, 0, action, pin); + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, + action, 1); BUG_ON(ret); - ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes, - parent, ref_root, ref_generation, - owner_objectid, action, pin); + ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, + parent, ref_root, owner, offset, action); + BUG_ON(ret); + spin_unlock(&delayed_refs->lock); + return 0; +} + +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_delayed_ref_head *head_ref; + struct btrfs_delayed_ref_root *delayed_refs; + int ret; + + head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); + if (!head_ref) + return -ENOMEM; + + head_ref->extent_op = extent_op; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + + ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); BUG_ON(ret); + spin_unlock(&delayed_refs->lock); return 0; } @@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) struct btrfs_delayed_ref_root *delayed_refs; delayed_refs = &trans->transaction->delayed_refs; - ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL); + ref = find_ref_head(&delayed_refs->root, bytenr, NULL); if (ref) return btrfs_delayed_node_to_head(ref); return NULL; @@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) * * It is the same as doing a ref add and delete in two separate calls. */ +#if 0 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, @@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); return 0; } +#endif diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 3bec2ff..f6fc67d 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node { /* the starting bytenr of the extent */ u64 bytenr; - /* the parent our backref will point to */ - u64 parent; - /* the size of the extent */ u64 num_bytes; @@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node { */ int ref_mod; + unsigned int action:8; + unsigned int type:8; /* is this node still in the rbtree? */ + unsigned int is_head:1; unsigned int in_tree:1; }; +struct btrfs_delayed_extent_op { + struct btrfs_disk_key key; + u64 flags_to_set; + unsigned int update_key:1; + unsigned int update_flags:1; + unsigned int is_data:1; +}; + /* * the head refs are used to hold a lock on a given extent, which allows us * to make sure that only one process is running the delayed refs @@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head { struct list_head cluster; + struct btrfs_delayed_extent_op *extent_op; /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head { * the free has happened. */ unsigned int must_insert_reserved:1; + unsigned int is_data:1; }; -struct btrfs_delayed_ref { +struct btrfs_delayed_tree_ref { struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + int level; +}; - /* the root objectid our ref will point to */ - u64 root; - - /* the generation for the backref */ - u64 generation; - - /* owner_objectid of the backref */ - u64 owner_objectid; - - /* operation done by this entry in the rbtree */ - u8 action; - - /* if pin == 1, when the extent is freed it will be pinned until - * transaction commit - */ - unsigned int pin:1; +struct btrfs_delayed_data_ref { + struct btrfs_delayed_ref_node node; + union { + u64 root; + u64 parent; + }; + u64 objectid; + u64 offset; }; struct btrfs_delayed_ref_root { @@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) } } -int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, int action, - int pin); +int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, int level, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + u64 parent, u64 ref_root, + u64 owner, u64 offset, int action, + struct btrfs_delayed_extent_op *extent_op); +int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, + struct btrfs_delayed_extent_op *extent_op); struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u32 *refs); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, @@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, */ static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node) { - return node->parent == (u64)-1; + return node->is_head; } /* * helper functions to cast a node into its container */ -static inline struct btrfs_delayed_ref * -btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node) +static inline struct btrfs_delayed_tree_ref * +btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node) { WARN_ON(btrfs_delayed_ref_is_head(node)); - return container_of(node, struct btrfs_delayed_ref, node); + return container_of(node, struct btrfs_delayed_tree_ref, node); +} +static inline struct btrfs_delayed_data_ref * +btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node) +{ + WARN_ON(btrfs_delayed_ref_is_head(node)); + return container_of(node, struct btrfs_delayed_data_ref, node); } static inline struct btrfs_delayed_ref_head * @@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node) { WARN_ON(!btrfs_delayed_ref_is_head(node)); return container_of(node, struct btrfs_delayed_ref_head, node); - } #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4b0ea0b..7f5c6e3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -36,7 +36,6 @@ #include "print-tree.h" #include "async-thread.h" #include "locking.h" -#include "ref-cache.h" #include "tree-log.h" #include "free-space-cache.h" @@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, { root->node = NULL; root->commit_root = NULL; - root->ref_tree = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; root->leafsize = leafsize; @@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->last_inode_alloc = 0; root->name = NULL; root->in_sysfs = 0; + root->inode_tree.rb_node = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); - INIT_LIST_HEAD(&root->dead_list); + INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->node_lock); spin_lock_init(&root->list_lock); + spin_lock_init(&root->inode_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, extent_io_tree_init(&root->dirty_log_pages, fs_info->btree_inode->i_mapping, GFP_NOFS); - btrfs_leaf_ref_tree_init(&root->ref_tree_struct); - root->ref_tree = &root->ref_tree_struct; - memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); return 0; } @@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ root->ref_cows = 0; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, BTRFS_TREE_LOG_OBJECTID, - trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); } + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); root->node = leaf; - btrfs_set_header_nritems(root->node, 0); - btrfs_set_header_level(root->node, 0); - btrfs_set_header_bytenr(root->node, root->node->start); - btrfs_set_header_generation(root->node, trans->transid); - btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); write_extent_buffer(root->node, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(root->node), @@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, inode_item->nbytes = cpu_to_le64(root->leafsize); inode_item->mode = cpu_to_le32(S_IFDIR | 0755); - btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start); - btrfs_set_root_generation(&log_root->root_item, trans->transid); + btrfs_set_root_node(&log_root->root_item, log_root->node); WARN_ON(root->log_root); root->log_root = log_root; @@ -1144,6 +1140,7 @@ out: blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); + root->commit_root = btrfs_root_node(root); BUG_ON(!root->node); insert: if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { @@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, } if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_find_dead_roots(fs_info->tree_root, - root->root_key.objectid, root); + root->root_key.objectid); BUG_ON(ret); btrfs_orphan_cleanup(root); } @@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); atomic_set(&fs_info->nr_async_bios, 0); - atomic_set(&fs_info->throttles, 0); - atomic_set(&fs_info->throttle_gen, 0); fs_info->sb = sb; fs_info->max_extent = (u64)-1; fs_info->max_inline = 8192 * 1024; @@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_mapping->a_ops = &btree_aops; fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, fs_info->btree_inode->i_mapping, GFP_NOFS); @@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode->i_mapping, GFP_NOFS); fs_info->do_barriers = 1; - INIT_LIST_HEAD(&fs_info->dead_reloc_roots); - btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); - btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); - BTRFS_I(fs_info->btree_inode)->root = tree_root; memset(&BTRFS_I(fs_info->btree_inode)->location, 0, sizeof(struct btrfs_key)); @@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, goto fail_iput; } + features = btrfs_super_incompat_flags(disk_super); + if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { + features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; + btrfs_set_super_incompat_flags(disk_super, features); + } + features = btrfs_super_compat_ro_flags(disk_super) & ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!(sb->s_flags & MS_RDONLY) && features) { @@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (ret) { printk(KERN_WARNING "btrfs: failed to read the system " "array on %s\n", sb->s_id); - goto fail_sys_array; + goto fail_sb_buffer; } blocksize = btrfs_level_size(tree_root, @@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); + chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), @@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; - + btrfs_set_root_node(&tree_root->root_item, tree_root->node); + tree_root->commit_root = btrfs_root_node(tree_root); ret = find_and_setup_root(tree_root, fs_info, BTRFS_EXTENT_TREE_OBJECTID, extent_root); @@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); - dev_root->track_dirty = 1; if (ret) goto fail_extent_root; + dev_root->track_dirty = 1; ret = find_and_setup_root(tree_root, fs_info, BTRFS_CSUM_TREE_OBJECTID, csum_root); if (ret) - goto fail_extent_root; + goto fail_dev_root; csum_root->track_dirty = 1; @@ -1881,7 +1882,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, } if (!(sb->s_flags & MS_RDONLY)) { - ret = btrfs_cleanup_reloc_trees(tree_root); + ret = btrfs_recover_relocation(tree_root); BUG_ON(ret); } @@ -1908,14 +1909,19 @@ fail_cleaner: fail_csum_root: free_extent_buffer(csum_root->node); + free_extent_buffer(csum_root->commit_root); +fail_dev_root: + free_extent_buffer(dev_root->node); + free_extent_buffer(dev_root->commit_root); fail_extent_root: free_extent_buffer(extent_root->node); + free_extent_buffer(extent_root->commit_root); fail_tree_root: free_extent_buffer(tree_root->node); + free_extent_buffer(tree_root->commit_root); fail_chunk_root: free_extent_buffer(chunk_root->node); -fail_sys_array: - free_extent_buffer(dev_root->node); + free_extent_buffer(chunk_root->commit_root); fail_sb_buffer: btrfs_stop_workers(&fs_info->fixup_workers); btrfs_stop_workers(&fs_info->delalloc_workers); @@ -2173,6 +2179,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { + WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); radix_tree_delete(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid); if (root->anon_super.s_dev) { @@ -2219,10 +2226,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) ARRAY_SIZE(gang)); if (!ret) break; + + root_objectid = gang[ret - 1]->root_key.objectid + 1; for (i = 0; i < ret; i++) { root_objectid = gang[i]->root_key.objectid; ret = btrfs_find_dead_roots(fs_info->tree_root, - root_objectid, gang[i]); + root_objectid); BUG_ON(ret); btrfs_orphan_cleanup(gang[i]); } @@ -2278,20 +2287,16 @@ int close_ctree(struct btrfs_root *root) (unsigned long long)fs_info->total_ref_cache_size); } - if (fs_info->extent_root->node) - free_extent_buffer(fs_info->extent_root->node); - - if (fs_info->tree_root->node) - free_extent_buffer(fs_info->tree_root->node); - - if (root->fs_info->chunk_root->node) - free_extent_buffer(root->fs_info->chunk_root->node); - - if (root->fs_info->dev_root->node) - free_extent_buffer(root->fs_info->dev_root->node); - - if (root->fs_info->csum_root->node) - free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(fs_info->extent_root->node); + free_extent_buffer(fs_info->extent_root->commit_root); + free_extent_buffer(fs_info->tree_root->node); + free_extent_buffer(fs_info->tree_root->commit_root); + free_extent_buffer(root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + free_extent_buffer(root->fs_info->dev_root->node); + free_extent_buffer(root->fs_info->dev_root->commit_root); + free_extent_buffer(root->fs_info->csum_root->node); + free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 85315d2..9596b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - inode = btrfs_iget(sb, &key, root, NULL); + inode = btrfs_iget(sb, &key, root); if (IS_ERR(inode)) return (void *)inode; @@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child) btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); key.offset = 0; - return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); } const struct export_operations btrfs_export_ops = { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 35af933..a42419c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -30,43 +30,33 @@ #include "transaction.h" #include "volumes.h" #include "locking.h" -#include "ref-cache.h" #include "free-space-cache.h" -#define PENDING_EXTENT_INSERT 0 -#define PENDING_EXTENT_DELETE 1 -#define PENDING_BACKREF_UPDATE 2 - -struct pending_extent_op { - int type; - u64 bytenr; - u64 num_bytes; - u64 parent; - u64 orig_parent; - u64 generation; - u64 orig_generation; - int level; - struct list_head list; - int del; -}; - -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod); static int update_reserved_extents(struct btrfs_root *root, u64 bytenr, u64 num, int reserve); static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc, int mark_free); -static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int ref_to_drop); +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extra_op); +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei); +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod); +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins); static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, @@ -453,348 +443,1078 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) * maintenance. This is actually the same as #2, but with a slightly * different use case. * + * There are two kinds of back refs. The implicit back refs is optimized + * for pointers in non-shared tree blocks. For a given pointer in a block, + * back refs of this kind provide information about the block's owner tree + * and the pointer's key. These information allow us to find the block by + * b-tree searching. The full back refs is for pointers in tree blocks not + * referenced by their owner trees. The location of tree block is recorded + * in the back refs. Actually the full back refs is generic, and can be + * used in all cases the implicit back refs is used. The major shortcoming + * of the full back refs is its overhead. Every time a tree block gets + * COWed, we have to update back refs entry for all pointers in it. + * + * For a newly allocated tree block, we use implicit back refs for + * pointers in it. This means most tree related operations only involve + * implicit back refs. For a tree block created in old transaction, the + * only way to drop a reference to it is COW it. So we can detect the + * event that tree block loses its owner tree's reference and do the + * back refs conversion. + * + * When a tree block is COW'd through a tree, there are four cases: + * + * The reference count of the block is one and the tree is the block's + * owner tree. Nothing to do in this case. + * + * The reference count of the block is one and the tree is not the + * block's owner tree. In this case, full back refs is used for pointers + * in the block. Remove these full back refs, add implicit back refs for + * every pointers in the new block. + * + * The reference count of the block is greater than one and the tree is + * the block's owner tree. In this case, implicit back refs is used for + * pointers in the block. Add full back refs for every pointers in the + * block, increase lower level extents' reference counts. The original + * implicit back refs are entailed to the new block. + * + * The reference count of the block is greater than one and the tree is + * not the block's owner tree. Add implicit back refs for every pointer in + * the new block, increase lower level extents' reference count. + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, + * The key type is used to differentiate between types of back refs. + * There are different meanings of the key offset for different types + * of back refs. + * * File extents can be referenced by: * * - multiple snapshots, subvolumes, or different generations in one subvol * - different files inside a single subvolume * - different offsets inside a file (bookend extents in file.c) * - * The extent ref structure has fields for: + * The extent ref structure for the implicit back refs has fields for: * * - Objectid of the subvolume root - * - Generation number of the tree holding the reference * - objectid of the file holding the reference - * - number of references holding by parent node (alway 1 for tree blocks) - * - * Btree leaf may hold multiple references to a file extent. In most cases, - * these references are from same file and the corresponding offsets inside - * the file are close together. - * - * When a file extent is allocated the fields are filled in: - * (root_key.objectid, trans->transid, inode objectid, 1) + * - original offset in the file + * - how many bookend extents * - * When a leaf is cow'd new references are added for every file extent found - * in the leaf. It looks similar to the create case, but trans->transid will - * be different when the block is cow'd. + * The key offset for the implicit back refs is hash of the first + * three fields. * - * (root_key.objectid, trans->transid, inode objectid, - * number of references in the leaf) + * The extent ref structure for the full back refs has field for: * - * When a file extent is removed either during snapshot deletion or - * file truncation, we find the corresponding back reference and check - * the following fields: + * - number of pointers in the tree leaf * - * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), - * inode objectid) + * The key offset for the implicit back refs is the first byte of + * the tree leaf * - * Btree extents can be referenced by: - * - * - Different subvolumes - * - Different generations of the same subvolume - * - * When a tree block is created, back references are inserted: + * When a file extent is allocated, The implicit back refs is used. + * the fields are filled in: * - * (root->root_key.objectid, trans->transid, level, 1) + * (root_key.objectid, inode objectid, offset in file, 1) * - * When a tree block is cow'd, new back references are added for all the - * blocks it points to. If the tree block isn't in reference counted root, - * the old back references are removed. These new back references are of - * the form (trans->transid will have increased since creation): + * When a file extent is removed file truncation, we find the + * corresponding implicit back refs and check the following fields: * - * (root->root_key.objectid, trans->transid, level, 1) + * (btrfs_header_owner(leaf), inode objectid, offset in file) * - * When a backref is in deleting, the following fields are checked: + * Btree extents can be referenced by: * - * if backref was for a tree root: - * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) - * else - * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * - Different subvolumes * - * Back Reference Key composing: + * Both the implicit back refs and the full back refs for tree blocks + * only consist of key. The key offset for the implicit back refs is + * objectid of block's owner tree. The key offset for the full back refs + * is the first byte of parent block. * - * The key objectid corresponds to the first byte in the extent, the key - * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first - * byte of parent extent. If a extent is tree root, the key offset is set - * to the key objectid. + * When implicit back refs is used, information about the lowest key and + * level of the tree block are required. These information are stored in + * tree block info structure. */ -static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid, int del) +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int convert_extent_item_v0(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 owner, u32 extra_size) { + struct btrfs_extent_item *item; + struct btrfs_extent_item_v0 *ei0; + struct btrfs_extent_ref_v0 *ref0; + struct btrfs_tree_block_info *bi; + struct extent_buffer *leaf; struct btrfs_key key; - struct btrfs_extent_ref *ref; + struct btrfs_key found_key; + u32 new_size = sizeof(*item); + u64 refs; + int ret; + + leaf = path->nodes[0]; + BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0)); + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + refs = btrfs_extent_refs_v0(leaf, ei0); + + if (owner == (u64)-1) { + while (1) { + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0]); + BUG_ON(key.objectid != found_key.objectid); + if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) { + path->slots[0]++; + continue; + } + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + owner = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + } + btrfs_release_path(root, path); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) + new_size += sizeof(*bi); + + new_size -= sizeof(*ei0); + ret = btrfs_search_slot(trans, root, &key, path, + new_size + extra_size, 1); + if (ret < 0) + return ret; + BUG_ON(ret); + + ret = btrfs_extend_item(trans, root, path, new_size); + BUG_ON(ret); + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, item, refs); + /* FIXME: get real generation */ + btrfs_set_extent_generation(leaf, item, 0); + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_set_extent_flags(leaf, item, + BTRFS_EXTENT_FLAG_TREE_BLOCK | + BTRFS_BLOCK_FLAG_FULL_BACKREF); + bi = (struct btrfs_tree_block_info *)(item + 1); + /* FIXME: get first key of the block */ + memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi)); + btrfs_set_tree_block_level(leaf, bi, (int)owner); + } else { + btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} +#endif + +static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) +{ + u32 high_crc = ~(u32)0; + u32 low_crc = ~(u32)0; + __le64 lenum; + + lenum = cpu_to_le64(root_objectid); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(owner); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + lenum = cpu_to_le64(offset); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + + return ((u64)high_crc << 31) ^ (u64)low_crc; +} + +static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref) +{ + return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref), + btrfs_extent_data_ref_objectid(leaf, ref), + btrfs_extent_data_ref_offset(leaf, ref)); +} + +static int match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) +{ + if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != owner || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + return 0; + return 1; +} + +static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, + u64 owner, u64 offset) +{ + struct btrfs_key key; + struct btrfs_extent_data_ref *ref; struct extent_buffer *leaf; - u64 ref_objectid; + u32 nritems; int ret; + int recow; + int err = -ENOENT; key.objectid = bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = parent; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + } +again: + recow = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } - ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); - if (ret < 0) - goto out; - if (ret > 0) { - ret = -ENOENT; - goto out; + if (parent) { + if (!ret) + return 0; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + key.type = BTRFS_EXTENT_REF_V0_KEY; + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) { + err = ret; + goto fail; + } + if (!ret) + return 0; +#endif + goto fail; } leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - ref_objectid = btrfs_ref_objectid(leaf, ref); - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != ref_generation || - (ref_objectid != owner_objectid && - ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { - ret = -EIO; - WARN_ON(1); - goto out; + nritems = btrfs_header_nritems(leaf); + while (1) { + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + err = ret; + if (ret) + goto fail; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != bytenr || + key.type != BTRFS_EXTENT_DATA_REF_KEY) + goto fail; + + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + err = 0; + break; + } + path->slots[0]++; } - ret = 0; -out: - return ret; +fail: + return err; } -static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid, - int refs_to_add) +static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add) { struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref; + u32 size; u32 num_refs; int ret; key.objectid = bytenr; - key.type = BTRFS_EXTENT_REF_KEY; - key.offset = parent; + if (parent) { + key.type = BTRFS_SHARED_DATA_REF_KEY; + key.offset = parent; + size = sizeof(struct btrfs_shared_data_ref); + } else { + key.type = BTRFS_EXTENT_DATA_REF_KEY; + key.offset = hash_extent_data_ref(root_objectid, + owner, offset); + size = sizeof(struct btrfs_extent_data_ref); + } - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); - if (ret == 0) { - leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - btrfs_set_ref_root(leaf, ref, ref_root); - btrfs_set_ref_generation(leaf, ref, ref_generation); - btrfs_set_ref_objectid(leaf, ref, owner_objectid); - btrfs_set_ref_num_refs(leaf, ref, refs_to_add); - } else if (ret == -EEXIST) { - u64 existing_owner; - - BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); - leaf = path->nodes[0]; + ret = btrfs_insert_empty_item(trans, root, path, &key, size); + if (ret && ret != -EEXIST) + goto fail; + + leaf = path->nodes[0]; + if (parent) { + struct btrfs_shared_data_ref *ref; ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - if (btrfs_ref_root(leaf, ref) != ref_root || - btrfs_ref_generation(leaf, ref) != ref_generation) { - ret = -EIO; - WARN_ON(1); - goto out; + struct btrfs_shared_data_ref); + if (ret == 0) { + btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_shared_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_shared_data_ref_count(leaf, ref, num_refs); } + } else { + struct btrfs_extent_data_ref *ref; + while (ret == -EEXIST) { + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (match_extent_data_ref(leaf, ref, root_objectid, + owner, offset)) + break; + btrfs_release_path(root, path); + key.offset++; + ret = btrfs_insert_empty_item(trans, root, path, &key, + size); + if (ret && ret != -EEXIST) + goto fail; - num_refs = btrfs_ref_num_refs(leaf, ref); - BUG_ON(num_refs == 0); - btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add); - - existing_owner = btrfs_ref_objectid(leaf, ref); - if (existing_owner != owner_objectid && - existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { - btrfs_set_ref_objectid(leaf, ref, - BTRFS_MULTIPLE_OBJECTIDS); + leaf = path->nodes[0]; + } + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + if (ret == 0) { + btrfs_set_extent_data_ref_root(leaf, ref, + root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add); + } else { + num_refs = btrfs_extent_data_ref_count(leaf, ref); + num_refs += refs_to_add; + btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } - ret = 0; - } else { - goto out; } - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(path->nodes[0]); -out: + btrfs_mark_buffer_dirty(leaf); + ret = 0; +fail: btrfs_release_path(root, path); return ret; } -static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - int refs_to_drop) +static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int refs_to_drop) { + struct btrfs_key key; + struct btrfs_extent_data_ref *ref1 = NULL; + struct btrfs_shared_data_ref *ref2 = NULL; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref; - u32 num_refs; + u32 num_refs = 0; int ret = 0; leaf = path->nodes[0]; - ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); - num_refs = btrfs_ref_num_refs(leaf, ref); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); +#endif + } else { + BUG(); + } + BUG_ON(num_refs < refs_to_drop); num_refs -= refs_to_drop; + if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); } else { - btrfs_set_ref_num_refs(leaf, ref, num_refs); + if (key.type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); + else if (key.type == BTRFS_SHARED_DATA_REF_KEY) + btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + else { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + btrfs_set_ref_count_v0(leaf, ref0, num_refs); + } +#endif btrfs_mark_buffer_dirty(leaf); } - btrfs_release_path(root, path); return ret; } -#ifdef BIO_RW_DISCARD -static void btrfs_issue_discard(struct block_device *bdev, - u64 start, u64 len) +static noinline u32 extent_data_ref_count(struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); -} + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_data_ref *ref1; + struct btrfs_shared_data_ref *ref2; + u32 num_refs = 0; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (iref) { + if (btrfs_extent_inline_ref_type(leaf, iref) == + BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else { + ref2 = (struct btrfs_shared_data_ref *)(iref + 1); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); + } + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + ref1 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_data_ref); + num_refs = btrfs_extent_data_ref_count(leaf, ref1); + } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + ref2 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_shared_data_ref); + num_refs = btrfs_shared_data_ref_count(leaf, ref2); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref_v0); + num_refs = btrfs_ref_count_v0(leaf, ref0); #endif + } else { + WARN_ON(1); + } + return num_refs; +} -static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, - u64 num_bytes) +static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) { -#ifdef BIO_RW_DISCARD + struct btrfs_key key; int ret; - u64 map_length = num_bytes; - struct btrfs_multi_bio *multi = NULL; - - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &map_length, &multi, 0); - if (!ret) { - struct btrfs_bio_stripe *stripe = multi->stripes; - int i; - - if (map_length > num_bytes) - map_length = num_bytes; - for (i = 0; i < multi->num_stripes; i++, stripe++) { - btrfs_issue_discard(stripe->dev->bdev, - stripe->physical, - map_length); - } - kfree(multi); + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; } - return ret; -#else - return 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ret == -ENOENT && parent) { + btrfs_release_path(root, path); + key.type = BTRFS_EXTENT_REF_V0_KEY; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -ENOENT; + } #endif + return ret; } -static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, - u64 orig_parent, u64 parent, - u64 orig_root, u64 ref_root, - u64 orig_generation, u64 ref_generation, - u64 owner_objectid) +static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 root_objectid) { + struct btrfs_key key; int ret; - int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID; - ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes, - orig_parent, parent, orig_root, - ref_root, orig_generation, - ref_generation, owner_objectid, pin); - BUG_ON(ret); + key.objectid = bytenr; + if (parent) { + key.type = BTRFS_SHARED_BLOCK_REF_KEY; + key.offset = parent; + } else { + key.type = BTRFS_TREE_BLOCK_REF_KEY; + key.offset = root_objectid; + } + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + btrfs_release_path(root, path); return ret; } -int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 orig_parent, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) +static inline int extent_ref_type(u64 parent, u64 owner) { - int ret; - if (ref_root == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - return 0; - - ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, parent, ref_root, - ref_root, ref_generation, - ref_generation, owner_objectid); - return ret; + int type; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + if (parent > 0) + type = BTRFS_SHARED_BLOCK_REF_KEY; + else + type = BTRFS_TREE_BLOCK_REF_KEY; + } else { + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; + } + return type; } -static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, - u64 orig_parent, u64 parent, - u64 orig_root, u64 ref_root, - u64 orig_generation, u64 ref_generation, - u64 owner_objectid) -{ - int ret; - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_REF, 0); - BUG_ON(ret); - return ret; -} +static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) -static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 parent, u64 ref_root, - u64 ref_generation, u64 owner_objectid, - int refs_to_add) { - struct btrfs_path *path; - int ret; - struct btrfs_key key; - struct extent_buffer *l; - struct btrfs_extent_item *item; - u32 refs; + int level; + BUG_ON(!path->keep_locks); + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + btrfs_assert_tree_locked(path->nodes[level]); + if (path->slots[level] + 1 >= + btrfs_header_nritems(path->nodes[level])) + continue; + if (level == 0) + btrfs_item_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + else + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + return 1; +} - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; +/* + * look for inline back ref. if back ref is found, *ref_ret is set + * to the address of inline back ref, and 0 is returned. + * + * if back ref isn't found, *ref_ret is set to the address where it + * should be inserted, and -ENOENT is returned. + * + * if insert is true and there are too many inline back refs, the path + * points to the extent item, and -EAGAIN is returned. + * + * NOTE: inline back refs are ordered in the same way that back ref + * items in the tree are ordered. + */ +static noinline_for_stack +int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int insert) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + u64 flags; + u64 item_size; + unsigned long ptr; + unsigned long end; + int extra_size; + int type; + int want; + int ret; + int err = 0; - path->reada = 1; - path->leave_spinning = 1; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; - /* first find the extent item and update its reference count */ - ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, - path, 0, 1); + want = extent_ref_type(parent, owner); + if (insert) { + extra_size = btrfs_extent_inline_ref_size(want); + if (owner >= BTRFS_FIRST_FREE_OBJECTID) + path->keep_locks = 1; + } else + extra_size = -1; + ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); if (ret < 0) { - btrfs_set_path_blocking(path); + err = ret; + goto out; + } + BUG_ON(ret); + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + if (!insert) { + err = -ENOENT; + goto out; + } + ret = convert_extent_item_v0(trans, root, path, owner, + extra_size); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + + if (owner < BTRFS_FIRST_FREE_OBJECTID && insert && + item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + flags = btrfs_extent_flags(leaf, ei); + + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + item_size; + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ptr += sizeof(struct btrfs_tree_block_info); + BUG_ON(ptr > end); + } else { + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA)); + } + + err = -ENOENT; + while (1) { + if (ptr >= end) { + WARN_ON(ptr > end); + break; + } + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(leaf, iref); + if (want < type) + break; + if (want > type) { + ptr += btrfs_extent_inline_ref_size(type); + continue; + } + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (match_extent_data_ref(leaf, dref, root_objectid, + owner, offset)) { + err = 0; + break; + } + if (hash_extent_data_ref_item(leaf, dref) < + hash_extent_data_ref(root_objectid, owner, offset)) + break; + } else { + u64 ref_offset; + ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); + if (parent > 0) { + if (parent == ref_offset) { + err = 0; + break; + } + if (ref_offset < parent) + break; + } else { + if (root_objectid == ref_offset) { + err = 0; + break; + } + if (ref_offset < root_objectid) + break; + } + } + ptr += btrfs_extent_inline_ref_size(type); + } + if (err == -ENOENT && insert) { + if (item_size + extra_size >= + BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { + err = -EAGAIN; + goto out; + } + /* + * To add new inline back ref, we have to make sure + * there is no corresponding back ref item. + * For simplicity, we just do not add new inline back + * ref if there is any kind of item for this block + */ + if (owner >= BTRFS_FIRST_FREE_OBJECTID && + find_next_key(path, &key) == 0 && key.objectid == bytenr) { + err = -EAGAIN; + goto out; + } + } + *ref_ret = (struct btrfs_extent_inline_ref *)ptr; +out: + if (insert && owner >= BTRFS_FIRST_FREE_OBJECTID) { + path->keep_locks = 0; + btrfs_unlock_up_safe(path, 1); + } + return err; +} + +/* + * helper to add new inline back ref + */ +static noinline_for_stack +int setup_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + unsigned long ptr; + unsigned long end; + unsigned long item_offset; + u64 refs; + int size; + int type; + int ret; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + item_offset = (unsigned long)iref - (unsigned long)ei; + + type = extent_ref_type(parent, owner); + size = btrfs_extent_inline_ref_size(type); + + ret = btrfs_extend_item(trans, root, path, size); + BUG_ON(ret); + + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + refs += refs_to_add; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + ptr = (unsigned long)ei + item_offset; + end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]); + if (ptr < end - size) + memmove_extent_buffer(leaf, ptr + size, ptr, + end - size - ptr); + + iref = (struct btrfs_extent_inline_ref *)ptr; + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + struct btrfs_extent_data_ref *dref; + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, dref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, dref, owner); + btrfs_set_extent_data_ref_offset(leaf, dref, offset); + btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + struct btrfs_shared_data_ref *sref; + sref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref **ref_ret, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, ref_ret, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 0); + if (ret != -ENOENT) return ret; + + btrfs_release_path(root, path); + *ref_ret = NULL; + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = lookup_tree_block_ref(trans, root, path, bytenr, parent, + root_objectid); + } else { + ret = lookup_extent_data_ref(trans, root, path, bytenr, parent, + root_objectid, owner, offset); } + return ret; +} - if (ret > 0) { - WARN_ON(1); - btrfs_free_path(path); - return -EIO; +/* + * helper to update/remove inline back ref + */ +static noinline_for_stack +int update_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_mod, + struct btrfs_delayed_extent_op *extent_op) +{ + struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_data_ref *dref = NULL; + struct btrfs_shared_data_ref *sref = NULL; + unsigned long ptr; + unsigned long end; + u32 item_size; + int size; + int type; + int ret; + u64 refs; + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + refs += refs_to_mod; + btrfs_set_extent_refs(leaf, ei, refs); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + + type = btrfs_extent_inline_ref_type(leaf, iref); + + if (type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + refs = btrfs_extent_data_ref_count(leaf, dref); + } else if (type == BTRFS_SHARED_DATA_REF_KEY) { + sref = (struct btrfs_shared_data_ref *)(iref + 1); + refs = btrfs_shared_data_ref_count(leaf, sref); + } else { + refs = 1; + BUG_ON(refs_to_mod != -1); } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); - if (key.objectid != bytenr) { - btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); - printk(KERN_ERR "btrfs wanted %llu found %llu\n", - (unsigned long long)bytenr, - (unsigned long long)key.objectid); - BUG(); + BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + refs += refs_to_mod; + + if (refs > 0) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + btrfs_set_extent_data_ref_count(leaf, dref, refs); + else + btrfs_set_shared_data_ref_count(leaf, sref, refs); + } else { + size = btrfs_extent_inline_ref_size(type); + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + if (ptr + size < end) + memmove_extent_buffer(leaf, ptr, ptr + size, + end - ptr - size); + item_size -= size; + ret = btrfs_truncate_item(trans, root, path, item_size, 1); + BUG_ON(ret); + } + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int insert_inline_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, + u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_extent_inline_ref *iref; + int ret; + + ret = lookup_inline_extent_backref(trans, root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner, offset, 1); + if (ret == 0) { + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + ret = update_inline_extent_backref(trans, root, path, iref, + refs_to_add, extent_op); + } else if (ret == -ENOENT) { + ret = setup_inline_extent_backref(trans, root, path, iref, + parent, root_objectid, + owner, offset, refs_to_add, + extent_op); } - BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); + return ret; +} - item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); +static int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add) +{ + int ret; + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + BUG_ON(refs_to_add != 1); + ret = insert_tree_block_ref(trans, root, path, bytenr, + parent, root_objectid); + } else { + ret = insert_extent_data_ref(trans, root, path, bytenr, + parent, root_objectid, + owner, offset, refs_to_add); + } + return ret; +} - refs = btrfs_extent_refs(l, item); - btrfs_set_extent_refs(l, item, refs + refs_to_add); - btrfs_unlock_up_safe(path, 1); +static int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_extent_inline_ref *iref, + int refs_to_drop, int is_data) +{ + int ret; - btrfs_mark_buffer_dirty(path->nodes[0]); + BUG_ON(!is_data && refs_to_drop != 1); + if (iref) { + ret = update_inline_extent_backref(trans, root, path, iref, + -refs_to_drop, NULL); + } else if (is_data) { + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + } else { + ret = btrfs_del_item(trans, root, path); + } + return ret; +} + +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); +} +#endif + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ +#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); + } + + return ret; +#else + return 0; +#endif +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner, u64 offset) +{ + int ret; + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && + root_objectid == BTRFS_TREE_LOG_OBJECTID); + + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_ADD_DELAYED_REF, NULL); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_REF, NULL); + } + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, + u64 parent, u64 root_objectid, + u64 owner, u64 offset, int refs_to_add, + struct btrfs_delayed_extent_op *extent_op) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_item *item; + u64 refs; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + path->leave_spinning = 1; + /* this will setup the path even if it fails to insert the back ref */ + ret = insert_inline_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, num_bytes, parent, + root_objectid, owner, offset, + refs_to_add, extent_op); + if (ret == 0) + goto out; + + if (ret != -EAGAIN) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, item); + btrfs_set_extent_refs(leaf, item, refs + refs_to_add); + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, item); + + btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root->fs_info->extent_root, path); path->reada = 1; @@ -802,56 +1522,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans, /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, - path, bytenr, parent, - ref_root, ref_generation, - owner_objectid, refs_to_add); + path, bytenr, parent, root_objectid, + owner, offset, refs_to_add); BUG_ON(ret); +out: btrfs_free_path(path); - return 0; + return err; } -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 ref_root, u64 ref_generation, - u64 owner_objectid) +static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) +{ + int ret = 0; + struct btrfs_delayed_data_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; + u64 flags = 0; + + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ref = btrfs_delayed_node_to_data_ref(node); + if (node->type == BTRFS_SHARED_DATA_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + if (extent_op) { + BUG_ON(extent_op->update_key); + flags |= extent_op->flags_to_set; + } + ret = alloc_reserved_file_extent(trans, root, + parent, ref_root, flags, + ref->objectid, ref->offset, + &ins, node->ref_mod); + update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, + ref_root, ref->objectid, + ref->offset, node->ref_mod, + extent_op); + } else { + BUG(); + } + return ret; +} + +static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, + struct extent_buffer *leaf, + struct btrfs_extent_item *ei) +{ + u64 flags = btrfs_extent_flags(leaf, ei); + if (extent_op->update_flags) { + flags |= extent_op->flags_to_set; + btrfs_set_extent_flags(leaf, ei, flags); + } + + if (extent_op->update_key) { + struct btrfs_tree_block_info *bi; + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_set_tree_block_key(leaf, bi, &extent_op->key); + } +} + +static int run_delayed_extent_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op) { + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + u32 item_size; int ret; - if (ref_root == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) - return 0; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + path->reada = 1; + path->leave_spinning = 1; + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, + path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + err = -EIO; + goto out; + } + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + ret = convert_extent_item_v0(trans, root->fs_info->extent_root, + path, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + } +#endif + BUG_ON(item_size < sizeof(*ei)); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + __run_delayed_extent_op(extent_op, leaf, ei); - ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent, - 0, ref_root, 0, ref_generation, - owner_objectid); - return ret; + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return err; } -static int drop_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node) +static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { int ret = 0; - struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node); + struct btrfs_delayed_tree_ref *ref; + struct btrfs_key ins; + u64 parent = 0; + u64 ref_root = 0; - BUG_ON(node->ref_mod == 0); - ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, ref->pin, node->ref_mod); + ins.objectid = node->bytenr; + ins.offset = node->num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ref = btrfs_delayed_node_to_tree_ref(node); + if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) + parent = ref->parent; + else + ref_root = ref->root; + + BUG_ON(node->ref_mod != 1); + if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + BUG_ON(!extent_op || !extent_op->update_flags || + !extent_op->update_key); + ret = alloc_reserved_tree_block(trans, root, + parent, ref_root, + extent_op->flags_to_set, + &extent_op->key, + ref->level, &ins); + update_reserved_extents(root, ins.objectid, ins.offset, 0); + } else if (node->action == BTRFS_ADD_DELAYED_REF) { + ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else if (node->action == BTRFS_DROP_DELAYED_REF) { + ret = __btrfs_free_extent(trans, root, node->bytenr, + node->num_bytes, parent, ref_root, + ref->level, 0, 1, extent_op); + } else { + BUG(); + } return ret; } + /* helper function to actually process a single delayed ref entry */ -static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_delayed_ref_node *node, - int insert_reserved) +static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_delayed_ref_node *node, + struct btrfs_delayed_extent_op *extent_op, + int insert_reserved) { int ret; - struct btrfs_delayed_ref *ref; - - if (node->parent == (u64)-1) { + if (btrfs_delayed_ref_is_head(node)) { struct btrfs_delayed_ref_head *head; /* * we've hit the end of the chain and we were supposed @@ -859,44 +1720,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans, * deleted before we ever needed to insert it, so all * we have to do is clean up the accounting */ + BUG_ON(extent_op); + head = btrfs_delayed_node_to_head(node); if (insert_reserved) { + if (head->is_data) { + ret = btrfs_del_csums(trans, root, + node->bytenr, + node->num_bytes); + BUG_ON(ret); + } + btrfs_update_pinned_extents(root, node->bytenr, + node->num_bytes, 1); update_reserved_extents(root, node->bytenr, node->num_bytes, 0); } - head = btrfs_delayed_node_to_head(node); mutex_unlock(&head->mutex); return 0; } - ref = btrfs_delayed_node_to_ref(node); - if (ref->action == BTRFS_ADD_DELAYED_REF) { - if (insert_reserved) { - struct btrfs_key ins; - - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - - /* record the full extent allocation */ - ret = __btrfs_alloc_reserved_extent(trans, root, - node->parent, ref->root, - ref->generation, ref->owner_objectid, - &ins, node->ref_mod); - update_reserved_extents(root, node->bytenr, - node->num_bytes, 0); - } else { - /* just add one backref */ - ret = add_extent_ref(trans, root, node->bytenr, - node->num_bytes, - node->parent, ref->root, ref->generation, - ref->owner_objectid, node->ref_mod); - } - BUG_ON(ret); - } else if (ref->action == BTRFS_DROP_DELAYED_REF) { - WARN_ON(insert_reserved); - ret = drop_delayed_ref(trans, root, node); - } - return 0; + if (node->type == BTRFS_TREE_BLOCK_REF_KEY || + node->type == BTRFS_SHARED_BLOCK_REF_KEY) + ret = run_delayed_tree_ref(trans, root, node, extent_op, + insert_reserved); + else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || + node->type == BTRFS_SHARED_DATA_REF_KEY) + ret = run_delayed_data_ref(trans, root, node, extent_op, + insert_reserved); + else + BUG(); + return ret; } static noinline struct btrfs_delayed_ref_node * @@ -919,7 +1771,7 @@ again: rb_node); if (ref->bytenr != head->node.bytenr) break; - if (btrfs_delayed_node_to_ref(ref)->action == action) + if (ref->action == action) return ref; node = rb_prev(node); } @@ -937,6 +1789,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_ref_head *locked_ref = NULL; + struct btrfs_delayed_extent_op *extent_op; int ret; int count = 0; int must_insert_reserved = 0; @@ -975,6 +1828,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, must_insert_reserved = locked_ref->must_insert_reserved; locked_ref->must_insert_reserved = 0; + extent_op = locked_ref->extent_op; + locked_ref->extent_op = NULL; + /* * locked_ref is the head node, so we have to go one * node back for any delayed ref updates @@ -986,6 +1842,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * so that any accounting fixes can happen */ ref = &locked_ref->node; + + if (extent_op && must_insert_reserved) { + kfree(extent_op); + extent_op = NULL; + } + + if (extent_op) { + spin_unlock(&delayed_refs->lock); + + ret = run_delayed_extent_op(trans, root, + ref, extent_op); + BUG_ON(ret); + kfree(extent_op); + + cond_resched(); + spin_lock(&delayed_refs->lock); + continue; + } + list_del_init(&locked_ref->cluster); locked_ref = NULL; } @@ -993,14 +1868,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); delayed_refs->num_entries--; + spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root, ref, + ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); BUG_ON(ret); - btrfs_put_delayed_ref(ref); + btrfs_put_delayed_ref(ref); + kfree(extent_op); count++; + cond_resched(); spin_lock(&delayed_refs->lock); } @@ -1095,25 +1973,112 @@ out: return 0; } -int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 bytenr) +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 flags, + int is_data) +{ + struct btrfs_delayed_extent_op *extent_op; + int ret; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + if (!extent_op) + return -ENOMEM; + + extent_op->flags_to_set = flags; + extent_op->update_flags = 1; + extent_op->update_key = 0; + extent_op->is_data = is_data ? 1 : 0; + + ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); + if (ret) + kfree(extent_op); + return ret; +} + +static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_data_ref *data_ref; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + int ret = 0; + + ret = -ENOENT; + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (!head) + goto out; + + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + return -EAGAIN; + } + + node = rb_prev(&head->node.rb_node); + if (!node) + goto out_unlock; + + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + + if (ref->bytenr != bytenr) + goto out_unlock; + + ret = 1; + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) + goto out_unlock; + + data_ref = btrfs_delayed_node_to_data_ref(ref); + + node = rb_prev(node); + if (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + if (ref->bytenr == bytenr) + goto out_unlock; + } + + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || data_ref->offset != offset) + goto out_unlock; + + ret = 0; +out_unlock: + mutex_unlock(&head->mutex); +out: + spin_unlock(&delayed_refs->lock); + return ret; +} + +static noinline int check_committed_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid, u64 offset, u64 bytenr) { struct btrfs_root *extent_root = root->fs_info->extent_root; - struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_extent_ref *ref_item; + struct btrfs_extent_data_ref *ref; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_item *ei; struct btrfs_key key; - struct btrfs_key found_key; - u64 ref_root; - u64 last_snapshot; - u32 nritems; + u32 item_size; int ret; key.objectid = bytenr; key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; - path = btrfs_alloc_path(); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1125,55 +2090,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, path->slots[0]--; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (found_key.objectid != bytenr || - found_key.type != BTRFS_EXTENT_ITEM_KEY) + if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY) goto out; - last_snapshot = btrfs_root_last_snapshot(&root->root_item); - while (1) { - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(extent_root, path); - if (ret < 0) - goto out; - if (ret == 0) - continue; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.objectid != bytenr) - break; + ret = 1; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + goto out; + } +#endif + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - if (found_key.type != BTRFS_EXTENT_REF_KEY) { - path->slots[0]++; - continue; - } + if (item_size != sizeof(*ei) + + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + goto out; - ref_item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - ref_root = btrfs_ref_root(leaf, ref_item); - if ((ref_root != root->root_key.objectid && - ref_root != BTRFS_TREE_LOG_OBJECTID) || - objectid != btrfs_ref_objectid(leaf, ref_item)) { - ret = 1; - goto out; - } - if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { - ret = 1; + if (btrfs_extent_generation(leaf, ei) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + if (btrfs_extent_inline_ref_type(leaf, iref) != + BTRFS_EXTENT_DATA_REF_KEY) + goto out; + + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + if (btrfs_extent_refs(leaf, ei) != + btrfs_extent_data_ref_count(leaf, ref) || + btrfs_extent_data_ref_root(leaf, ref) != + root->root_key.objectid || + btrfs_extent_data_ref_objectid(leaf, ref) != objectid || + btrfs_extent_data_ref_offset(leaf, ref) != offset) + goto out; + + ret = 0; +out: + return ret; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr) +{ + struct btrfs_path *path; + int ret; + int ret2; + + path = btrfs_alloc_path(); + if (!path) + return -ENOENT; + + do { + ret = check_committed_ref(trans, root, path, objectid, + offset, bytenr); + if (ret && ret != -ENOENT) goto out; - } - path->slots[0]++; + ret2 = check_delayed_ref(trans, root, path, objectid, + offset, bytenr); + } while (ret2 == -EAGAIN); + + if (ret2 && ret2 != -ENOENT) { + ret = ret2; + goto out; } - ret = 0; + + if (ret != -ENOENT || ret2 != -ENOENT) + ret = 0; out: btrfs_free_path(path); return ret; } +#if 0 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, u32 nr_extents) { @@ -1291,191 +2284,49 @@ static int refsort_cmp(const void *a_void, const void *b_void) return 1; return 0; } +#endif - -noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans, +static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *orig_buf, - struct extent_buffer *buf, u32 *nr_extents) + struct extent_buffer *buf, + int full_backref, int inc) { u64 bytenr; + u64 num_bytes; + u64 parent; u64 ref_root; - u64 orig_root; - u64 ref_generation; - u64 orig_generation; - struct refsort *sorted; u32 nritems; - u32 nr_file_extents = 0; struct btrfs_key key; struct btrfs_file_extent_item *fi; int i; int level; int ret = 0; - int faili = 0; - int refi = 0; - int slot; int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, - u64, u64, u64, u64, u64, u64, u64, u64, u64); + u64, u64, u64, u64, u64, u64); ref_root = btrfs_header_owner(buf); - ref_generation = btrfs_header_generation(buf); - orig_root = btrfs_header_owner(orig_buf); - orig_generation = btrfs_header_generation(orig_buf); - nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); - sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS); - BUG_ON(!sorted); - - if (root->ref_cows) { - process_func = __btrfs_inc_extent_ref; - } else { - if (level == 0 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - goto out; - if (level != 0 && - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) - goto out; - process_func = __btrfs_update_extent_ref; - } - - /* - * we make two passes through the items. In the first pass we - * only record the byte number and slot. Then we sort based on - * byte number and do the actual work based on the sorted results - */ - for (i = 0; i < nritems; i++) { - cond_resched(); - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(buf, i, - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(buf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; - - nr_file_extents++; - sorted[refi].bytenr = bytenr; - sorted[refi].slot = i; - refi++; - } else { - bytenr = btrfs_node_blockptr(buf, i); - sorted[refi].bytenr = bytenr; - sorted[refi].slot = i; - refi++; - } - } - /* - * if refi == 0, we didn't actually put anything into the sorted - * array and we're done - */ - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - for (i = 0; i < refi; i++) { - cond_resched(); - slot = sorted[i].slot; - bytenr = sorted[i].bytenr; - - if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, slot); - fi = btrfs_item_ptr(buf, slot, - struct btrfs_file_extent_item); - - bytenr = btrfs_file_extent_disk_bytenr(buf, fi); - if (bytenr == 0) - continue; - - ret = process_func(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - key.objectid); - - if (ret) { - faili = slot; - WARN_ON(1); - goto fail; - } - } else { - ret = process_func(trans, root, bytenr, buf->len, - orig_buf->start, buf->start, - orig_root, ref_root, - orig_generation, ref_generation, - level - 1); - if (ret) { - faili = slot; - WARN_ON(1); - goto fail; - } - } - } -out: - kfree(sorted); - if (nr_extents) { - if (level == 0) - *nr_extents = nr_file_extents; - else - *nr_extents = nritems; - } - return 0; -fail: - kfree(sorted); - WARN_ON(1); - return ret; -} - -int btrfs_update_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *orig_buf, - struct extent_buffer *buf, int start_slot, int nr) - -{ - u64 bytenr; - u64 ref_root; - u64 orig_root; - u64 ref_generation; - u64 orig_generation; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - int i; - int ret; - int slot; - int level; - - BUG_ON(start_slot < 0); - BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); - - ref_root = btrfs_header_owner(buf); - ref_generation = btrfs_header_generation(buf); - orig_root = btrfs_header_owner(orig_buf); - orig_generation = btrfs_header_generation(orig_buf); - level = btrfs_header_level(buf); + if (!root->ref_cows && level == 0) + return 0; - if (!root->ref_cows) { - if (level == 0 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - return 0; - if (level != 0 && - root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) - return 0; - } + if (inc) + process_func = btrfs_inc_extent_ref; + else + process_func = btrfs_free_extent; - for (i = 0, slot = start_slot; i < nr; i++, slot++) { - cond_resched(); + if (full_backref) + parent = buf->start; + else + parent = 0; + + for (i = 0; i < nritems; i++) { if (level == 0) { - btrfs_item_key_to_cpu(buf, &key, slot); + btrfs_item_key_to_cpu(buf, &key, i); if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) continue; - fi = btrfs_item_ptr(buf, slot, + fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); if (btrfs_file_extent_type(buf, fi) == BTRFS_FILE_EXTENT_INLINE) @@ -1483,28 +2334,39 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans, bytenr = btrfs_file_extent_disk_bytenr(buf, fi); if (bytenr == 0) continue; - ret = __btrfs_update_extent_ref(trans, root, bytenr, - btrfs_file_extent_disk_num_bytes(buf, fi), - orig_buf->start, buf->start, - orig_root, ref_root, orig_generation, - ref_generation, key.objectid); + + num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); + key.offset -= btrfs_file_extent_offset(buf, fi); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, key.objectid, + key.offset); if (ret) goto fail; } else { - bytenr = btrfs_node_blockptr(buf, slot); - ret = __btrfs_update_extent_ref(trans, root, bytenr, - buf->len, orig_buf->start, - buf->start, orig_root, ref_root, - orig_generation, ref_generation, - level - 1); + bytenr = btrfs_node_blockptr(buf, i); + num_bytes = btrfs_level_size(root, level - 1); + ret = process_func(trans, root, bytenr, num_bytes, + parent, ref_root, level - 1, 0); if (ret) goto fail; } } return 0; fail: - WARN_ON(1); - return -1; + BUG(); + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 1); +} + +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref) +{ + return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } static int write_one_cache_group(struct btrfs_trans_handle *trans, @@ -2007,6 +2869,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 old_val; u64 byte_in_group; + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + old_val = btrfs_super_bytes_used(&info->super_copy); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_super_bytes_used(&info->super_copy, old_val); + + /* block accounting for root item */ + old_val = btrfs_root_used(&root->root_item); + if (alloc) + old_val += num_bytes; + else + old_val -= num_bytes; + btrfs_set_root_used(&root->root_item, old_val); + spin_unlock(&info->delalloc_lock); + while (total) { cache = btrfs_lookup_block_group(info, bytenr); if (!cache) @@ -2216,8 +3096,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, u64 header_owner = btrfs_header_owner(buf); u64 header_transid = btrfs_header_generation(buf); if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_owner != BTRFS_TREE_RELOC_OBJECTID && - header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID && header_transid == trans->transid && !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { *must_clean = buf; @@ -2235,63 +3113,77 @@ pinit: return 0; } -/* - * remove an extent from the root, returns 0 on success - */ -static int __free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, int mark_free, - int refs_to_drop) + +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 owner_objectid, + u64 owner_offset, int refs_to_drop, + struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; struct btrfs_key key; + struct btrfs_path *path; struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; int ret; + int is_data; int extent_slot = 0; int found_extent = 0; int num_to_del = 1; - struct btrfs_extent_item *ei; - u32 refs; + u32 item_size; + u64 refs; - key.objectid = bytenr; - btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - key.offset = num_bytes; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = 1; path->leave_spinning = 1; - ret = lookup_extent_backref(trans, extent_root, path, - bytenr, parent, root_objectid, - ref_generation, owner_objectid, 1); + + is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; + BUG_ON(!is_data && refs_to_drop != 1); + + ret = lookup_extent_backref(trans, extent_root, path, &iref, + bytenr, num_bytes, parent, + root_objectid, owner_objectid, + owner_offset); if (ret == 0) { - struct btrfs_key found_key; extent_slot = path->slots[0]; - while (extent_slot > 0) { - extent_slot--; - btrfs_item_key_to_cpu(path->nodes[0], &found_key, + while (extent_slot >= 0) { + btrfs_item_key_to_cpu(path->nodes[0], &key, extent_slot); - if (found_key.objectid != bytenr) + if (key.objectid != bytenr) break; - if (found_key.type == BTRFS_EXTENT_ITEM_KEY && - found_key.offset == num_bytes) { + if (key.type == BTRFS_EXTENT_ITEM_KEY && + key.offset == num_bytes) { found_extent = 1; break; } if (path->slots[0] - extent_slot > 5) break; + extent_slot--; } +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + item_size = btrfs_item_size_nr(path->nodes[0], extent_slot); + if (found_extent && item_size < sizeof(*ei)) + found_extent = 0; +#endif if (!found_extent) { + BUG_ON(iref); ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); + NULL, refs_to_drop, + is_data); BUG_ON(ret); btrfs_release_path(extent_root, path); path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); if (ret) { @@ -2307,82 +3199,98 @@ static int __free_extent(struct btrfs_trans_handle *trans, btrfs_print_leaf(extent_root, path->nodes[0]); WARN_ON(1); printk(KERN_ERR "btrfs unable to find ref byte nr %llu " - "parent %llu root %llu gen %llu owner %llu\n", + "parent %llu root %llu owner %llu offset %llu\n", (unsigned long long)bytenr, (unsigned long long)parent, (unsigned long long)root_objectid, - (unsigned long long)ref_generation, - (unsigned long long)owner_objectid); + (unsigned long long)owner_objectid, + (unsigned long long)owner_offset); } leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + BUG_ON(found_extent || extent_slot != path->slots[0]); + ret = convert_extent_item_v0(trans, extent_root, path, + owner_objectid, 0); + BUG_ON(ret < 0); + + btrfs_release_path(extent_root, path); + path->leave_spinning = 1; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, extent_slot); + } +#endif + BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); - refs = btrfs_extent_refs(leaf, ei); - - /* - * we're not allowed to delete the extent item if there - * are other delayed ref updates pending - */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + struct btrfs_tree_block_info *bi; + BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + bi = (struct btrfs_tree_block_info *)(ei + 1); + WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); + } + refs = btrfs_extent_refs(leaf, ei); BUG_ON(refs < refs_to_drop); refs -= refs_to_drop; - btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); - if (refs == 0 && found_extent && - path->slots[0] == extent_slot + 1) { - struct btrfs_extent_ref *ref; - ref = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_ref); - BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop); - /* if the back ref and the extent are next to each other - * they get deleted below in one shot + if (refs > 0) { + if (extent_op) + __run_delayed_extent_op(extent_op, leaf, ei); + /* + * In the case of inline back ref, reference count will + * be updated by remove_extent_backref */ - path->slots[0] = extent_slot; - num_to_del = 2; - } else if (found_extent) { - /* otherwise delete the extent back ref */ - ret = remove_extent_backref(trans, extent_root, path, - refs_to_drop); - BUG_ON(ret); - /* if refs are 0, we need to setup the path for deletion */ - if (refs == 0) { - btrfs_release_path(extent_root, path); - path->leave_spinning = 1; - ret = btrfs_search_slot(trans, extent_root, &key, path, - -1, 1); + if (iref) { + BUG_ON(!found_extent); + } else { + btrfs_set_extent_refs(leaf, ei, refs); + btrfs_mark_buffer_dirty(leaf); + } + if (found_extent) { + ret = remove_extent_backref(trans, extent_root, path, + iref, refs_to_drop, + is_data); BUG_ON(ret); } - } - - if (refs == 0) { - u64 super_used; - u64 root_used; + } else { + int mark_free = 0; struct extent_buffer *must_clean = NULL; - if (pin) { - ret = pin_down_bytes(trans, root, path, - bytenr, num_bytes, - owner_objectid >= BTRFS_FIRST_FREE_OBJECTID, - &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); + if (found_extent) { + BUG_ON(is_data && refs_to_drop != + extent_data_ref_count(root, path, iref)); + if (iref) { + BUG_ON(path->slots[0] != extent_slot); + } else { + BUG_ON(path->slots[0] != extent_slot + 1); + path->slots[0] = extent_slot; + num_to_del = 2; + } } - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, - super_used - num_bytes); - - /* block accounting for root item */ - root_used = btrfs_root_used(&root->root_item); - btrfs_set_root_used(&root->root_item, - root_used - num_bytes); - spin_unlock(&info->delalloc_lock); - + ret = pin_down_bytes(trans, root, path, bytenr, + num_bytes, is_data, &must_clean); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); /* * it is going to be very rare for someone to be waiting * on the block we're freeing. del_items might need to @@ -2403,7 +3311,7 @@ static int __free_extent(struct btrfs_trans_handle *trans, free_extent_buffer(must_clean); } - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); } else { @@ -2421,34 +3329,6 @@ static int __free_extent(struct btrfs_trans_handle *trans, } /* - * remove an extent from the root, returns 0 on success - */ -static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin, - int refs_to_drop) -{ - WARN_ON(num_bytes < root->sectorsize); - - /* - * if metadata always pin - * if data pin when any transaction has committed this - */ - if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID || - ref_generation != trans->transid) - pin = 1; - - if (ref_generation != trans->transid) - pin = 1; - - return __free_extent(trans, root, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, pin, pin == 0, refs_to_drop); -} - -/* * when we free an extent, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it @@ -2479,6 +3359,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (ref->bytenr == bytenr) goto out; + if (head->extent_op) { + if (!head->must_insert_reserved) + goto out; + kfree(head->extent_op); + head->extent_op = NULL; + } + /* * waiting for the lock here would deadlock. If someone else has it * locked they are already in the process of dropping it anyway @@ -2507,7 +3394,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->must_insert_reserved); + &head->node, head->extent_op, + head->must_insert_reserved); BUG_ON(ret); btrfs_put_delayed_ref(&head->node); return 0; @@ -2519,32 +3407,32 @@ out: int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, int pin) + u64 root_objectid, u64 owner, u64 offset) { int ret; /* * tree log blocks never actually go into the extent allocation * tree, just update pinning info and exit early. - * - * data extents referenced by the tree log do need to have - * their reference counts bumped. */ - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && - owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { + WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); ret = 0; - } else { - ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, - root_objectid, ref_generation, - owner_objectid, - BTRFS_DROP_DELAYED_REF, 1); + } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, + parent, root_objectid, (int)owner, + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); ret = check_ref_cleanup(trans, root, bytenr); BUG_ON(ret); + } else { + ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, + parent, root_objectid, owner, + offset, BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); } return ret; } @@ -2969,99 +3857,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, return ret; } -static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins, - int ref_mod) +static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, u64 owner, u64 offset, + struct btrfs_key *ins, int ref_mod) { int ret; - u64 super_used; - u64 root_used; - u64 num_bytes = ins->offset; - u32 sizes[2]; - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_root *extent_root = info->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_extent_item *extent_item; - struct btrfs_extent_ref *ref; + struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; - struct btrfs_key keys[2]; - - if (parent == 0) - parent = ins->objectid; - - /* block accounting for super block */ - spin_lock(&info->delalloc_lock); - super_used = btrfs_super_bytes_used(&info->super_copy); - btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + struct extent_buffer *leaf; + int type; + u32 size; - /* block accounting for root item */ - root_used = btrfs_root_used(&root->root_item); - btrfs_set_root_used(&root->root_item, root_used + num_bytes); - spin_unlock(&info->delalloc_lock); + if (parent > 0) + type = BTRFS_SHARED_DATA_REF_KEY; + else + type = BTRFS_EXTENT_DATA_REF_KEY; - memcpy(&keys[0], ins, sizeof(*ins)); - keys[1].objectid = ins->objectid; - keys[1].type = BTRFS_EXTENT_REF_KEY; - keys[1].offset = parent; - sizes[0] = sizeof(*extent_item); - sizes[1] = sizeof(*ref); + size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); BUG_ON(!path); path->leave_spinning = 1; - ret = btrfs_insert_empty_items(trans, extent_root, path, keys, - sizes, 2); + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); BUG_ON(ret); - extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); - btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod); - ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, - struct btrfs_extent_ref); - - btrfs_set_ref_root(path->nodes[0], ref, root_objectid); - btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); - btrfs_set_ref_objectid(path->nodes[0], ref, owner); - btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod); + btrfs_set_extent_refs(leaf, extent_item, ref_mod); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_DATA); + + iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { + struct btrfs_shared_data_ref *ref; + ref = (struct btrfs_shared_data_ref *)(iref + 1); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + btrfs_set_shared_data_ref_count(leaf, ref, ref_mod); + } else { + struct btrfs_extent_data_ref *ref; + ref = (struct btrfs_extent_data_ref *)(&iref->offset); + btrfs_set_extent_data_ref_root(leaf, ref, root_objectid); + btrfs_set_extent_data_ref_objectid(leaf, ref, owner); + btrfs_set_extent_data_ref_offset(leaf, ref, offset); + btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); + } btrfs_mark_buffer_dirty(path->nodes[0]); - - trans->alloc_exclude_start = 0; - trans->alloc_exclude_nr = 0; btrfs_free_path(path); - if (ret) - goto out; - - ret = update_block_group(trans, root, ins->objectid, - ins->offset, 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, (unsigned long long)ins->offset); BUG(); } -out: return ret; } -int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + u64 flags, struct btrfs_disk_key *key, + int level, struct btrfs_key *ins) { int ret; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_extent_item *extent_item; + struct btrfs_tree_block_info *block_info; + struct btrfs_extent_inline_ref *iref; + struct btrfs_path *path; + struct extent_buffer *leaf; + u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); - if (root_objectid == BTRFS_TREE_LOG_OBJECTID) - return 0; + path = btrfs_alloc_path(); + BUG_ON(!path); - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner, - BTRFS_ADD_DELAYED_EXTENT, 0); + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + ins, size); BUG_ON(ret); + + leaf = path->nodes[0]; + extent_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(leaf, extent_item, 1); + btrfs_set_extent_generation(leaf, extent_item, trans->transid); + btrfs_set_extent_flags(leaf, extent_item, + flags | BTRFS_EXTENT_FLAG_TREE_BLOCK); + block_info = (struct btrfs_tree_block_info *)(extent_item + 1); + + btrfs_set_tree_block_key(leaf, block_info, key); + btrfs_set_tree_block_level(leaf, block_info, level); + + iref = (struct btrfs_extent_inline_ref *)(block_info + 1); + if (parent > 0) { + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_SHARED_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, parent); + } else { + btrfs_set_extent_inline_ref_type(leaf, iref, + BTRFS_TREE_BLOCK_REF_KEY); + btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); + } + + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + ret = update_block_group(trans, root, ins->objectid, ins->offset, + 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } + return ret; +} + +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, + u64 offset, struct btrfs_key *ins) +{ + int ret; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); + + ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset, + 0, root_objectid, owner, offset, + BTRFS_ADD_DELAYED_EXTENT, NULL); return ret; } @@ -3070,10 +4006,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, * an extent has been allocated and makes sure to clear the free * space cache bits as well */ -int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 parent, - u64 root_objectid, u64 ref_generation, - u64 owner, struct btrfs_key *ins) +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins) { int ret; struct btrfs_block_group_cache *block_group; @@ -3087,8 +4023,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ins->offset); BUG_ON(ret); btrfs_put_block_group(block_group); - ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, - ref_generation, owner, ins, 1); + ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + 0, owner, offset, ins, 1); return ret; } @@ -3099,26 +4035,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, * * returns 0 if everything worked, non-zero otherwise. */ -int btrfs_alloc_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 min_alloc_size, - u64 root_objectid, u64 ref_generation, - u64 owner_objectid, u64 empty_size, u64 hint_byte, - u64 search_end, struct btrfs_key *ins, u64 data) +static int alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 empty_size, u64 hint_byte, u64 search_end, + struct btrfs_key *ins) { int ret; - ret = __btrfs_reserve_extent(trans, root, num_bytes, - min_alloc_size, empty_size, hint_byte, - search_end, ins, data); + u64 flags = 0; + + ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, + empty_size, hint_byte, search_end, + ins, 0); BUG_ON(ret); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins->objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + update_reserved_extents(root, ins->objectid, ins->offset, 1); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - ref_generation, owner_objectid, - BTRFS_ADD_DELAYED_EXTENT, 0); + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, + ins->offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); BUG_ON(ret); } - update_reserved_extents(root, ins->objectid, ins->offset, 1); return ret; } @@ -3157,21 +4115,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u32 blocksize, u64 parent, - u64 root_objectid, - u64 ref_generation, - int level, - u64 hint, - u64 empty_size) + struct btrfs_root *root, u32 blocksize, + u64 parent, u64 root_objectid, + struct btrfs_disk_key *key, int level, + u64 hint, u64 empty_size) { struct btrfs_key ins; int ret; struct extent_buffer *buf; - ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, - root_objectid, ref_generation, level, - empty_size, hint, (u64)-1, &ins, 0); + ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, + key, level, empty_size, hint, (u64)-1, &ins); if (ret) { BUG_ON(ret > 0); return ERR_PTR(ret); @@ -3185,32 +4139,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { - u64 leaf_owner; - u64 leaf_generation; - struct refsort *sorted; + u64 disk_bytenr; + u64 num_bytes; struct btrfs_key key; struct btrfs_file_extent_item *fi; + u32 nritems; int i; - int nritems; int ret; - int refi = 0; - int slot; BUG_ON(!btrfs_is_leaf(leaf)); nritems = btrfs_header_nritems(leaf); - leaf_owner = btrfs_header_owner(leaf); - leaf_generation = btrfs_header_generation(leaf); - sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS); - /* we do this loop twice. The first time we build a list - * of the extents we have a reference on, then we sort the list - * by bytenr. The second time around we actually do the - * extent freeing. - */ for (i = 0; i < nritems; i++) { - u64 disk_bytenr; cond_resched(); - btrfs_item_key_to_cpu(leaf, &key, i); /* only extents have references, skip everything else */ @@ -3230,45 +4171,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, if (disk_bytenr == 0) continue; - sorted[refi].bytenr = disk_bytenr; - sorted[refi].slot = i; - refi++; - } - - if (refi == 0) - goto out; - - sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL); - - for (i = 0; i < refi; i++) { - u64 disk_bytenr; - - disk_bytenr = sorted[i].bytenr; - slot = sorted[i].slot; - - cond_resched(); - - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - - ret = btrfs_free_extent(trans, root, disk_bytenr, - btrfs_file_extent_disk_num_bytes(leaf, fi), - leaf->start, leaf_owner, leaf_generation, - key.objectid, 0); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes, + leaf->start, 0, key.objectid, 0); BUG_ON(ret); - - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); - cond_resched(); } -out: - kfree(sorted); return 0; } +#if 0 + static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_leaf_ref *ref) @@ -3311,13 +4223,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } + static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 start, u64 len, u32 *refs) { int ret; - ret = btrfs_lookup_extent_ref(trans, root, start, len, refs); + ret = btrfs_lookup_extent_refs(trans, root, start, len, refs); BUG_ON(ret); #if 0 /* some debugging code in case we see problems here */ @@ -3352,6 +4265,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans, return ret; } + /* * this is used while deleting old snapshots, and it drops the refs * on a whole subtree starting from a level 1 node. @@ -3645,32 +4559,36 @@ out: cond_resched(); return 0; } +#endif /* * helper function for drop_subtree, this function is similar to * walk_down_tree. The main difference is that it checks reference * counts while tree blocks are locked. */ -static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, int *level) +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) { struct extent_buffer *next; struct extent_buffer *cur; struct extent_buffer *parent; u64 bytenr; u64 ptr_gen; + u64 refs; + u64 flags; u32 blocksize; - u32 refs; int ret; cur = path->nodes[*level]; - ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, - &refs); + ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, + &refs, &flags); BUG_ON(ret); if (refs > 1) goto out; + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + while (*level >= 0) { cur = path->nodes[*level]; if (*level == 0) { @@ -3692,16 +4610,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, - &refs); + ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, + &refs, &flags); BUG_ON(ret); if (refs > 1) { parent = path->nodes[*level]; ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - btrfs_header_generation(parent), - *level - 1, 1); + blocksize, parent->start, + btrfs_header_owner(parent), + *level - 1, 0); BUG_ON(ret); path->slots[*level]++; btrfs_tree_unlock(next); @@ -3709,6 +4626,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, continue; } + BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + *level = btrfs_header_level(next); path->nodes[*level] = next; path->slots[*level] = 0; @@ -3716,13 +4635,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, cond_resched(); } out: - parent = path->nodes[*level + 1]; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; bytenr = path->nodes[*level]->start; blocksize = path->nodes[*level]->len; - ret = btrfs_free_extent(trans, root, bytenr, blocksize, - parent->start, btrfs_header_owner(parent), - btrfs_header_generation(parent), *level, 1); + ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, + btrfs_header_owner(parent), *level, 0); BUG_ON(ret); if (path->locks[*level]) { @@ -3746,8 +4667,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, int max_level) { - u64 root_owner; - u64 root_gen; struct btrfs_root_item *root_item = &root->root_item; int i; int slot; @@ -3755,24 +4674,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, for (i = *level; i < max_level && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { - struct extent_buffer *node; - struct btrfs_disk_key disk_key; - + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { /* * there is more work to do in this level. * Update the drop_progress marker to reflect * the work we've done so far, and then bump * the slot number */ - node = path->nodes[i]; path->slots[i]++; - *level = i; WARN_ON(*level == 0); - btrfs_node_key(node, &disk_key, path->slots[i]); - memcpy(&root_item->drop_progress, - &disk_key, sizeof(disk_key)); - root_item->drop_level = i; + if (max_level == BTRFS_MAX_LEVEL) { + btrfs_node_key(path->nodes[i], + &root_item->drop_progress, + path->slots[i]); + root_item->drop_level = i; + } + *level = i; return 0; } else { struct extent_buffer *parent; @@ -3786,22 +4703,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, else parent = path->nodes[*level + 1]; - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - clean_tree_block(trans, root, path->nodes[*level]); + clean_tree_block(trans, root, path->nodes[i]); ret = btrfs_free_extent(trans, root, - path->nodes[*level]->start, - path->nodes[*level]->len, - parent->start, root_owner, - root_gen, *level, 1); + path->nodes[i]->start, + path->nodes[i]->len, + parent->start, + btrfs_header_owner(parent), + *level, 0); BUG_ON(ret); if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[*level]); - path->locks[*level] = 0; + btrfs_tree_unlock(path->nodes[i]); + path->locks[i] = 0; } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; *level = i + 1; } } @@ -3820,21 +4735,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root int wret; int level; struct btrfs_path *path; - int i; - int orig_level; int update_count; struct btrfs_root_item *root_item = &root->root_item; - WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); path = btrfs_alloc_path(); BUG_ON(!path); level = btrfs_header_level(root->node); - orig_level = level; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { - path->nodes[level] = root->node; - extent_buffer_get(root->node); + path->nodes[level] = btrfs_lock_root_node(root); + btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; + path->locks[level] = 1; } else { struct btrfs_key key; struct btrfs_disk_key found_key; @@ -3856,12 +4768,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { - if (path->nodes[i] && path->locks[i]) { - path->locks[i] = 0; - btrfs_tree_unlock(path->nodes[i]); - } - } + btrfs_unlock_up_safe(path, 0); } while (1) { unsigned long update; @@ -3882,8 +4789,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root ret = -EAGAIN; break; } - atomic_inc(&root->fs_info->throttle_gen); - wake_up(&root->fs_info->transaction_throttle); for (update_count = 0; update_count < 16; update_count++) { update = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; @@ -3893,12 +4798,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root break; } } - for (i = 0; i <= orig_level; i++) { - if (path->nodes[i]) { - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - } - } out: btrfs_free_path(path); return ret; @@ -3931,7 +4830,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path->slots[level] = 0; while (1) { - wret = walk_down_subtree(trans, root, path, &level); + wret = walk_down_tree(trans, root, path, &level); if (wret < 0) ret = wret; if (wret != 0) @@ -3948,6 +4847,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } +#if 0 static unsigned long calc_ra(unsigned long start, unsigned long last, unsigned long nr) { @@ -5429,6 +6329,7 @@ out: kfree(ref_path); return ret; } +#endif static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) { @@ -5477,7 +6378,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, u64 calc; spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + if (btrfs_block_group_used(&shrink_block_group->item) + + shrink_block_group->reserved > 0) { spin_unlock(&shrink_block_group->lock); trans = btrfs_start_transaction(root, 1); @@ -5502,6 +6404,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root, return 0; } + +int btrfs_prepare_block_group_relocation(struct btrfs_root *root, + struct btrfs_block_group_cache *group) + +{ + __alloc_chunk_for_shrink(root, group, 1); + set_block_group_readonly(group); + return 0; +} + +#if 0 static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 size) @@ -5781,6 +6694,7 @@ out: btrfs_free_path(path); return ret; } +#endif static int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1d51dc3..0726a73 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, { u64 extent_end = 0; u64 search_start = start; - u64 leaf_start; u64 ram_bytes = 0; - u64 orig_parent = 0; u64 disk_bytenr = 0; u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; - u64 root_gen; - u64 root_owner; struct extent_buffer *leaf; struct btrfs_file_extent_item *extent; struct btrfs_path *path; @@ -340,9 +336,6 @@ next_slot: bookend = 0; found_extent = 0; found_inline = 0; - leaf_start = 0; - root_gen = 0; - root_owner = 0; compression = 0; encryption = 0; extent = NULL; @@ -417,9 +410,6 @@ next_slot: if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); - root_gen = btrfs_header_generation(leaf); - root_owner = btrfs_header_owner(leaf); - leaf_start = leaf->start; } if (end < extent_end && end >= key.offset) { @@ -443,14 +433,14 @@ next_slot: } locked_end = extent_end; } - orig_parent = path->nodes[0]->start; disk_bytenr = le64_to_cpu(old.disk_bytenr); if (disk_bytenr != 0) { ret = btrfs_inc_extent_ref(trans, root, disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, root->root_key.objectid, - trans->transid, inode->i_ino); + le64_to_cpu(old.disk_num_bytes), 0, + root->root_key.objectid, + key.objectid, key.offset - + le64_to_cpu(old.offset)); BUG_ON(ret); } } @@ -568,17 +558,6 @@ next_slot: btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_set_lock_blocking(path->nodes[0]); - if (disk_bytenr != 0) { - ret = btrfs_update_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - orig_parent, - leaf->start, - root->root_key.objectid, - trans->transid, ins.objectid); - - BUG_ON(ret); - } path->leave_spinning = 0; btrfs_release_path(root, path); if (disk_bytenr != 0) @@ -594,8 +573,9 @@ next_slot: ret = btrfs_free_extent(trans, root, old_disk_bytenr, le64_to_cpu(old.disk_num_bytes), - leaf_start, root_owner, - root_gen, key.objectid, 0); + 0, root->root_key.objectid, + key.objectid, key.offset - + le64_to_cpu(old.offset)); BUG_ON(ret); *hint_byte = old_disk_bytenr; } @@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, u64 bytenr; u64 num_bytes; u64 extent_end; - u64 extent_offset; + u64 orig_offset; u64 other_start; u64 other_end; u64 split = start; u64 locked_end = end; - u64 orig_parent; int extent_type; int split_end = 1; int ret; @@ -703,7 +682,7 @@ again: bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); + orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); if (key.offset == start) split = end; @@ -711,8 +690,6 @@ again: if (key.offset == start && extent_end == end) { int del_nr = 0; int del_slot = 0; - u64 leaf_owner = btrfs_header_owner(leaf); - u64 leaf_gen = btrfs_header_generation(leaf); other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, @@ -721,8 +698,8 @@ again: del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - leaf->start, leaf_owner, - leaf_gen, inode->i_ino, 0); + 0, root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); } other_start = 0; @@ -733,8 +710,8 @@ again: del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - leaf->start, leaf_owner, - leaf_gen, inode->i_ino, 0); + 0, root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); } split_end = 0; @@ -768,13 +745,12 @@ again: locked_end = extent_end; } btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); - extent_offset += split - key.offset; } else { BUG_ON(key.offset != start); - btrfs_set_file_extent_offset(leaf, fi, extent_offset + - split - key.offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); key.offset = split; + btrfs_set_file_extent_offset(leaf, fi, key.offset - + orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); btrfs_set_item_key_safe(trans, root, path, &key); extent_end = split; } @@ -793,7 +769,8 @@ again: struct btrfs_file_extent_item); key.offset = split; btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_offset(leaf, fi, key.offset - + orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - split); goto done; @@ -815,10 +792,9 @@ again: btrfs_mark_buffer_dirty(leaf); - orig_parent = leaf->start; - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, root->root_key.objectid, - trans->transid, inode->i_ino); + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); BUG_ON(ret); btrfs_release_path(root, path); @@ -833,20 +809,12 @@ again: btrfs_set_file_extent_type(leaf, fi, extent_type); btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_compression(leaf, fi, 0); btrfs_set_file_extent_encryption(leaf, fi, 0); btrfs_set_file_extent_other_encoding(leaf, fi, 0); - - if (orig_parent != leaf->start) { - ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes, - orig_parent, leaf->start, - root->root_key.objectid, - trans->transid, inode->i_ino); - BUG_ON(ret); - } done: btrfs_mark_buffer_dirty(leaf); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1c8b019..917bf10 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -48,7 +48,6 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "ref-cache.h" #include "compression.h" #include "locking.h" @@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, u64 cow_start; u64 cur_offset; u64 extent_end; + u64 extent_offset; u64 disk_bytenr; u64 num_bytes; int extent_type; @@ -1005,6 +1005,7 @@ next_slot: if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = found_key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (extent_end <= start) { @@ -1022,9 +1023,10 @@ next_slot: if (btrfs_extent_readonly(root, disk_bytenr)) goto out_check; if (btrfs_cross_ref_exist(trans, root, inode->i_ino, - disk_bytenr)) + found_key.offset - + extent_offset, disk_bytenr)) goto out_check; - disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += extent_offset; disk_bytenr += cur_offset - found_key.offset; num_bytes = min(end + 1, extent_end) - cur_offset; /* @@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, - root->root_key.objectid, - trans->transid, inode->i_ino, &ins); + ret = btrfs_alloc_reserved_file_extent(trans, root, + root->root_key.objectid, + inode->i_ino, file_pos, &ins); BUG_ON(ret); btrfs_free_path(path); @@ -1956,23 +1958,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * crossing root thing. we store the inode number in the * offset of the orphan item. */ - inode = btrfs_iget_locked(root->fs_info->sb, - found_key.offset, root); - if (!inode) + found_key.objectid = found_key.offset; + found_key.type = BTRFS_INODE_ITEM_KEY; + found_key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &found_key, root); + if (IS_ERR(inode)) break; - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - - /* have to set the location manually */ - BTRFS_I(inode)->location.objectid = inode->i_ino; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it @@ -2069,7 +2061,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -void btrfs_read_locked_inode(struct inode *inode) +static void btrfs_read_locked_inode(struct inode *inode) { struct btrfs_path *path; struct extent_buffer *leaf; @@ -2599,9 +2591,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *fi; u64 extent_start = 0; u64 extent_num_bytes = 0; + u64 extent_offset = 0; u64 item_end = 0; - u64 root_gen = 0; - u64 root_owner = 0; int found_extent; int del_item; int pending_del_nr = 0; @@ -2716,6 +2707,9 @@ search_again: extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = found_key.offset - + btrfs_file_extent_offset(leaf, fi); + /* FIXME blocksize != 4096 */ num_dec = btrfs_file_extent_num_bytes(leaf, fi); if (extent_start != 0) { @@ -2723,8 +2717,6 @@ search_again: if (root->ref_cows) inode_sub_bytes(inode, num_dec); } - root_gen = btrfs_header_generation(leaf); - root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* @@ -2768,12 +2760,12 @@ delete: } else { break; } - if (found_extent) { + if (found_extent && root->ref_cows) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, - extent_num_bytes, - leaf->start, root_owner, - root_gen, inode->i_ino, 0); + extent_num_bytes, 0, + btrfs_header_owner(leaf), + inode->i_ino, extent_offset); BUG_ON(ret); } next: @@ -3105,6 +3097,45 @@ static int fixup_tree_root_location(struct btrfs_root *root, return 0; } +static void inode_tree_add(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *entry; + struct rb_node **p = &root->inode_tree.rb_node; + struct rb_node *parent = NULL; + + spin_lock(&root->inode_lock); + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_inode, rb_node); + + if (inode->i_ino < entry->vfs_inode.i_ino) + p = &(*p)->rb_left; + else if (inode->i_ino > entry->vfs_inode.i_ino) + p = &(*p)->rb_right; + else { + WARN_ON(!(entry->vfs_inode.i_state & + (I_WILL_FREE | I_FREEING | I_CLEAR))); + break; + } + } + rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); + rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); +} + +static void inode_tree_del(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { + spin_lock(&root->inode_lock); + rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); + spin_unlock(&root->inode_lock); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + } +} + static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); @@ -3130,6 +3161,7 @@ static noinline void init_btrfs_i(struct inode *inode) inode->i_mapping, GFP_NOFS); INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); @@ -3152,26 +3184,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, - struct btrfs_root *root, int wait) -{ - struct inode *inode; - struct btrfs_iget_args args; - args.ino = objectid; - args.root = root; - - if (wait) { - inode = ilookup5(s, objectid, btrfs_find_actor, - (void *)&args); - } else { - inode = ilookup5_nowait(s, objectid, btrfs_find_actor, - (void *)&args); - } - return inode; -} - -struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, - struct btrfs_root *root) +static struct inode *btrfs_iget_locked(struct super_block *s, + u64 objectid, + struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -3188,24 +3203,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, * Returns in *is_new if the inode was read from disk */ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *is_new) + struct btrfs_root *root) { struct inode *inode; inode = btrfs_iget_locked(s, location->objectid, root); if (!inode) - return ERR_PTR(-EACCES); + return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); btrfs_read_locked_inode(inode); + + inode_tree_add(inode); unlock_new_inode(inode); - if (is_new) - *is_new = 1; - } else { - if (is_new) - *is_new = 0; } return inode; @@ -3218,7 +3230,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) struct btrfs_root *root = bi->root; struct btrfs_root *sub_root = root; struct btrfs_key location; - int ret, new; + int ret; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -3236,7 +3248,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + inode = btrfs_iget(dir->i_sb, &location, sub_root); if (IS_ERR(inode)) return ERR_CAST(inode); } @@ -3631,6 +3643,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); insert_inode_hash(inode); + inode_tree_add(inode); return inode; fail: if (dir) @@ -4683,6 +4696,7 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } + inode_tree_del(inode); btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2624b53..54dfd45 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -82,22 +82,25 @@ static noinline int create_subvol(struct btrfs_root *root, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - objectid, trans->transid, 0, 0, 0); + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; } - btrfs_set_header_nritems(leaf, 0); - btrfs_set_header_level(leaf, 0); + memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); write_extent_buffer(leaf, root->fs_info->fsid, (unsigned long)btrfs_header_fsid(leaf), BTRFS_FSID_SIZE); + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(leaf), + BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item.inode; @@ -125,7 +128,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_dirid(&root_item, new_dirid); key.objectid = objectid; - key.offset = 1; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); @@ -911,10 +914,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (disko) { inode_add_bytes(inode, datal); ret = btrfs_inc_extent_ref(trans, root, - disko, diskl, leaf->start, - root->root_key.objectid, - trans->transid, - inode->i_ino); + disko, diskl, 0, + root->root_key.objectid, + inode->i_ino, + new_key.offset - datao); BUG_ON(ret); } } else if (type == BTRFS_FILE_EXTENT_INLINE) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 5f8f218..6d6523d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb, (unsigned long long)btrfs_device_total_bytes(eb, dev_item), (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); } +static void print_extent_data_ref(struct extent_buffer *eb, + struct btrfs_extent_data_ref *ref) +{ + printk(KERN_INFO "\t\textent data backref root %llu " + "objectid %llu offset %llu count %u\n", + (unsigned long long)btrfs_extent_data_ref_root(eb, ref), + (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref), + (unsigned long long)btrfs_extent_data_ref_offset(eb, ref), + btrfs_extent_data_ref_count(eb, ref)); +} + +static void print_extent_item(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_disk_key key; + unsigned long end; + unsigned long ptr; + int type; + u32 item_size = btrfs_item_size_nr(eb, slot); + u64 flags; + u64 offset; + + if (item_size < sizeof(*ei)) { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0); + printk(KERN_INFO "\t\textent refs %u\n", + btrfs_extent_refs_v0(eb, ei0)); + return; +#else + BUG(); +#endif + } + + ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); + flags = btrfs_extent_flags(eb, ei); + + printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n", + (unsigned long long)btrfs_extent_refs(eb, ei), + (unsigned long long)btrfs_extent_generation(eb, ei), + (unsigned long long)flags); + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + struct btrfs_tree_block_info *info; + info = (struct btrfs_tree_block_info *)(ei + 1); + btrfs_tree_block_key(eb, info, &key); + printk(KERN_INFO "\t\ttree block key (%llu %x %llu) " + "level %d\n", + (unsigned long long)btrfs_disk_key_objectid(&key), + key.type, + (unsigned long long)btrfs_disk_key_offset(&key), + btrfs_tree_block_level(eb, info)); + iref = (struct btrfs_extent_inline_ref *)(info + 1); + } else { + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + } + + ptr = (unsigned long)iref; + end = (unsigned long)ei + item_size; + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_extent_inline_ref_type(eb, iref); + offset = btrfs_extent_inline_ref_offset(eb, iref); + switch (type) { + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref " + "root %llu\n", (unsigned long long)offset); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref " + "parent %llu\n", (unsigned long long)offset); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + print_extent_data_ref(eb, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = (struct btrfs_shared_data_ref *)(iref + 1); + printk(KERN_INFO "\t\tshared data backref " + "parent %llu count %u\n", + (unsigned long long)offset, + btrfs_shared_data_ref_count(eb, sref)); + break; + default: + BUG(); + } + ptr += btrfs_extent_inline_ref_size(type); + } + WARN_ON(ptr > end); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static void print_extent_ref_v0(struct extent_buffer *eb, int slot) +{ + struct btrfs_extent_ref_v0 *ref0; + + ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0); + printk("\t\textent back ref root %llu gen %llu " + "owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root_v0(eb, ref0), + (unsigned long long)btrfs_ref_generation_v0(eb, ref0), + (unsigned long long)btrfs_ref_objectid_v0(eb, ref0), + (unsigned long)btrfs_ref_count_v0(eb, ref0)); +} +#endif + void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) { int i; + u32 type; u32 nr = btrfs_header_nritems(l); struct btrfs_item *item; - struct btrfs_extent_item *ei; struct btrfs_root_item *ri; struct btrfs_dir_item *di; struct btrfs_inode_item *ii; struct btrfs_block_group_item *bi; struct btrfs_file_extent_item *fi; + struct btrfs_extent_data_ref *dref; + struct btrfs_shared_data_ref *sref; + struct btrfs_dev_extent *dev_extent; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_extent_ref *ref; - struct btrfs_dev_extent *dev_extent; - u32 type; printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", (unsigned long long)btrfs_header_bytenr(l), nr, @@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) btrfs_disk_root_refs(l, ri)); break; case BTRFS_EXTENT_ITEM_KEY: - ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); - printk(KERN_INFO "\t\textent data refs %u\n", - btrfs_extent_refs(l, ei)); - break; - case BTRFS_EXTENT_REF_KEY: - ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); - printk(KERN_INFO "\t\textent back ref root %llu " - "gen %llu owner %llu num_refs %lu\n", - (unsigned long long)btrfs_ref_root(l, ref), - (unsigned long long)btrfs_ref_generation(l, ref), - (unsigned long long)btrfs_ref_objectid(l, ref), - (unsigned long)btrfs_ref_num_refs(l, ref)); + print_extent_item(l, i); + break; + case BTRFS_TREE_BLOCK_REF_KEY: + printk(KERN_INFO "\t\ttree block backref\n"); + break; + case BTRFS_SHARED_BLOCK_REF_KEY: + printk(KERN_INFO "\t\tshared block backref\n"); + break; + case BTRFS_EXTENT_DATA_REF_KEY: + dref = btrfs_item_ptr(l, i, + struct btrfs_extent_data_ref); + print_extent_data_ref(l, dref); + break; + case BTRFS_SHARED_DATA_REF_KEY: + sref = btrfs_item_ptr(l, i, + struct btrfs_shared_data_ref); + printk(KERN_INFO "\t\tshared data backref count %u\n", + btrfs_shared_data_ref_count(l, sref)); break; - case BTRFS_EXTENT_DATA_KEY: fi = btrfs_item_ptr(l, i, struct btrfs_file_extent_item); @@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) (unsigned long long) btrfs_file_extent_ram_bytes(l, fi)); break; + case BTRFS_EXTENT_REF_V0_KEY: +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + print_extent_ref_v0(l, i); +#else + BUG(); +#endif case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c new file mode 100644 index 0000000..b23dc20 --- /dev/null +++ b/fs/btrfs/relocation.c @@ -0,0 +1,3711 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "btrfs_inode.h" +#include "async-thread.h" + +/* + * backref_node, mapping_node and tree_block start with this + */ +struct tree_entry { + struct rb_node rb_node; + u64 bytenr; +}; + +/* + * present a tree block in the backref cache + */ +struct backref_node { + struct rb_node rb_node; + u64 bytenr; + /* objectid tree block owner */ + u64 owner; + /* list of upper level blocks reference this block */ + struct list_head upper; + /* list of child blocks in the cache */ + struct list_head lower; + /* NULL if this node is not tree root */ + struct btrfs_root *root; + /* extent buffer got by COW the block */ + struct extent_buffer *eb; + /* level of tree block */ + unsigned int level:8; + /* 1 if the block is root of old snapshot */ + unsigned int old_root:1; + /* 1 if no child blocks in the cache */ + unsigned int lowest:1; + /* is the extent buffer locked */ + unsigned int locked:1; + /* has the block been processed */ + unsigned int processed:1; + /* have backrefs of this block been checked */ + unsigned int checked:1; +}; + +/* + * present a block pointer in the backref cache + */ +struct backref_edge { + struct list_head list[2]; + struct backref_node *node[2]; + u64 blockptr; +}; + +#define LOWER 0 +#define UPPER 1 + +struct backref_cache { + /* red black tree of all backref nodes in the cache */ + struct rb_root rb_root; + /* list of backref nodes with no child block in the cache */ + struct list_head pending[BTRFS_MAX_LEVEL]; + spinlock_t lock; +}; + +/* + * map address of tree root to tree + */ +struct mapping_node { + struct rb_node rb_node; + u64 bytenr; + void *data; +}; + +struct mapping_tree { + struct rb_root rb_root; + spinlock_t lock; +}; + +/* + * present a tree block to process + */ +struct tree_block { + struct rb_node rb_node; + u64 bytenr; + struct btrfs_key key; + unsigned int level:8; + unsigned int key_ready:1; +}; + +/* inode vector */ +#define INODEVEC_SIZE 16 + +struct inodevec { + struct list_head list; + struct inode *inode[INODEVEC_SIZE]; + int nr; +}; + +struct reloc_control { + /* block group to relocate */ + struct btrfs_block_group_cache *block_group; + /* extent tree */ + struct btrfs_root *extent_root; + /* inode for moving data */ + struct inode *data_inode; + struct btrfs_workers workers; + /* tree blocks have been processed */ + struct extent_io_tree processed_blocks; + /* map start of tree root to corresponding reloc tree */ + struct mapping_tree reloc_root_tree; + /* list of reloc trees */ + struct list_head reloc_roots; + u64 search_start; + u64 extents_found; + u64 extents_skipped; + int stage; + int create_reloc_root; + unsigned int found_file_extent:1; + unsigned int found_old_snapshot:1; +}; + +/* stages of data relocation */ +#define MOVE_DATA_EXTENTS 0 +#define UPDATE_DATA_PTRS 1 + +/* + * merge reloc tree to corresponding fs tree in worker threads + */ +struct async_merge { + struct btrfs_work work; + struct reloc_control *rc; + struct btrfs_root *root; + struct completion *done; + atomic_t *num_pending; +}; + +static void mapping_tree_init(struct mapping_tree *tree) +{ + tree->rb_root.rb_node = NULL; + spin_lock_init(&tree->lock); +} + +static void backref_cache_init(struct backref_cache *cache) +{ + int i; + cache->rb_root.rb_node = NULL; + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + INIT_LIST_HEAD(&cache->pending[i]); + spin_lock_init(&cache->lock); +} + +static void backref_node_init(struct backref_node *node) +{ + memset(node, 0, sizeof(*node)); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct tree_entry *entry; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +/* + * walk up backref nodes until reach node presents tree root + */ +static struct backref_node *walk_up_backref(struct backref_node *node, + struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + int idx = *index; + + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, + struct backref_edge, list[LOWER]); + edges[idx++] = edge; + node = edge->node[UPPER]; + } + *index = idx; + return node; +} + +/* + * walk down backref nodes to find start of next reference path + */ +static struct backref_node *walk_down_backref(struct backref_edge *edges[], + int *index) +{ + struct backref_edge *edge; + struct backref_node *lower; + int idx = *index; + + while (idx > 0) { + edge = edges[idx - 1]; + lower = edge->node[LOWER]; + if (list_is_last(&edge->list[LOWER], &lower->upper)) { + idx--; + continue; + } + edge = list_entry(edge->list[LOWER].next, + struct backref_edge, list[LOWER]); + edges[idx - 1] = edge; + *index = idx; + return edge->node[UPPER]; + } + *index = 0; + return NULL; +} + +static void drop_node_buffer(struct backref_node *node) +{ + if (node->eb) { + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } + free_extent_buffer(node->eb); + node->eb = NULL; + } +} + +static void drop_backref_node(struct backref_cache *tree, + struct backref_node *node) +{ + BUG_ON(!node->lowest); + BUG_ON(!list_empty(&node->upper)); + + drop_node_buffer(node); + list_del(&node->lower); + + rb_erase(&node->rb_node, &tree->rb_root); + kfree(node); +} + +/* + * remove a backref node from the backref cache + */ +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + struct backref_node *upper; + struct backref_edge *edge; + + if (!node) + return; + + BUG_ON(!node->lowest); + while (!list_empty(&node->upper)) { + edge = list_entry(node->upper.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + list_del(&edge->list[LOWER]); + list_del(&edge->list[UPPER]); + kfree(edge); + /* + * add the node to pending list if no other + * child block cached. + */ + if (list_empty(&upper->lower)) { + list_add_tail(&upper->lower, + &cache->pending[upper->level]); + upper->lowest = 1; + } + } + drop_backref_node(cache, node); +} + +/* + * find reloc tree by address of tree root + */ +static struct btrfs_root *find_reloc_root(struct reloc_control *rc, + u64 bytenr) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct btrfs_root *root = NULL; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + root = (struct btrfs_root *)node->data; + } + spin_unlock(&rc->reloc_root_tree.lock); + return root; +} + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_key key; + + key.objectid = root_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(root_objectid)) + key.offset = 0; + else + key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &key); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static noinline_for_stack +struct btrfs_root *find_tree_root(struct reloc_control *rc, + struct extent_buffer *leaf, + struct btrfs_extent_ref_v0 *ref0) +{ + struct btrfs_root *root; + u64 root_objectid = btrfs_ref_root_v0(leaf, ref0); + u64 generation = btrfs_ref_generation_v0(leaf, ref0); + + BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID); + + root = read_fs_root(rc->extent_root->fs_info, root_objectid); + BUG_ON(IS_ERR(root)); + + if (root->ref_cows && + generation != btrfs_root_generation(&root->root_item)) + return NULL; + + return root; +} +#endif + +static noinline_for_stack +int find_inline_backref(struct extent_buffer *leaf, int slot, + unsigned long *ptr, unsigned long *end) +{ + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + u32 item_size; + + item_size = btrfs_item_size_nr(leaf, slot); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (item_size < sizeof(*ei)) { + WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + return 1; + } +#endif + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + WARN_ON(!(btrfs_extent_flags(leaf, ei) & + BTRFS_EXTENT_FLAG_TREE_BLOCK)); + + if (item_size <= sizeof(*ei) + sizeof(*bi)) { + WARN_ON(item_size < sizeof(*ei) + sizeof(*bi)); + return 1; + } + + bi = (struct btrfs_tree_block_info *)(ei + 1); + *ptr = (unsigned long)(bi + 1); + *end = (unsigned long)ei + item_size; + return 0; +} + +/* + * build backref tree for a given tree block. root of the backref tree + * corresponds the tree block, leaves of the backref tree correspond + * roots of b-trees that reference the tree block. + * + * the basic idea of this function is check backrefs of a given block + * to find upper level blocks that refernece the block, and then check + * bakcrefs of these upper level blocks recursively. the recursion stop + * when tree root is reached or backrefs for the block is cached. + * + * NOTE: if we find backrefs for a block are cached, we know backrefs + * for all upper level blocks that directly/indirectly reference the + * block are also cached. + */ +static struct backref_node *build_backref_tree(struct reloc_control *rc, + struct backref_cache *cache, + struct btrfs_key *node_key, + int level, u64 bytenr) +{ + struct btrfs_path *path1; + struct btrfs_path *path2; + struct extent_buffer *eb; + struct btrfs_root *root; + struct backref_node *cur; + struct backref_node *upper; + struct backref_node *lower; + struct backref_node *node = NULL; + struct backref_node *exist = NULL; + struct backref_edge *edge; + struct rb_node *rb_node; + struct btrfs_key key; + unsigned long end; + unsigned long ptr; + LIST_HEAD(list); + int ret; + int err = 0; + + path1 = btrfs_alloc_path(); + path2 = btrfs_alloc_path(); + if (!path1 || !path2) { + err = -ENOMEM; + goto out; + } + + node = kmalloc(sizeof(*node), GFP_NOFS); + if (!node) { + err = -ENOMEM; + goto out; + } + + backref_node_init(node); + node->bytenr = bytenr; + node->owner = 0; + node->level = level; + node->lowest = 1; + cur = node; +again: + end = 0; + ptr = 0; + key.objectid = cur->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + path1->search_commit_root = 1; + path1->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1, + 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(!ret || !path1->slots[0]); + + path1->slots[0]--; + + WARN_ON(cur->checked); + if (!list_empty(&cur->upper)) { + /* + * the backref was added previously when processsing + * backref of type BTRFS_TREE_BLOCK_REF_KEY + */ + BUG_ON(!list_is_singular(&cur->upper)); + edge = list_entry(cur->upper.next, struct backref_edge, + list[LOWER]); + BUG_ON(!list_empty(&edge->list[UPPER])); + exist = edge->node[UPPER]; + /* + * add the upper level block to pending list if we need + * check its backrefs + */ + if (!exist->checked) + list_add_tail(&edge->list[UPPER], &list); + } else { + exist = NULL; + } + + while (1) { + cond_resched(); + eb = path1->nodes[0]; + + if (ptr >= end) { + if (path1->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path1); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + eb = path1->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path1->slots[0]); + if (key.objectid != cur->bytenr) { + WARN_ON(exist); + break; + } + + if (key.type == BTRFS_EXTENT_ITEM_KEY) { + ret = find_inline_backref(eb, path1->slots[0], + &ptr, &end); + if (ret) + goto next; + } + } + + if (ptr < end) { + /* update key for inline back ref */ + struct btrfs_extent_inline_ref *iref; + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && + key.type != BTRFS_SHARED_BLOCK_REF_KEY); + } + + if (exist && + ((key.type == BTRFS_TREE_BLOCK_REF_KEY && + exist->owner == key.offset) || + (key.type == BTRFS_SHARED_BLOCK_REF_KEY && + exist->bytenr == key.offset))) { + exist = NULL; + goto next; + } + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.objectid == key.offset && + key.type == BTRFS_EXTENT_REF_V0_KEY) { + struct btrfs_extent_ref_v0 *ref0; + ref0 = btrfs_item_ptr(eb, path1->slots[0], + struct btrfs_extent_ref_v0); + root = find_tree_root(rc, eb, ref0); + if (root) + cur->root = root; + else + cur->old_root = 1; + break; + } +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { +#endif + if (key.objectid == key.offset) { + /* + * only root blocks of reloc trees use + * backref of this type. + */ + root = find_reloc_root(rc, cur->bytenr); + BUG_ON(!root); + cur->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + rb_node = tree_search(&cache->rb_root, key.offset); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = key.offset; + upper->owner = 0; + upper->level = cur->level + 1; + /* + * backrefs for the upper level block isn't + * cached, add the block to pending list + */ + list_add_tail(&edge->list[UPPER], &list); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add(&edge->list[LOWER], &cur->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = cur; + + goto next; + } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { + goto next; + } + + /* key.type == BTRFS_TREE_BLOCK_REF_KEY */ + root = read_fs_root(rc->extent_root->fs_info, key.offset); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + if (btrfs_root_level(&root->root_item) == cur->level) { + /* tree root */ + BUG_ON(btrfs_root_bytenr(&root->root_item) != + cur->bytenr); + cur->root = root; + break; + } + + level = cur->level + 1; + + /* + * searching the tree to find upper level blocks + * reference the block. + */ + path2->search_commit_root = 1; + path2->skip_locking = 1; + path2->lowest_level = level; + ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0); + path2->lowest_level = 0; + if (ret < 0) { + err = ret; + goto out; + } + + eb = path2->nodes[level]; + WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != + cur->bytenr); + + lower = cur; + for (; level < BTRFS_MAX_LEVEL; level++) { + if (!path2->nodes[level]) { + BUG_ON(btrfs_root_bytenr(&root->root_item) != + lower->bytenr); + lower->root = root; + break; + } + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (!edge) { + err = -ENOMEM; + goto out; + } + + eb = path2->nodes[level]; + rb_node = tree_search(&cache->rb_root, eb->start); + if (!rb_node) { + upper = kmalloc(sizeof(*upper), GFP_NOFS); + if (!upper) { + kfree(edge); + err = -ENOMEM; + goto out; + } + backref_node_init(upper); + upper->bytenr = eb->start; + upper->owner = btrfs_header_owner(eb); + upper->level = lower->level + 1; + + /* + * if we know the block isn't shared + * we can void checking its backrefs. + */ + if (btrfs_block_can_be_shared(root, eb)) + upper->checked = 0; + else + upper->checked = 1; + + /* + * add the block to pending list if we + * need check its backrefs. only block + * at 'cur->level + 1' is added to the + * tail of pending list. this guarantees + * we check backrefs from lower level + * blocks to upper level blocks. + */ + if (!upper->checked && + level == cur->level + 1) { + list_add_tail(&edge->list[UPPER], + &list); + } else + INIT_LIST_HEAD(&edge->list[UPPER]); + } else { + upper = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(!upper->checked); + INIT_LIST_HEAD(&edge->list[UPPER]); + } + list_add_tail(&edge->list[LOWER], &lower->upper); + edge->node[UPPER] = upper; + edge->node[LOWER] = lower; + + if (rb_node) + break; + lower = upper; + upper = NULL; + } + btrfs_release_path(root, path2); +next: + if (ptr < end) { + ptr += btrfs_extent_inline_ref_size(key.type); + if (ptr >= end) { + WARN_ON(ptr > end); + ptr = 0; + end = 0; + } + } + if (ptr >= end) + path1->slots[0]++; + } + btrfs_release_path(rc->extent_root, path1); + + cur->checked = 1; + WARN_ON(exist); + + /* the pending list isn't empty, take the first block to process */ + if (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + cur = edge->node[UPPER]; + goto again; + } + + /* + * everything goes well, connect backref nodes and insert backref nodes + * into the cache. + */ + BUG_ON(!node->checked); + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); + + list_for_each_entry(edge, &node->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + + while (!list_empty(&list)) { + edge = list_entry(list.next, struct backref_edge, list[UPPER]); + list_del_init(&edge->list[UPPER]); + upper = edge->node[UPPER]; + + if (!RB_EMPTY_NODE(&upper->rb_node)) { + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + + list_add_tail(&edge->list[UPPER], &upper->lower); + continue; + } + + BUG_ON(!upper->checked); + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + + list_add_tail(&edge->list[UPPER], &upper->lower); + + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + } +out: + btrfs_free_path(path1); + btrfs_free_path(path2); + if (err) { + INIT_LIST_HEAD(&list); + upper = node; + while (upper) { + if (RB_EMPTY_NODE(&upper->rb_node)) { + list_splice_tail(&upper->upper, &list); + kfree(upper); + } + + if (list_empty(&list)) + break; + + edge = list_entry(list.next, struct backref_edge, + list[LOWER]); + upper = edge->node[UPPER]; + kfree(edge); + } + return ERR_PTR(err); + } + return node; +} + +/* + * helper to add 'address of tree root -> reloc tree' mapping + */ +static int __add_reloc_root(struct btrfs_root *root) +{ + struct rb_node *rb_node; + struct mapping_node *node; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + node = kmalloc(sizeof(*node), GFP_NOFS); + BUG_ON(!node); + + node->bytenr = root->node->start; + node->data = root; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + + list_add_tail(&root->root_list, &rc->reloc_roots); + return 0; +} + +/* + * helper to update/delete the 'address of tree root -> reloc tree' + * mapping + */ +static int __update_reloc_root(struct btrfs_root *root, int del) +{ + struct rb_node *rb_node; + struct mapping_node *node = NULL; + struct reloc_control *rc = root->fs_info->reloc_ctl; + + spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct mapping_node, rb_node); + rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); + } + spin_unlock(&rc->reloc_root_tree.lock); + + BUG_ON((struct btrfs_root *)node->data != root); + + if (!del) { + spin_lock(&rc->reloc_root_tree.lock); + node->bytenr = root->node->start; + rb_node = tree_insert(&rc->reloc_root_tree.rb_root, + node->bytenr, &node->rb_node); + spin_unlock(&rc->reloc_root_tree.lock); + BUG_ON(rb_node); + } else { + list_del_init(&root->root_list); + kfree(node); + } + return 0; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!root->fs_info->reloc_ctl || + !root->fs_info->reloc_ctl->create_reloc_root || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = root->root_key.objectid; + + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); + memcpy(root_item, &root->root_item, sizeof(*root_item)); + btrfs_set_root_refs(root_item, 1); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(IS_ERR(reloc_root)); + reloc_root->last_trans = trans->transid; + + __add_reloc_root(reloc_root); + root->reloc_root = reloc_root; + return 0; +} + +/* + * update root item of reloc tree + */ +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + int del = 0; + int ret; + + if (!root->reloc_root) + return 0; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_root_refs(root_item) == 0) { + root->reloc_root = NULL; + del = 1; + } + + __update_reloc_root(reloc_root, del); + + if (reloc_root->commit_root != reloc_root->node) { + btrfs_set_root_node(root_item, reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + reloc_root->commit_root = btrfs_root_node(reloc_root); + } + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, root_item); + BUG_ON(ret); + return 0; +} + +/* + * helper to find first cached inode with inode number >= objectid + * in a subvolume + */ +static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid) +{ + struct rb_node *node; + struct rb_node *prev; + struct btrfs_inode *entry; + struct inode *inode; + + spin_lock(&root->inode_lock); +again: + node = root->inode_tree.rb_node; + prev = NULL; + while (node) { + prev = node; + entry = rb_entry(node, struct btrfs_inode, rb_node); + + if (objectid < entry->vfs_inode.i_ino) + node = node->rb_left; + else if (objectid > entry->vfs_inode.i_ino) + node = node->rb_right; + else + break; + } + if (!node) { + while (prev) { + entry = rb_entry(prev, struct btrfs_inode, rb_node); + if (objectid <= entry->vfs_inode.i_ino) { + node = prev; + break; + } + prev = rb_next(prev); + } + } + while (node) { + entry = rb_entry(node, struct btrfs_inode, rb_node); + inode = igrab(&entry->vfs_inode); + if (inode) { + spin_unlock(&root->inode_lock); + return inode; + } + + objectid = entry->vfs_inode.i_ino + 1; + if (cond_resched_lock(&root->inode_lock)) + goto again; + + node = rb_next(node); + } + spin_unlock(&root->inode_lock); + return NULL; +} + +static int in_block_group(u64 bytenr, + struct btrfs_block_group_cache *block_group) +{ + if (bytenr >= block_group->key.objectid && + bytenr < block_group->key.objectid + block_group->key.offset) + return 1; + return 0; +} + +/* + * get new location of data + */ +static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, + u64 bytenr, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + bytenr -= BTRFS_I(reloc_inode)->index_cnt; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + bytenr, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + BUG_ON(btrfs_file_extent_offset(leaf, fi) || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)); + + if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) { + ret = 1; + goto out; + } + + if (new_bytenr) + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * update file extent items in the tree leaf to point to + * the new locations. + */ +static int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct list_head *inode_list) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct inodevec *ivec = NULL; + u64 parent; + u64 bytenr; + u64 new_bytenr; + u64 num_bytes; + u64 end; + u32 nritems; + u32 i; + int ret; + int first = 1; + int dirty = 0; + + if (rc->stage != UPDATE_DATA_PTRS) + return 0; + + /* reloc trees always use full backref */ + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + parent = leaf->start; + else + parent = 0; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + if (!in_block_group(bytenr, rc->block_group)) + continue; + + /* + * if we are modifying block in fs tree, wait for readpage + * to complete and drop the extent cache + */ + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + if (!ivec || ivec->nr == INODEVEC_SIZE) { + ivec = kmalloc(sizeof(*ivec), GFP_NOFS); + BUG_ON(!ivec); + ivec->nr = 0; + list_add_tail(&ivec->list, inode_list); + } + if (first) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + first = 0; + } else if (inode && inode->i_ino < key.objectid) { + inode = find_next_inode(root, key.objectid); + if (inode) + ivec->inode[ivec->nr++] = inode; + } + if (inode && inode->i_ino == key.objectid) { + end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + WARN_ON(!IS_ALIGNED(key.offset, + root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, + GFP_NOFS); + if (!ret) + continue; + + btrfs_drop_extent_cache(inode, key.offset, end, + 1); + unlock_extent(&BTRFS_I(inode)->io_tree, + key.offset, end, GFP_NOFS); + } + } + + ret = get_new_location(rc->data_inode, &new_bytenr, + bytenr, num_bytes); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); + dirty = 1; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + num_bytes, parent, + btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + parent, btrfs_header_owner(leaf), + key.objectid, key.offset); + BUG_ON(ret); + } + if (dirty) + btrfs_mark_buffer_dirty(leaf); + return 0; +} + +static noinline_for_stack +int memcmp_node_keys(struct extent_buffer *eb, int slot, + struct btrfs_path *path, int level) +{ + struct btrfs_disk_key key1; + struct btrfs_disk_key key2; + btrfs_node_key(eb, &key1, slot); + btrfs_node_key(path->nodes[level], &key2, path->slots[level]); + return memcmp(&key1, &key2, sizeof(key1)); +} + +/* + * try to replace tree blocks in fs tree with the new blocks + * in reloc tree. tree blocks haven't been modified since the + * reloc tree was create can be replaced. + * + * if a block was replaced, level of the block + 1 is returned. + * if no block got replaced, 0 is returned. if there are other + * errors, a negative error number is returned. + */ +static int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + struct extent_buffer **leaf, + int lowest_level, int max_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 old_bytenr; + u64 new_bytenr; + u64 old_ptr_gen; + u64 new_ptr_gen; + u64 last_snapshot; + u32 blocksize; + int level; + int ret; + int slot; + + BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(lowest_level > 1 && leaf); + + last_snapshot = btrfs_root_last_snapshot(&src->root_item); + + slot = path->slots[lowest_level]; + btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); + + eb = btrfs_lock_root_node(dest); + btrfs_set_lock_blocking(eb); + level = btrfs_header_level(eb); + + if (level < lowest_level) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return 0; + } + + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (next_key) { + next_key->objectid = (u64)-1; + next_key->type = (u8)-1; + next_key->offset = (u64)-1; + } + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + BUG_ON(level < lowest_level); + + ret = btrfs_bin_search(parent, &key, level, &slot); + if (ret && slot > 0) + slot--; + + if (next_key && slot + 1 < btrfs_header_nritems(parent)) + btrfs_node_key_to_cpu(parent, next_key, slot + 1); + + old_bytenr = btrfs_node_blockptr(parent, slot); + blocksize = btrfs_level_size(dest, level - 1); + old_ptr_gen = btrfs_node_ptr_generation(parent, slot); + + if (level <= max_level) { + eb = path->nodes[level]; + new_bytenr = btrfs_node_blockptr(eb, + path->slots[level]); + new_ptr_gen = btrfs_node_ptr_generation(eb, + path->slots[level]); + } else { + new_bytenr = 0; + new_ptr_gen = 0; + } + + if (new_bytenr > 0 && new_bytenr == old_bytenr) { + WARN_ON(1); + ret = level; + break; + } + + if (new_bytenr == 0 || old_ptr_gen > last_snapshot || + memcmp_node_keys(parent, slot, path, level)) { + if (level <= lowest_level && !leaf) { + ret = 0; + break; + } + + eb = read_tree_block(dest, old_bytenr, blocksize, + old_ptr_gen); + btrfs_tree_lock(eb); + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); + btrfs_set_lock_blocking(eb); + + if (level <= lowest_level) { + *leaf = eb; + ret = 0; + break; + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + + parent = eb; + continue; + } + + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + btrfs_release_path(src, path); + + path->lowest_level = level; + ret = btrfs_search_slot(trans, src, &key, path, 0, 1); + path->lowest_level = 0; + BUG_ON(ret); + + /* + * swap blocks in fs tree and reloc tree. + */ + btrfs_set_node_blockptr(parent, slot, new_bytenr); + btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); + btrfs_mark_buffer_dirty(parent); + + btrfs_set_node_blockptr(path->nodes[level], + path->slots[level], old_bytenr); + btrfs_set_node_ptr_generation(path->nodes[level], + path->slots[level], old_ptr_gen); + btrfs_mark_buffer_dirty(path->nodes[level]); + + ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + path->nodes[level]->start, + src->root_key.objectid, level - 1, 0); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + 0, dest->root_key.objectid, level - 1, + 0); + BUG_ON(ret); + + btrfs_unlock_up_safe(path, 0); + + ret = level; + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return ret; +} + +/* + * helper to find next relocated block in reloc tree + */ +static noinline_for_stack +int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb; + int i; + u64 last_snapshot; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = 0; i < *level; i++) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + + for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] + 1 < nritems) { + path->slots[i]++; + if (btrfs_node_ptr_generation(eb, path->slots[i]) <= + last_snapshot) + continue; + + *level = i; + return 0; + } + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + return 1; +} + +/* + * walk down reloc tree to find relocated block of lowest level + */ +static noinline_for_stack +int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, + int *level) +{ + struct extent_buffer *eb = NULL; + int i; + u64 bytenr; + u64 ptr_gen = 0; + u64 last_snapshot; + u32 blocksize; + u32 nritems; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + + for (i = *level; i > 0; i--) { + eb = path->nodes[i]; + nritems = btrfs_header_nritems(eb); + while (path->slots[i] < nritems) { + ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]); + if (ptr_gen > last_snapshot) + break; + path->slots[i]++; + } + if (path->slots[i] >= nritems) { + if (i == *level) + break; + *level = i + 1; + return 0; + } + if (i == 1) { + *level = i; + return 0; + } + + bytenr = btrfs_node_blockptr(eb, path->slots[i]); + blocksize = btrfs_level_size(root, i - 1); + eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + BUG_ON(btrfs_header_level(eb) != i - 1); + path->nodes[i - 1] = eb; + path->slots[i - 1] = 0; + } + return 1; +} + +/* + * invalidate extent cache for file extents whose key in range of + * [min_key, max_key) + */ +static int invalidate_extent_cache(struct btrfs_root *root, + struct btrfs_key *min_key, + struct btrfs_key *max_key) +{ + struct inode *inode = NULL; + u64 objectid; + u64 start, end; + + objectid = min_key->objectid; + while (1) { + cond_resched(); + iput(inode); + + if (objectid > max_key->objectid) + break; + + inode = find_next_inode(root, objectid); + if (!inode) + break; + + if (inode->i_ino > max_key->objectid) { + iput(inode); + break; + } + + objectid = inode->i_ino + 1; + if (!S_ISREG(inode->i_mode)) + continue; + + if (unlikely(min_key->objectid == inode->i_ino)) { + if (min_key->type > BTRFS_EXTENT_DATA_KEY) + continue; + if (min_key->type < BTRFS_EXTENT_DATA_KEY) + start = 0; + else { + start = min_key->offset; + WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + } + } else { + start = 0; + } + + if (unlikely(max_key->objectid == inode->i_ino)) { + if (max_key->type < BTRFS_EXTENT_DATA_KEY) + continue; + if (max_key->type > BTRFS_EXTENT_DATA_KEY) { + end = (u64)-1; + } else { + if (max_key->offset == 0) + continue; + end = max_key->offset; + WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + end--; + } + } else { + end = (u64)-1; + } + + /* the lock_extent waits for readpage to complete */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + btrfs_drop_extent_cache(inode, start, end, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + } + return 0; +} + +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) + +{ + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + break; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + btrfs_node_key_to_cpu(path->nodes[level], key, + path->slots[level] + 1); + return 0; + } + level++; + } + return 1; +} + +/* + * merge the relocated tree blocks in reloc tree with corresponding + * fs tree. + */ +static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, + struct btrfs_root *root) +{ + LIST_HEAD(inode_list); + struct btrfs_key key; + struct btrfs_key next_key; + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root_item *root_item; + struct btrfs_path *path; + struct extent_buffer *leaf = NULL; + unsigned long nr; + int level; + int max_level; + int replaced = 0; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + reloc_root = root->reloc_root; + root_item = &reloc_root->root_item; + + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_root_level(root_item); + extent_buffer_get(reloc_root->node); + path->nodes[level] = reloc_root->node; + path->slots[level] = 0; + } else { + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + + level = root_item->drop_level; + BUG_ON(level == 0); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + + btrfs_node_key_to_cpu(path->nodes[level], &next_key, + path->slots[level]); + WARN_ON(memcmp(&key, &next_key, sizeof(key))); + + btrfs_unlock_up_safe(path, 0); + } + + if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { + trans = btrfs_start_transaction(root, 1); + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(reloc_root, path); + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + btrfs_unlock_up_safe(path, 1); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + if (ret < 0) + err = ret; + goto out; + } + + memset(&next_key, 0, sizeof(next_key)); + + while (1) { + leaf = NULL; + replaced = 0; + trans = btrfs_start_transaction(root, 1); + max_level = level; + + ret = walk_down_reloc_tree(reloc_root, path, &level); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + break; + + if (!find_next_key(path, level, &key) && + btrfs_comp_cpu_keys(&next_key, &key) >= 0) { + ret = 0; + } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_path(trans, root, reloc_root, + path, &next_key, &leaf, + level, max_level); + } else { + ret = replace_path(trans, root, reloc_root, + path, &next_key, NULL, + level, max_level); + } + if (ret < 0) { + err = ret; + goto out; + } + + if (ret > 0) { + level = ret; + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + replaced = 1; + } else if (leaf) { + /* + * no block got replaced, try replacing file extents + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + ret = replace_file_extents(trans, rc, root, leaf, + &inode_list); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + BUG_ON(ret < 0); + } + + ret = walk_up_reloc_tree(reloc_root, path, &level); + if (ret > 0) + break; + + BUG_ON(level == 0); + /* + * save the merging progress in the drop_progress. + * this is OK since root refs == 1 in this case. + */ + btrfs_node_key(path->nodes[level], &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + } + + /* + * handle the case only one block in the fs tree need to be + * relocated and the block is tree root. + */ + leaf = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + if (ret < 0) + err = ret; +out: + btrfs_free_path(path); + + if (err == 0) { + memset(&root_item->drop_progress, 0, + sizeof(root_item->drop_progress)); + root_item->drop_level = 0; + btrfs_set_root_refs(root_item, 0); + } + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + + /* + * put inodes while we aren't holding the tree locks + */ + while (!list_empty(&inode_list)) { + struct inodevec *ivec; + ivec = list_entry(inode_list.next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } + + if (replaced && rc->stage == UPDATE_DATA_PTRS) + invalidate_extent_cache(root, &key, &next_key); + + return err; +} + +/* + * callback for the work threads. + * this function merges reloc tree with corresponding fs tree, + * and then drops the reloc tree. + */ +static void merge_func(struct btrfs_work *work) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_root *reloc_root; + struct async_merge *async; + + async = container_of(work, struct async_merge, work); + reloc_root = async->root; + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + merge_reloc_root(async->rc, root); + + trans = btrfs_start_transaction(root, 1); + btrfs_update_reloc_root(trans, root); + btrfs_end_transaction(trans, root); + } + + btrfs_drop_dead_root(reloc_root); + + if (atomic_dec_and_test(async->num_pending)) + complete(async->done); + + kfree(async); +} + +static int merge_reloc_roots(struct reloc_control *rc) +{ + struct async_merge *async; + struct btrfs_root *root; + struct completion done; + atomic_t num_pending; + + init_completion(&done); + atomic_set(&num_pending, 1); + + while (!list_empty(&rc->reloc_roots)) { + root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&root->root_list); + + async = kmalloc(sizeof(*async), GFP_NOFS); + BUG_ON(!async); + async->work.func = merge_func; + async->work.flags = 0; + async->rc = rc; + async->root = root; + async->done = &done; + async->num_pending = &num_pending; + atomic_inc(&num_pending); + btrfs_queue_worker(&rc->workers, &async->work); + } + + if (!atomic_dec_and_test(&num_pending)) + wait_for_completion(&done); + + BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); + return 0; +} + +static void free_block_list(struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + while ((rb_node = rb_first(blocks))) { + block = rb_entry(rb_node, struct tree_block, rb_node); + rb_erase(rb_node, blocks); + kfree(block); + } +} + +static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *reloc_root) +{ + struct btrfs_root *root; + + if (reloc_root->last_trans == trans->transid) + return 0; + + root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); + + return btrfs_record_root_in_trans(trans, root); +} + +/* + * select one tree from trees that references the block. + * for blocks in refernce counted trees, we preper reloc tree. + * if no reloc tree found and reloc_only is true, NULL is returned. + */ +static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], + int *nr, int reloc_only) +{ + struct backref_node *next; + struct btrfs_root *root; + int index; + int loop = 0; +again: + index = 0; + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + if (!root) { + BUG_ON(!node->old_root); + goto skip; + } + + /* no other choice for non-refernce counted tree */ + if (!root->ref_cows) { + BUG_ON(reloc_only); + break; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + record_reloc_root_in_trans(trans, root); + break; + } + + if (loop) { + btrfs_record_root_in_trans(trans, root); + break; + } + + if (reloc_only || next != node) { + if (!root->reloc_root) + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + /* + * if the reloc tree was created in current + * transation, there is no node in backref tree + * corresponds to the root of the reloc tree. + */ + if (btrfs_root_last_snapshot(&root->root_item) == + trans->transid - 1) + break; + } +skip: + root = NULL; + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!root && !loop && !reloc_only) { + loop = 1; + goto again; + } + + if (root) + *nr = index; + else + *nr = 0; + + return root; +} + +static noinline_for_stack +struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, + struct backref_node *node) +{ + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int nr; + return __select_one_root(trans, node, edges, &nr, 0); +} + +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct backref_edge *edges[], int *nr) +{ + return __select_one_root(trans, node, edges, nr, 1); +} + +static void grab_path_buffers(struct btrfs_path *path, + struct backref_node *node, + struct backref_edge *edges[], int nr) +{ + int i = 0; + while (1) { + drop_node_buffer(node); + node->eb = path->nodes[node->level]; + BUG_ON(!node->eb); + if (path->locks[node->level]) + node->locked = 1; + path->nodes[node->level] = NULL; + path->locks[node->level] = 0; + + if (i >= nr) + break; + + edges[i]->blockptr = node->eb->start; + node = edges[i]->node[UPPER]; + i++; + } +} + +/* + * relocate a block tree, and then update pointers in upper level + * blocks that reference the block to point to the new location. + * + * if called by link_to_upper, the block has already been relocated. + * in that case this function just updates pointers. + */ +static int do_relocation(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path, int lowest) +{ + struct backref_node *upper; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + struct btrfs_root *root; + struct extent_buffer *eb; + u32 blocksize; + u64 bytenr; + u64 generation; + int nr; + int slot; + int ret; + int err = 0; + + BUG_ON(lowest && node->eb); + + path->lowest_level = node->level + 1; + list_for_each_entry(edge, &node->upper, list[LOWER]) { + cond_resched(); + if (node->eb && node->eb->start == edge->blockptr) + continue; + + upper = edge->node[UPPER]; + root = select_reloc_root(trans, upper, edges, &nr); + if (!root) + continue; + + if (upper->eb && !upper->locked) + drop_node_buffer(upper); + + if (!upper->eb) { + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + + slot = path->slots[upper->level]; + + btrfs_unlock_up_safe(path, upper->level + 1); + grab_path_buffers(path, upper, edges, nr); + + btrfs_release_path(NULL, path); + } else { + ret = btrfs_bin_search(upper->eb, key, upper->level, + &slot); + BUG_ON(ret); + } + + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (!lowest) { + if (node->eb->start == bytenr) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + continue; + } + } else { + BUG_ON(node->bytenr != bytenr); + } + + blocksize = btrfs_level_size(root, node->level); + generation = btrfs_node_ptr_generation(upper->eb, slot); + eb = read_tree_block(root, bytenr, blocksize, generation); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + + if (!node->eb) { + ret = btrfs_cow_block(trans, root, eb, upper->eb, + slot, &eb); + if (ret < 0) { + err = ret; + break; + } + btrfs_set_lock_blocking(eb); + node->eb = eb; + node->locked = 1; + } else { + btrfs_set_node_blockptr(upper->eb, slot, + node->eb->start); + btrfs_set_node_ptr_generation(upper->eb, slot, + trans->transid); + btrfs_mark_buffer_dirty(upper->eb); + + ret = btrfs_inc_extent_ref(trans, root, + node->eb->start, blocksize, + upper->eb->start, + btrfs_header_owner(upper->eb), + node->level, 0); + BUG_ON(ret); + + ret = btrfs_drop_subtree(trans, root, eb, upper->eb); + BUG_ON(ret); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (!lowest) { + btrfs_tree_unlock(upper->eb); + upper->locked = 0; + } + } + path->lowest_level = 0; + return err; +} + +static int link_to_upper(struct btrfs_trans_handle *trans, + struct backref_node *node, + struct btrfs_path *path) +{ + struct btrfs_key key; + if (!node->eb || list_empty(&node->upper)) + return 0; + + btrfs_node_key_to_cpu(node->eb, &key, 0); + return do_relocation(trans, node, &key, path, 0); +} + +static int finish_pending_nodes(struct btrfs_trans_handle *trans, + struct backref_cache *cache, + struct btrfs_path *path) +{ + struct backref_node *node; + int level; + int ret; + int err = 0; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + while (!list_empty(&cache->pending[level])) { + node = list_entry(cache->pending[level].next, + struct backref_node, lower); + BUG_ON(node->level != level); + + ret = link_to_upper(trans, node, path); + if (ret < 0) + err = ret; + /* + * this remove the node from the pending list and + * may add some other nodes to the level + 1 + * pending list + */ + remove_backref_node(cache, node); + } + } + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + return err; +} + +static void mark_block_processed(struct reloc_control *rc, + struct backref_node *node) +{ + u32 blocksize; + if (node->level == 0 || + in_block_group(node->bytenr, rc->block_group)) { + blocksize = btrfs_level_size(rc->extent_root, node->level); + set_extent_bits(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + GFP_NOFS); + } + node->processed = 1; +} + +/* + * mark a block and all blocks directly/indirectly reference the block + * as processed. + */ +static void update_processed_blocks(struct reloc_control *rc, + struct backref_node *node) +{ + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + int index = 0; + + while (next) { + cond_resched(); + while (1) { + if (next->processed) + break; + + mark_block_processed(rc, next); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } +} + +static int tree_block_processed(u64 bytenr, u32 blocksize, + struct reloc_control *rc) +{ + if (test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, 1)) + return 1; + return 0; +} + +/* + * check if there are any file extent pointers in the leaf point to + * data require processing + */ +static int check_file_extents(struct reloc_control *rc, + u64 bytenr, u32 blocksize, u64 ptr_gen) +{ + struct btrfs_key found_key; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + u32 nritems; + int i; + int ret = 0; + + leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + cond_resched(); + btrfs_item_key_to_cpu(leaf, &found_key, i); + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (bytenr == 0) + continue; + if (in_block_group(bytenr, rc->block_group)) { + ret = 1; + break; + } + } + free_extent_buffer(leaf); + return ret; +} + +/* + * scan child blocks of a given block to find blocks require processing + */ +static int add_child_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct rb_root *blocks) +{ + struct tree_block *block; + struct rb_node *rb_node; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 nritems; + int i; + int err = 0; + + nritems = btrfs_header_nritems(node->eb); + blocksize = btrfs_level_size(rc->extent_root, node->level - 1); + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + + readahead_tree_block(rc->extent_root, + bytenr, blocksize, ptr_gen); + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + bytenr = btrfs_node_blockptr(node->eb, i); + ptr_gen = btrfs_node_ptr_generation(node->eb, i); + if (ptr_gen == trans->transid) + continue; + if (!in_block_group(bytenr, rc->block_group) && + (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) + continue; + if (tree_block_processed(bytenr, blocksize, rc)) + continue; + if (!in_block_group(bytenr, rc->block_group) && + !check_file_extents(rc, bytenr, blocksize, ptr_gen)) + continue; + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = bytenr; + btrfs_node_key_to_cpu(node->eb, &block->key, i); + block->level = node->level - 1; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + } + if (err) + free_block_list(blocks); + return err; +} + +/* + * find adjacent blocks require processing + */ +static noinline_for_stack +int add_adjacent_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_cache *cache, + struct rb_root *blocks, int level, + struct backref_node **upper) +{ + struct backref_node *node; + int ret = 0; + + WARN_ON(!list_empty(&cache->pending[level])); + + if (list_empty(&cache->pending[level + 1])) + return 1; + + node = list_entry(cache->pending[level + 1].next, + struct backref_node, lower); + if (node->eb) + ret = add_child_blocks(trans, rc, node, blocks); + + *upper = node; + return ret; +} + +static int get_tree_block_key(struct reloc_control *rc, + struct tree_block *block) +{ + struct extent_buffer *eb; + + BUG_ON(block->key_ready); + eb = read_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + WARN_ON(btrfs_header_level(eb) != block->level); + if (block->level == 0) + btrfs_item_key_to_cpu(eb, &block->key, 0); + else + btrfs_node_key_to_cpu(eb, &block->key, 0); + free_extent_buffer(eb); + block->key_ready = 1; + return 0; +} + +static int reada_tree_block(struct reloc_control *rc, + struct tree_block *block) +{ + BUG_ON(block->key_ready); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid, block->key.offset); + return 0; +} + +/* + * helper function to relocate a tree block + */ +static int relocate_tree_block(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct btrfs_key *key, + struct btrfs_path *path) +{ + struct btrfs_root *root; + int ret; + + root = select_one_root(trans, node); + if (unlikely(!root)) { + rc->found_old_snapshot = 1; + update_processed_blocks(rc, node); + return 0; + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = do_relocation(trans, node, key, path, 1); + if (ret < 0) + goto out; + if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, + node->eb, NULL); + if (ret < 0) + goto out; + } + drop_node_buffer(node); + } else if (!root->ref_cows) { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret < 0) + goto out; + } else if (root != node->root) { + WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + } + + update_processed_blocks(rc, node); + ret = 0; +out: + drop_node_buffer(node); + return ret; +} + +/* + * relocate a list of blocks + */ +static noinline_for_stack +int relocate_tree_blocks(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct rb_root *blocks) +{ + struct backref_cache *cache; + struct backref_node *node; + struct btrfs_path *path; + struct tree_block *block; + struct rb_node *rb_node; + int level = -1; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + cache = kmalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + btrfs_free_path(path); + return -ENOMEM; + } + + backref_cache_init(cache); + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (level == -1) + level = block->level; + else + BUG_ON(level != block->level); + if (!block->key_ready) + reada_tree_block(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (!block->key_ready) + get_tree_block_key(rc, block); + rb_node = rb_next(rb_node); + } + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + + node = build_backref_tree(rc, cache, &block->key, + block->level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, &block->key, + path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + + if (level > 0) + goto out; + + free_block_list(blocks); + + /* + * now backrefs of some upper level tree blocks have been cached, + * try relocating blocks referenced by these upper level blocks. + */ + while (1) { + struct backref_node *upper = NULL; + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + break; + + ret = add_adjacent_blocks(trans, rc, cache, blocks, level, + &upper); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rb_node = rb_first(blocks); + while (rb_node) { + block = rb_entry(rb_node, struct tree_block, rb_node); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) + goto out; + BUG_ON(!block->key_ready); + node = build_backref_tree(rc, cache, &block->key, + level, block->bytenr); + if (IS_ERR(node)) { + err = PTR_ERR(node); + goto out; + } + + ret = relocate_tree_block(trans, rc, node, + &block->key, path); + if (ret < 0) { + err = ret; + goto out; + } + remove_backref_node(cache, node); + rb_node = rb_next(rb_node); + } + free_block_list(blocks); + + if (upper) { + ret = link_to_upper(trans, upper, path); + if (ret < 0) { + err = ret; + break; + } + remove_backref_node(cache, upper); + } + } +out: + free_block_list(blocks); + + ret = finish_pending_nodes(trans, cache, path); + if (ret < 0) + err = ret; + + kfree(cache); + btrfs_free_path(path); + return err; +} + +static noinline_for_stack +int relocate_inode_pages(struct inode *inode, u64 start, u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long i; + unsigned long first_index; + unsigned long last_index; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + struct page *page; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + min(last_index, ra->ra_pages + i - 1)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } +out_unlock: + mutex_unlock(&inode->i_mutex); + kfree(ra); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; +} + +static noinline_for_stack +int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, end, 0); + } + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(inode, start, extent_key->offset); +} + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 +static int get_ref_objectid_v0(struct reloc_control *rc, + struct btrfs_path *path, + struct btrfs_key *extent_key, + u64 *ref_objectid, int *path_change) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref_v0 *ref0; + int ret; + int slot; + + leaf = path->nodes[0]; + slot = path->slots[0]; + while (1) { + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) + return ret; + BUG_ON(ret > 0); + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path_change) + *path_change = 1; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != extent_key->objectid) + return -ENOENT; + + if (key.type != BTRFS_EXTENT_REF_V0_KEY) { + slot++; + continue; + } + ref0 = btrfs_item_ptr(leaf, slot, + struct btrfs_extent_ref_v0); + *ref_objectid = btrfs_ref_objectid_v0(leaf, ref0); + break; + } + return 0; +} +#endif + +/* + * helper to add a tree block to the list. + * the major work is getting the generation and level of the block + */ +static int add_tree_block(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct extent_buffer *eb; + struct btrfs_extent_item *ei; + struct btrfs_tree_block_info *bi; + struct tree_block *block; + struct rb_node *rb_node; + u32 item_size; + int level = -1; + int generation; + + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, path->slots[0]); + + if (item_size >= sizeof(*ei) + sizeof(*bi)) { + ei = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_item); + bi = (struct btrfs_tree_block_info *)(ei + 1); + generation = btrfs_extent_generation(eb, ei); + level = btrfs_tree_block_level(eb, bi); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int ret; + + BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, extent_key, + &ref_owner, NULL); + BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); + level = (int)ref_owner; + /* FIXME: get real generation */ + generation = 0; +#else + BUG(); +#endif + } + + btrfs_release_path(rc->extent_root, path); + + BUG_ON(level == -1); + + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) + return -ENOMEM; + + block->bytenr = extent_key->objectid; + block->key.objectid = extent_key->offset; + block->key.offset = generation; + block->level = level; + block->key_ready = 0; + + rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); + BUG_ON(rb_node); + + return 0; +} + +/* + * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY + */ +static int __add_tree_block(struct reloc_control *rc, + u64 bytenr, u32 blocksize, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + + if (tree_block_processed(bytenr, blocksize, rc)) + return 0; + + if (tree_search(blocks, bytenr)) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = blocksize; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret); + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ret = add_tree_block(rc, &key, path, blocks); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to check if the block use full backrefs for pointers in it + */ +static int block_use_full_backref(struct reloc_control *rc, + struct extent_buffer *eb) +{ + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct btrfs_key key; + u64 flags; + int ret; + + if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) || + btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) + return 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = eb->start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = eb->len; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + BUG_ON(ret); + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + flags = btrfs_extent_flags(path->nodes[0], ei); + BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); + if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = 1; + else + ret = 0; + btrfs_free_path(path); + return ret; +} + +/* + * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY + * this function scans fs tree to find blocks reference the data extent + */ +static int find_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + struct rb_root *blocks) +{ + struct btrfs_path *path; + struct tree_block *block; + struct btrfs_root *root; + struct btrfs_file_extent_item *fi; + struct rb_node *rb_node; + struct btrfs_key key; + u64 ref_root; + u64 ref_objectid; + u64 ref_offset; + u32 ref_count; + u32 nritems; + int err = 0; + int added = 0; + int counted; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ref_root = btrfs_extent_data_ref_root(leaf, ref); + ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); + ref_offset = btrfs_extent_data_ref_offset(leaf, ref); + ref_count = btrfs_extent_data_ref_count(leaf, ref); + + root = read_fs_root(rc->extent_root->fs_info, ref_root); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out; + } + + key.objectid = ref_objectid; + key.offset = ref_offset; + key.type = BTRFS_EXTENT_DATA_KEY; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + /* + * the references in tree blocks that use full backrefs + * are not counted in + */ + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + + while (ref_count > 0) { + while (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + WARN_ON(1); + goto out; + } + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + added = 0; + + if (block_use_full_backref(rc, leaf)) + counted = 0; + else + counted = 1; + rb_node = tree_search(blocks, leaf->start); + if (rb_node) { + if (counted) + added = 1; + else + path->slots[0] = nritems; + } + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ref_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) { + WARN_ON(1); + break; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + goto next; + + if (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid) + goto next; + + key.offset -= btrfs_file_extent_offset(leaf, fi); + if (key.offset != ref_offset) + goto next; + + if (counted) + ref_count--; + if (added) + goto next; + + if (!tree_block_processed(leaf->start, leaf->len, rc)) { + block = kmalloc(sizeof(*block), GFP_NOFS); + if (!block) { + err = -ENOMEM; + break; + } + block->bytenr = leaf->start; + btrfs_item_key_to_cpu(leaf, &block->key, 0); + block->level = 0; + block->key_ready = 1; + rb_node = tree_insert(blocks, block->bytenr, + &block->rb_node); + BUG_ON(rb_node); + } + if (counted) + added = 1; + else + path->slots[0] = nritems; +next: + path->slots[0]++; + + } +out: + btrfs_free_path(path); + return err; +} + +/* + * hepler to find all tree blocks that reference a given data extent + */ +static noinline_for_stack +int add_data_references(struct reloc_control *rc, + struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) +{ + struct btrfs_key key; + struct extent_buffer *eb; + struct btrfs_extent_data_ref *dref; + struct btrfs_extent_inline_ref *iref; + unsigned long ptr; + unsigned long end; + u32 blocksize; + int ret; + int err = 0; + + ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, + extent_key->offset); + BUG_ON(ret < 0); + if (ret > 0) { + /* the relocated data is fragmented */ + rc->extents_skipped++; + btrfs_release_path(rc->extent_root, path); + return 0; + } + + blocksize = btrfs_level_size(rc->extent_root, 0); + + eb = path->nodes[0]; + ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + end = ptr + btrfs_item_size_nr(eb, path->slots[0]); +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (ptr + sizeof(struct btrfs_extent_item_v0) == end) + ptr = end; + else +#endif + ptr += sizeof(struct btrfs_extent_item); + + while (ptr < end) { + iref = (struct btrfs_extent_inline_ref *)ptr; + key.type = btrfs_extent_inline_ref_type(eb, iref); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { + key.offset = btrfs_extent_inline_ref_offset(eb, iref); + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = (struct btrfs_extent_data_ref *)(&iref->offset); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + BUG(); + } + ptr += btrfs_extent_inline_ref_size(key.type); + } + WARN_ON(ptr > end); + + while (1) { + cond_resched(); + eb = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(eb)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret < 0) { + err = ret; + break; + } + if (ret > 0) + break; + eb = path->nodes[0]; + } + + btrfs_item_key_to_cpu(eb, &key, path->slots[0]); + if (key.objectid != extent_key->objectid) + break; + +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + if (key.type == BTRFS_SHARED_DATA_REF_KEY || + key.type == BTRFS_EXTENT_REF_V0_KEY) { +#else + BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + if (key.type == BTRFS_SHARED_DATA_REF_KEY) { +#endif + ret = __add_tree_block(rc, key.offset, blocksize, + blocks); + } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { + dref = btrfs_item_ptr(eb, path->slots[0], + struct btrfs_extent_data_ref); + ret = find_data_references(rc, extent_key, + eb, dref, blocks); + } else { + ret = 0; + } + if (ret) { + err = ret; + break; + } + path->slots[0]++; + } + btrfs_release_path(rc->extent_root, path); + if (err) + free_block_list(blocks); + return err; +} + +/* + * hepler to find next unprocessed extent + */ +static noinline_for_stack +int find_next_extent(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct btrfs_path *path) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + u64 start, end, last; + int ret; + + last = rc->block_group->key.objectid + rc->block_group->key.offset; + while (1) { + cond_resched(); + if (rc->search_start >= last) { + ret = 1; + break; + } + + key.objectid = rc->search_start; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, + 0, 0); + if (ret < 0) + break; +next: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(rc->extent_root, path); + if (ret != 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid >= last) { + ret = 1; + break; + } + + if (key.type != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= rc->search_start) { + path->slots[0]++; + goto next; + } + + ret = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY); + + if (ret == 0 && start <= key.objectid) { + btrfs_release_path(rc->extent_root, path); + rc->search_start = end + 1; + } else { + rc->search_start = key.objectid + key.offset; + return 0; + } + } + btrfs_release_path(rc->extent_root, path); + return ret; +} + +static void set_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = rc; + mutex_unlock(&fs_info->trans_mutex); +} + +static void unset_reloc_control(struct reloc_control *rc) +{ + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; + mutex_lock(&fs_info->trans_mutex); + fs_info->reloc_ctl = NULL; + mutex_unlock(&fs_info->trans_mutex); +} + +static int check_extent_flags(u64 flags) +{ + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if (!(flags & BTRFS_EXTENT_FLAG_DATA) && + !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return 1; + if ((flags & BTRFS_EXTENT_FLAG_DATA) && + (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) + return 1; + return 0; +} + +static noinline_for_stack int relocate_block_group(struct reloc_control *rc) +{ + struct rb_root blocks = RB_ROOT; + struct btrfs_key key; + struct btrfs_trans_handle *trans = NULL; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + unsigned long nr; + u64 flags; + u32 item_size; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + rc->search_start = rc->block_group->key.objectid; + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); + + rc->create_reloc_root = 1; + set_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + while (1) { + trans = btrfs_start_transaction(rc->extent_root, 1); + + ret = find_next_extent(trans, rc, path); + if (ret < 0) + err = ret; + if (ret != 0) + break; + + rc->extents_found++; + + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (item_size >= sizeof(*ei)) { + flags = btrfs_extent_flags(path->nodes[0], ei); + ret = check_extent_flags(flags); + BUG_ON(ret); + + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + u64 ref_owner; + int path_change = 0; + + BUG_ON(item_size != + sizeof(struct btrfs_extent_item_v0)); + ret = get_ref_objectid_v0(rc, path, &key, &ref_owner, + &path_change); + if (ref_owner < BTRFS_FIRST_FREE_OBJECTID) + flags = BTRFS_EXTENT_FLAG_TREE_BLOCK; + else + flags = BTRFS_EXTENT_FLAG_DATA; + + if (path_change) { + btrfs_release_path(rc->extent_root, path); + + path->search_commit_root = 1; + path->skip_locking = 1; + ret = btrfs_search_slot(NULL, rc->extent_root, + &key, path, 0, 0); + if (ret < 0) { + err = ret; + break; + } + BUG_ON(ret > 0); + } +#else + BUG(); +#endif + } + + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { + ret = add_tree_block(rc, &key, path, &blocks); + } else if (rc->stage == UPDATE_DATA_PTRS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + ret = add_data_references(rc, &key, path, &blocks); + } else { + btrfs_release_path(rc->extent_root, path); + ret = 0; + } + if (ret < 0) { + err = 0; + break; + } + + if (!RB_EMPTY_ROOT(&blocks)) { + ret = relocate_tree_blocks(trans, rc, &blocks); + if (ret < 0) { + err = ret; + break; + } + } + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + trans = NULL; + btrfs_btree_balance_dirty(rc->extent_root, nr); + + if (rc->stage == MOVE_DATA_EXTENTS && + (flags & BTRFS_EXTENT_FLAG_DATA)) { + rc->found_file_extent = 1; + ret = relocate_data_extent(rc->data_inode, &key); + if (ret < 0) { + err = ret; + break; + } + } + } + btrfs_free_path(path); + + if (trans) { + nr = trans->blocks_used; + btrfs_end_transaction(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } + + rc->create_reloc_root = 0; + smp_mb(); + + if (rc->extents_found > 0) { + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + } + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + /* get rid of pinned extents */ + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + return err; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to create inode for data relocation. + * the inode is in data relocation tree and its link count is 0 + */ +static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key key; + unsigned long nr; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + + btrfs_btree_balance_dirty(root, nr); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +/* + * function to relocate all extents in a block group. + */ +int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct reloc_control *rc; + int ret; + int err = 0; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return -ENOMEM; + + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + INIT_LIST_HEAD(&rc->reloc_roots); + + rc->block_group = btrfs_lookup_block_group(fs_info, group_start); + BUG_ON(!rc->block_group); + + btrfs_init_workers(&rc->workers, "relocate", + fs_info->thread_pool_size); + + rc->extent_root = extent_root; + btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + goto out; + } + + printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n", + (unsigned long long)rc->block_group->key.objectid, + (unsigned long long)rc->block_group->flags); + + btrfs_start_delalloc_inodes(fs_info->tree_root); + btrfs_wait_ordered_extents(fs_info->tree_root, 0); + + while (1) { + mutex_lock(&fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(fs_info->tree_root); + mutex_unlock(&fs_info->cleaner_mutex); + + rc->extents_found = 0; + rc->extents_skipped = 0; + + ret = relocate_block_group(rc); + if (ret < 0) { + err = ret; + break; + } + + if (rc->extents_found == 0) + break; + + printk(KERN_INFO "btrfs: found %llu extents\n", + (unsigned long long)rc->extents_found); + + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { + btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1); + invalidate_mapping_pages(rc->data_inode->i_mapping, + 0, -1); + rc->stage = UPDATE_DATA_PTRS; + } else if (rc->stage == UPDATE_DATA_PTRS && + rc->extents_skipped >= rc->extents_found) { + iput(rc->data_inode); + rc->data_inode = create_reloc_inode(fs_info, + rc->block_group); + if (IS_ERR(rc->data_inode)) { + err = PTR_ERR(rc->data_inode); + rc->data_inode = NULL; + break; + } + rc->stage = MOVE_DATA_EXTENTS; + rc->found_file_extent = 0; + } + } + + filemap_fdatawrite_range(fs_info->btree_inode->i_mapping, + rc->block_group->key.objectid, + rc->block_group->key.objectid + + rc->block_group->key.offset - 1); + + WARN_ON(rc->block_group->pinned > 0); + WARN_ON(rc->block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); +out: + iput(rc->data_inode); + btrfs_stop_workers(&rc->workers); + btrfs_put_block_group(rc->block_group); + kfree(rc); + return err; +} + +/* + * recover relocation interrupted by system crash. + * + * this function resumes merging reloc trees with corresponding fs trees. + * this is important for keeping the sharing of tree blocks + */ +int btrfs_recover_relocation(struct btrfs_root *root) +{ + LIST_HEAD(reloc_roots); + struct btrfs_key key; + struct btrfs_root *fs_root; + struct btrfs_root *reloc_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct reloc_control *rc = NULL; + struct btrfs_trans_handle *trans; + int ret; + int err = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_TREE_RELOC_OBJECTID; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + path, 0, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root->fs_info->tree_root, path); + + if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || + key.type != BTRFS_ROOT_ITEM_KEY) + break; + + reloc_root = btrfs_read_fs_root_no_radix(root, &key); + if (IS_ERR(reloc_root)) { + err = PTR_ERR(reloc_root); + goto out; + } + + list_add(&reloc_root->root_list, &reloc_roots); + + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + if (IS_ERR(fs_root)) { + err = PTR_ERR(fs_root); + goto out; + } + } + + if (key.offset == 0) + break; + + key.offset--; + } + btrfs_release_path(root->fs_info->tree_root, path); + + if (list_empty(&reloc_roots)) + goto out; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) { + err = -ENOMEM; + goto out; + } + + mapping_tree_init(&rc->reloc_root_tree); + INIT_LIST_HEAD(&rc->reloc_roots); + btrfs_init_workers(&rc->workers, "relocate", + root->fs_info->thread_pool_size); + rc->extent_root = root->fs_info->extent_root; + + set_reloc_control(rc); + + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + + if (btrfs_root_refs(&reloc_root->root_item) == 0) { + list_add_tail(&reloc_root->root_list, + &rc->reloc_roots); + continue; + } + + fs_root = read_fs_root(root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(fs_root)); + + __add_reloc_root(reloc_root); + fs_root->reloc_root = reloc_root; + } + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + + merge_reloc_roots(rc); + + unset_reloc_control(rc); + + trans = btrfs_start_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); +out: + if (rc) { + btrfs_stop_workers(&rc->workers); + kfree(rc); + } + while (!list_empty(&reloc_roots)) { + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); + list_del(&reloc_root->root_list); + free_extent_buffer(reloc_root->node); + free_extent_buffer(reloc_root->commit_root); + kfree(reloc_root); + } + btrfs_free_path(path); + + if (err == 0) { + /* cleanup orphan inode in data relocation tree */ + fs_root = read_fs_root(root->fs_info, + BTRFS_DATA_RELOC_TREE_OBJECTID); + if (IS_ERR(fs_root)) + err = PTR_ERR(fs_root); + } + return err; +} + +/* + * helper to add ordered checksum for data relocation. + * + * cloning checksum properly handles the nodatasum extents. + * it also saves CPU time to re-calculate the checksum. + */ +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + size_t offset; + int ret; + u64 disk_bytenr; + LIST_HEAD(list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index b48650d..0ddc6d6 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -111,6 +111,15 @@ out: return ret; } +int btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node) +{ + btrfs_set_root_bytenr(item, node->start); + btrfs_set_root_level(item, btrfs_header_level(node)); + btrfs_set_root_generation(item, btrfs_header_generation(node)); + return 0; +} + /* * copy the data in 'item' into the btree */ @@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root * offset lower than the latest root. They need to be queued for deletion to * finish what was happening when we crashed. */ -int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, - struct btrfs_root *latest) +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) { struct btrfs_root *dead_root; struct btrfs_item *item; @@ -227,10 +235,7 @@ again: goto err; } - if (objectid == BTRFS_TREE_RELOC_OBJECTID) - ret = btrfs_add_dead_reloc_root(dead_root); - else - ret = btrfs_add_dead_root(dead_root, latest); + ret = btrfs_add_dead_root(dead_root); if (ret) goto err; goto again; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2ff7cd2..e9ef8c3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,7 +52,6 @@ #include "export.h" #include "compression.h" - static struct super_operations btrfs_super_ops; static void btrfs_put_super(struct super_block *sb) @@ -322,7 +321,7 @@ static int btrfs_fill_super(struct super_block *sb, struct dentry *root_dentry; struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; - struct btrfs_inode *bi; + struct btrfs_key key; int err; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -341,23 +340,15 @@ static int btrfs_fill_super(struct super_block *sb, } sb->s_fs_info = tree_root; disk_super = &tree_root->fs_info->super_copy; - inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, - tree_root->fs_info->fs_root); - bi = BTRFS_I(inode); - bi->location.objectid = inode->i_ino; - bi->location.offset = 0; - bi->root = tree_root->fs_info->fs_root; - - btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); - if (!inode) { - err = -ENOMEM; + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); goto fail_close; } - if (inode->i_state & I_NEW) { - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } root_dentry = d_alloc_root(inode); if (!root_dentry) { @@ -584,7 +575,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - ret = btrfs_cleanup_reloc_trees(root); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); ret = btrfs_cleanup_fs_roots(root->fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 01b1436..2e177d7 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -25,7 +25,6 @@ #include "disk-io.h" #include "transaction.h" #include "locking.h" -#include "ref-cache.h" #include "tree-log.h" #define BTRFS_ROOT_TRANS_TAG 0 @@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root) * to make sure the old root from before we joined the transaction is deleted * when the transaction commits */ -noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; - u64 running_trans_id = root->fs_info->running_transaction->transid; - if (root->ref_cows && root->last_trans < running_trans_id) { + if (root->ref_cows && root->last_trans < trans->transid) { WARN_ON(root == root->fs_info->extent_root); - if (root->root_item.refs != 0) { - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - BUG_ON(!dirty); - dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); - BUG_ON(!dirty->root); - dirty->latest_root = root; - INIT_LIST_HEAD(&dirty->list); - - root->commit_root = btrfs_root_node(root); - - memcpy(dirty->root, root, sizeof(*root)); - spin_lock_init(&dirty->root->node_lock); - spin_lock_init(&dirty->root->list_lock); - mutex_init(&dirty->root->objectid_mutex); - mutex_init(&dirty->root->log_mutex); - INIT_LIST_HEAD(&dirty->root->dead_list); - dirty->root->node = root->commit_root; - dirty->root->commit_root = NULL; + WARN_ON(root->root_item.refs == 0); + WARN_ON(root->commit_root != root->node); + + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + root->last_trans = trans->transid; + btrfs_init_reloc_root(trans, root); + } + return 0; +} - spin_lock(&root->list_lock); - list_add(&dirty->root->dead_list, &root->dead_list); - spin_unlock(&root->list_lock); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!root->ref_cows) + return 0; - root->dirty_root = dirty; - } else { - WARN_ON(1); - } - root->last_trans = running_trans_id; + mutex_lock(&root->fs_info->trans_mutex); + if (root->last_trans == trans->transid) { + mutex_unlock(&root->fs_info->trans_mutex); + return 0; } + + record_root_in_trans(trans, root); + mutex_unlock(&root->fs_info->trans_mutex); return 0; } @@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, ret = join_transaction(root); BUG_ON(ret); - btrfs_record_root_in_trans(root); h->transid = root->fs_info->running_transaction->transid; h->transaction = root->fs_info->running_transaction; h->blocks_reserved = num_blocks; @@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, h->delayed_ref_updates = 0; root->fs_info->running_transaction->use_count++; + record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); return h; } @@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +#if 0 /* * rate limit against the drop_snapshot code. This helps to slow down new * operations if the drop_snapshot code isn't able to keep up. @@ -273,6 +265,7 @@ harder: goto harder; } } +#endif void btrfs_throttle(struct btrfs_root *root) { @@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root) if (!root->fs_info->open_ioctl_trans) wait_current_trans(root); mutex_unlock(&root->fs_info->trans_mutex); - throttle_on_drops(root); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (throttle) - throttle_on_drops(root); - return 0; } @@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) break; - btrfs_set_root_bytenr(&root->root_item, - root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(root->node)); - btrfs_set_root_generation(&root->root_item, trans->transid); + btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); @@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); BUG_ON(ret); } + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); return 0; } /* * update all the cowonly tree roots on disk */ -int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *next; @@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ -int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +int btrfs_add_dead_root(struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; - - dirty = kmalloc(sizeof(*dirty), GFP_NOFS); - if (!dirty) - return -ENOMEM; - dirty->root = root; - dirty->latest_root = latest; - mutex_lock(&root->fs_info->trans_mutex); - list_add(&dirty->list, &latest->fs_info->dead_roots); + list_add(&root->root_list, &root->fs_info->dead_roots); mutex_unlock(&root->fs_info->trans_mutex); return 0; } /* - * at transaction commit time we need to schedule the old roots for - * deletion via btrfs_drop_snapshot. This runs through all the - * reference counted roots that were modified in the current - * transaction and puts them into the drop list + * update all the cowonly tree roots on disk */ -static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, - struct radix_tree_root *radix, - struct list_head *list) +static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; struct btrfs_root *gang[8]; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info = root->fs_info; int i; int ret; int err = 0; - u32 refs; while (1) { - ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { root = gang[i]; - radix_tree_tag_clear(radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - - BUG_ON(!root->ref_tree); - dirty = root->dirty_root; + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); btrfs_free_log(trans, root); - btrfs_free_reloc_root(trans, root); - - if (root->commit_root == root->node) { - WARN_ON(root->node->start != - btrfs_root_bytenr(&root->root_item)); - - free_extent_buffer(root->commit_root); - root->commit_root = NULL; - root->dirty_root = NULL; - - spin_lock(&root->list_lock); - list_del_init(&dirty->root->dead_list); - spin_unlock(&root->list_lock); + btrfs_update_reloc_root(trans, root); - kfree(dirty->root); - kfree(dirty); - - /* make sure to update the root on disk - * so we get any updates to the block used - * counts - */ - err = btrfs_update_root(trans, - root->fs_info->tree_root, - &root->root_key, - &root->root_item); + if (root->commit_root == root->node) continue; - } - memset(&root->root_item.drop_progress, 0, - sizeof(struct btrfs_disk_key)); - root->root_item.drop_level = 0; - root->commit_root = NULL; - root->dirty_root = NULL; - root->root_key.offset = root->fs_info->generation; - btrfs_set_root_bytenr(&root->root_item, - root->node->start); - btrfs_set_root_level(&root->root_item, - btrfs_header_level(root->node)); - btrfs_set_root_generation(&root->root_item, - root->root_key.offset); - - err = btrfs_insert_root(trans, root->fs_info->tree_root, + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); + + btrfs_set_root_node(&root->root_item, root->node); + err = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (err) break; - - refs = btrfs_root_refs(&dirty->root->root_item); - btrfs_set_root_refs(&dirty->root->root_item, refs - 1); - err = btrfs_update_root(trans, root->fs_info->tree_root, - &dirty->root->root_key, - &dirty->root->root_item); - - BUG_ON(err); - if (refs == 1) { - list_add(&dirty->list, list); - } else { - WARN_ON(1); - free_extent_buffer(dirty->root->node); - kfree(dirty->root); - kfree(dirty); - } } } return err; @@ -688,12 +611,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); - atomic_dec(&info->throttles); - wake_up(&info->transaction_throttle); - schedule(); - atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } @@ -705,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on * all of them */ -static noinline int drop_dirty_roots(struct btrfs_root *tree_root, - struct list_head *list) +int btrfs_drop_dead_root(struct btrfs_root *root) { - struct btrfs_dirty_root *dirty; struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; unsigned long nr; - u64 num_bytes; - u64 bytes_used; - u64 max_useless; - int ret = 0; - int err; - - while (!list_empty(list)) { - struct btrfs_root *root; - - dirty = list_entry(list->prev, struct btrfs_dirty_root, list); - list_del_init(&dirty->list); - - num_bytes = btrfs_root_used(&dirty->root->root_item); - root = dirty->latest_root; - atomic_inc(&root->fs_info->throttles); - - while (1) { - /* - * we don't want to jump in and create a bunch of - * delayed refs if the transaction is starting to close - */ - wait_transaction_pre_flush(tree_root->fs_info); - trans = btrfs_start_transaction(tree_root, 1); - - /* - * we've joined a transaction, make sure it isn't - * closing right now - */ - if (trans->transaction->delayed_refs.flushing) { - btrfs_end_transaction(trans, tree_root); - continue; - } - - mutex_lock(&root->fs_info->drop_mutex); - ret = btrfs_drop_snapshot(trans, dirty->root); - if (ret != -EAGAIN) - break; - mutex_unlock(&root->fs_info->drop_mutex); + int ret; - err = btrfs_update_root(trans, - tree_root, - &dirty->root->root_key, - &dirty->root->root_item); - if (err) - ret = err; - nr = trans->blocks_used; - ret = btrfs_end_transaction(trans, tree_root); - BUG_ON(ret); + while (1) { + /* + * we don't want to jump in and create a bunch of + * delayed refs if the transaction is starting to close + */ + wait_transaction_pre_flush(tree_root->fs_info); + trans = btrfs_start_transaction(tree_root, 1); - btrfs_btree_balance_dirty(tree_root, nr); - cond_resched(); + /* + * we've joined a transaction, make sure it isn't + * closing right now + */ + if (trans->transaction->delayed_refs.flushing) { + btrfs_end_transaction(trans, tree_root); + continue; } - BUG_ON(ret); - atomic_dec(&root->fs_info->throttles); - wake_up(&root->fs_info->transaction_throttle); - num_bytes -= btrfs_root_used(&dirty->root->root_item); - bytes_used = btrfs_root_used(&root->root_item); - if (num_bytes) { - mutex_lock(&root->fs_info->trans_mutex); - btrfs_record_root_in_trans(root); - mutex_unlock(&root->fs_info->trans_mutex); - btrfs_set_root_used(&root->root_item, - bytes_used - num_bytes); - } + ret = btrfs_drop_snapshot(trans, root); + if (ret != -EAGAIN) + break; - ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); - if (ret) { - BUG(); + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + if (ret) break; - } - mutex_unlock(&root->fs_info->drop_mutex); - - spin_lock(&root->list_lock); - list_del_init(&dirty->root->dead_list); - if (!list_empty(&root->dead_list)) { - struct btrfs_root *oldest; - oldest = list_entry(root->dead_list.prev, - struct btrfs_root, dead_list); - max_useless = oldest->root_key.offset - 1; - } else { - max_useless = root->root_key.offset - 1; - } - spin_unlock(&root->list_lock); nr = trans->blocks_used; ret = btrfs_end_transaction(trans, tree_root); BUG_ON(ret); - ret = btrfs_remove_leaf_refs(root, max_useless, 0); - BUG_ON(ret); - - free_extent_buffer(dirty->root->node); - kfree(dirty->root); - kfree(dirty); - btrfs_btree_balance_dirty(tree_root, nr); cond_resched(); } + BUG_ON(ret); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); + + btrfs_btree_balance_dirty(tree_root, nr); return ret; } @@ -839,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret) goto fail; - btrfs_record_root_in_trans(root); + record_root_in_trans(trans, root); btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); key.objectid = objectid; - key.offset = trans->transid; + key.offset = 0; btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); old = btrfs_lock_root_node(root); btrfs_cow_block(trans, root, old, NULL, 0, &old); + btrfs_set_lock_blocking(old); btrfs_copy_root(trans, root, old, &tmp, objectid); btrfs_tree_unlock(old); free_extent_buffer(old); - btrfs_set_root_bytenr(new_root_item, tmp->start); - btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); - btrfs_set_root_generation(new_root_item, trans->transid); + btrfs_set_root_node(new_root_item, tmp); ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); @@ -964,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, return 0; } +static void update_super_roots(struct btrfs_root *root) +{ + struct btrfs_root_item *root_item; + struct btrfs_super_block *super; + + super = &root->fs_info->super_copy; + + root_item = &root->fs_info->chunk_root->root_item; + super->chunk_root = root_item->bytenr; + super->chunk_root_generation = root_item->generation; + super->chunk_root_level = root_item->level; + + root_item = &root->fs_info->tree_root->root_item; + super->root = root_item->bytenr; + super->generation = root_item->generation; + super->root_level = root_item->level; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -971,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; - struct btrfs_root *chunk_root = root->fs_info->chunk_root; - struct list_head dirty_fs_roots; struct extent_io_tree *pinned_copy; DEFINE_WAIT(wait); int ret; @@ -999,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, BUG_ON(ret); mutex_lock(&root->fs_info->trans_mutex); - INIT_LIST_HEAD(&dirty_fs_roots); if (cur_trans->in_commit) { cur_trans->use_count++; mutex_unlock(&root->fs_info->trans_mutex); @@ -1105,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * with the tree-log code. */ mutex_lock(&root->fs_info->tree_log_mutex); - /* - * keep tree reloc code from adding new reloc trees - */ - mutex_lock(&root->fs_info->tree_reloc_mutex); - - ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, - &dirty_fs_roots); + ret = commit_fs_roots(trans, root); BUG_ON(ret); - /* add_dirty_roots gets rid of all the tree log roots, it is now + /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, root->fs_info); - ret = btrfs_commit_tree_roots(trans, root); + ret = commit_cowonly_roots(trans, root); BUG_ON(ret); cur_trans = root->fs_info->running_transaction; spin_lock(&root->fs_info->new_trans_lock); root->fs_info->running_transaction = NULL; spin_unlock(&root->fs_info->new_trans_lock); - btrfs_set_super_generation(&root->fs_info->super_copy, - cur_trans->transid); - btrfs_set_super_root(&root->fs_info->super_copy, - root->fs_info->tree_root->node->start); - btrfs_set_super_root_level(&root->fs_info->super_copy, - btrfs_header_level(root->fs_info->tree_root->node)); - - btrfs_set_super_chunk_root(&root->fs_info->super_copy, - chunk_root->node->start); - btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, - btrfs_header_level(chunk_root->node)); - btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, - btrfs_header_generation(chunk_root->node)); + + btrfs_set_root_node(&root->fs_info->tree_root->root_item, + root->fs_info->tree_root->node); + free_extent_buffer(root->fs_info->tree_root->commit_root); + root->fs_info->tree_root->commit_root = + btrfs_root_node(root->fs_info->tree_root); + + btrfs_set_root_node(&root->fs_info->chunk_root->root_item, + root->fs_info->chunk_root->node); + free_extent_buffer(root->fs_info->chunk_root->commit_root); + root->fs_info->chunk_root->commit_root = + btrfs_root_node(root->fs_info->chunk_root); + + update_super_roots(root); if (!root->fs_info->log_root_recovering) { btrfs_set_super_log_root(&root->fs_info->super_copy, 0); @@ -1153,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->blocked = 0; - wake_up(&root->fs_info->transaction_throttle); wake_up(&root->fs_info->transaction_wait); mutex_unlock(&root->fs_info->trans_mutex); @@ -1170,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_finish_extent_commit(trans, root, pinned_copy); kfree(pinned_copy); - btrfs_drop_dead_reloc_roots(root); - mutex_unlock(&root->fs_info->tree_reloc_mutex); - /* do the directory inserts of any pending snapshot creations */ finish_pending_snapshots(trans, root->fs_info); @@ -1186,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, put_transaction(cur_trans); put_transaction(cur_trans); - list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); - if (root->fs_info->closing) - list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); - mutex_unlock(&root->fs_info->trans_mutex); kmem_cache_free(btrfs_trans_handle_cachep, trans); - - if (root->fs_info->closing) - drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); return ret; } @@ -1204,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ int btrfs_clean_old_snapshots(struct btrfs_root *root) { - struct list_head dirty_roots; - INIT_LIST_HEAD(&dirty_roots); -again: - mutex_lock(&root->fs_info->trans_mutex); - list_splice_init(&root->fs_info->dead_roots, &dirty_roots); - mutex_unlock(&root->fs_info->trans_mutex); + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + + mutex_lock(&fs_info->trans_mutex); + list_splice_init(&fs_info->dead_roots, &list); + mutex_unlock(&fs_info->trans_mutex); - if (!list_empty(&dirty_roots)) { - drop_dirty_roots(root, &dirty_roots); - goto again; + while (!list_empty(&list)) { + root = list_entry(list.next, struct btrfs_root, root_list); + list_del_init(&root->root_list); + btrfs_drop_dead_root(root); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 94f5bde..961c3ee 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -62,12 +62,6 @@ struct btrfs_pending_snapshot { struct list_head list; }; -struct btrfs_dirty_root { - struct list_head list; - struct btrfs_root *root; - struct btrfs_root *latest_root; -}; - static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, struct inode *inode) { @@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_add_dead_root(struct btrfs_root *root); +int btrfs_drop_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, @@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); -int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index db5e212..2b41fc0 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -430,18 +430,16 @@ no_copy: static noinline struct inode *read_one_inode(struct btrfs_root *root, u64 objectid) { + struct btrfs_key key; struct inode *inode; - inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); - if (inode->i_state & I_NEW) { - BTRFS_I(inode)->root = root; - BTRFS_I(inode)->location.objectid = objectid; - BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; - BTRFS_I(inode)->location.offset = 0; - btrfs_read_locked_inode(inode); - unlock_new_inode(inode); - } - if (is_bad_inode(inode)) { + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + inode = btrfs_iget(root->fs_info->sb, &key, root); + if (IS_ERR(inode)) { + inode = NULL; + } else if (is_bad_inode(inode)) { iput(inode); inode = NULL; } @@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 offset; unsigned long dest_offset; struct btrfs_key ins; @@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + offset = key->offset - btrfs_file_extent_offset(eb, item); if (ins.objectid > 0) { u64 csum_start; @@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (ret == 0) { ret = btrfs_inc_extent_ref(trans, root, ins.objectid, ins.offset, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid); + 0, root->root_key.objectid, + key->objectid, offset); } else { /* * insert the extent pointer in the extent * allocation tree */ - ret = btrfs_alloc_logged_extent(trans, root, - path->nodes[0]->start, - root->root_key.objectid, - trans->transid, key->objectid, - &ins); + ret = btrfs_alloc_logged_file_extent(trans, + root, root->root_key.objectid, + key->objectid, offset, &ins); BUG_ON(ret); } btrfs_release_path(root, path); @@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, @@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, next); - BUG_ON(ret); - } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, bytenr, blocksize); BUG_ON(ret); @@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, - next); - BUG_ON(ret); - } - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(root, path->nodes[*level]->start, @@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); - if (orig_level == 0) { - ret = btrfs_drop_leaf_ref(trans, log, - next); - BUG_ON(ret); - } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); ret = btrfs_free_reserved_extent(log, next->start, @@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); BUG_ON(ret); - btrfs_set_root_bytenr(&log->root_item, log->node->start); - btrfs_set_root_generation(&log->root_item, trans->transid); - btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; root->log_transid++; @@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_keys, ins_sizes, nr); BUG_ON(ret); - for (i = 0; i < nr; i++) { + for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_path->slots[0]); @@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, found_type = btrfs_file_extent_type(src, extent); if (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 ds = btrfs_file_extent_disk_bytenr(src, - extent); - u64 dl = btrfs_file_extent_disk_num_bytes(src, - extent); - u64 cs = btrfs_file_extent_offset(src, extent); - u64 cl = btrfs_file_extent_num_bytes(src, - extent);; + u64 ds, dl, cs, cl; + ds = btrfs_file_extent_disk_bytenr(src, + extent); + /* ds == 0 is a hole */ + if (ds == 0) + continue; + + dl = btrfs_file_extent_disk_num_bytes(src, + extent); + cs = btrfs_file_extent_offset(src, extent); + cl = btrfs_file_extent_num_bytes(src, + extent);; if (btrfs_file_extent_compression(src, extent)) { cs = 0; cl = dl; } - /* ds == 0 is a hole */ - if (ds != 0) { - ret = btrfs_inc_extent_ref(trans, log, - ds, dl, - dst_path->nodes[0]->start, - BTRFS_TREE_LOG_OBJECTID, - trans->transid, - ins_keys[i].objectid); - BUG_ON(ret); - ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, - ds + cs, ds + cs + cl - 1, - &ordered_sums); - BUG_ON(ret); - } + + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); } } - dst_path->slots[0]++; } btrfs_mark_buffer_dirty(dst_path->nodes[0]); @@ -3029,9 +3001,7 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; - mutex_lock(&fs_info->trans_mutex); - btrfs_record_root_in_trans(wc.replay_dest); - mutex_unlock(&fs_info->trans_mutex); + btrfs_record_root_in_trans(trans, wc.replay_dest); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6d35b0..8bc6a88 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1671,8 +1671,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, int ret; int i; - printk(KERN_INFO "btrfs relocating chunk %llu\n", - (unsigned long long)chunk_offset); root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; -- cgit v1.1 From b36124210248706186a02093427bdff4b3f548e8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 13 May 2009 19:12:15 -0400 Subject: Btrfs: stop avoiding balancing at the end of the transaction. When the delayed reference code was added, some checks were added to avoid extra balancing while the delayed references were being flushed. This made for less efficient btrees, but it reduced the chances of loops where no forward progress was made because the balances made more delayed ref updates. With the new dead root removal code and the mixed back references, the extent allocation tree is no longer using precise back refs, and the delayed reference updates don't carry the risk of looping forever anymore. So, the balance avoidance is no longer required. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2b96027..2f633e7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1052,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (trans->transaction->delayed_refs.flushing && - btrfs_header_nritems(mid) > 2) + if (btrfs_header_nritems(mid) > 2) return 0; if (btrfs_header_nritems(mid) < 2) @@ -2194,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = insert_new_root(trans, root, path, level + 1); if (ret) return ret; - } else if (!trans->transaction->delayed_refs.flushing) { + } else { ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < @@ -2869,8 +2868,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int num_doubles = 0; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY && - !trans->transaction->delayed_refs.flushing) { + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { wret = push_leaf_right(trans, root, path, data_size, 0); if (wret < 0) return wret; @@ -3809,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 && - !trans->transaction->delayed_refs.flushing) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below -- cgit v1.1 From cfbb9308463f6dad1334884db046ccf0f1a77918 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 18 May 2009 10:41:58 -0400 Subject: Btrfs: balance btree more often With the new back reference code, the cost of a balance has gone down in terms of the number of back reference updates done. This commit makes us more aggressively balance leaves and nodes as they become less full. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 2f633e7..60a45f3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1651,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { int sret; sret = reada_for_balance(root, p, level); @@ -3807,7 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below -- cgit v1.1 From 2c943de6ad795a174dcc424c293bb77f15ae3b8c Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 18 May 2009 10:41:58 -0400 Subject: Btrfs: reduce mount -o ssd CPU usage The block allocator in SSD mode will try to find groups of free blocks that are close together. This commit makes it loop less on a given group size before bumping it. The end result is that we are less likely to fill small holes in the available free space, but we don't waste as much CPU building the large cluster used by ssd mode. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0bc9365..2801655 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -652,7 +652,7 @@ again: last = entry; max_extent = 0; total_retries++; - if (total_retries % 256 == 0) { + if (total_retries % 64 == 0) { if (min_bytes >= (bytes + empty_size)) { ret = -ENOSPC; goto out; -- cgit v1.1 From 585ad2c3797dcaa643aeba75b9f072778adf3490 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 18 May 2009 10:41:58 -0400 Subject: Btrfs: fix metadata dirty throttling limits Once a metadata block has been written, it must be recowed, so the btrfs dirty balancing call has a check to make sure a fair amount of metadata was actually dirty before it started writing it back to disk. A previous commit had changed the dirty tracking for metadata without updating the btrfs dirty balancing checks. This commit switches it to use the correct counter. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7f5c6e3..e572cf4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2378,17 +2378,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) * looks as though older kernels can get into trouble with * this code, they end up stuck in balance_dirty_pages forever */ - struct extent_io_tree *tree; u64 num_dirty; - u64 start = 0; unsigned long thresh = 32 * 1024 * 1024; - tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; if (current->flags & PF_MEMALLOC) return; - num_dirty = count_range_bits(tree, &start, (u64)-1, - thresh, EXTENT_DIRTY); + num_dirty = root->fs_info->dirty_metadata_bytes; + if (num_dirty > thresh) { balance_dirty_pages_ratelimited_nr( root->fs_info->btree_inode->i_mapping, 1); -- cgit v1.1 From d84275c938e1a5e2dc5b89eb9b878e0ddb2c55e0 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Jun 2009 15:39:08 -0400 Subject: Btrfs: don't allow WRITE_SYNC bios to starve out regular writes Btrfs uses dedicated threads to submit bios when checksumming is on, which allows us to make sure the threads dedicated to checksumming don't get stuck waiting for requests. For each btrfs device, there are two lists of bios. One list is for WRITE_SYNC bios and the other is for regular priority bios. The IO submission threads used to process all of the WRITE_SYNC bios first and then switch to the regular bios. This commit makes sure we don't completely starve the regular bios by rotating between the two lists. WRITE_SYNC bios are still favored 2:1 over the regular bios, and this tries to run in batches to avoid seeking. Benchmarking shows this eliminates stalls during streaming buffered writes on both multi-device and single device filesystems. If the regular bios starve, the system can end up with a large amount of ram pinned down in writeback pages. If we are a little more fair between the two classes, we're able to keep throughput up and make progress on the bulk of our dirty ram. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8bc6a88..9d36181 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -163,6 +163,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) unsigned long num_sync_run; unsigned long limit; unsigned long last_waited = 0; + int force_reg = 0; bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; @@ -176,19 +177,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) loop: spin_lock(&device->io_lock); - num_run = 0; loop_lock: + num_run = 0; /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - if (device->pending_sync_bios.head) + if (!force_reg && device->pending_sync_bios.head) { pending_bios = &device->pending_sync_bios; - else + force_reg = 1; + } else { pending_bios = &device->pending_bios; + force_reg = 0; + } pending = pending_bios->head; tail = pending_bios->tail; @@ -228,10 +232,14 @@ loop_lock: while (pending) { rmb(); - if (pending_bios != &device->pending_sync_bios && - device->pending_sync_bios.head && - num_run > 16) { - cond_resched(); + /* we want to work on both lists, but do more bios on the + * sync list than the regular list + */ + if ((num_run > 32 && + pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head) || + (num_run > 64 && pending_bios == &device->pending_sync_bios && + device->pending_bios.head)) { spin_lock(&device->io_lock); requeue_list(pending_bios, pending, tail); goto loop_lock; -- cgit v1.1 From d644d8a1e30b88a93bcfb63cada2ae628462ddba Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Jun 2009 15:59:22 -0400 Subject: Btrfs: avoid IO stalls behind congested devices in a multi-device FS The btrfs IO submission threads try to service a bunch of devices with a small number of threads. They do a congestion check to try and avoid waiting on requests for a busy device. The checks make sure we've sent a few requests down to a given device just so that we aren't bouncing between busy devices without actually sending down any IO. The counter used to decide if we can switch to the next device is somewhat overloaded. It is also being used to decide if we've done a good batch of requests between the WRITE_SYNC or regular priority lists. It may get reset to zero often, leaving us hammering on a busy device instead of moving on to another disk. This commit adds a new counter for the number of bios sent while servicing a device. It doesn't get reset or fiddled with. On multi-device filesystems, this fixes IO stalls in streaming write workloads. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d36181..27d5f37 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -161,6 +161,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) int again = 0; unsigned long num_run; unsigned long num_sync_run; + unsigned long batch_run = 0; unsigned long limit; unsigned long last_waited = 0; int force_reg = 0; @@ -257,6 +258,8 @@ loop_lock: BUG_ON(atomic_read(&cur->bi_cnt) == 0); submit_bio(cur->bi_rw, cur); num_run++; + batch_run++; + if (bio_sync(cur)) num_sync_run++; @@ -273,7 +276,7 @@ loop_lock: * is now congested. Back off and let other work structs * run instead */ - if (pending && bdi_write_congested(bdi) && num_run > 16 && + if (pending && bdi_write_congested(bdi) && batch_run > 32 && fs_info->fs_devices->open_devices > 1) { struct io_context *ioc; -- cgit v1.1 From 3b30c22f64a6bb297719c60e494af1d26563f584 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Jun 2009 16:42:22 -0400 Subject: Btrfs: Add mount -o nossd This allows you to turn off the ssd mode via remount. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e9ef8c3..22855a1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, - Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_ssd, Opt_nossd, Opt_thread_pool, Opt_noacl, Opt_compress, + Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,6 +83,7 @@ static match_table_t tokens = { {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, + {Opt_nossd, "nossd"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, {Opt_flushoncommit, "flushoncommit"}, @@ -173,6 +174,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); break; + case Opt_nossd: + printk(KERN_INFO "btrfs: not using ssd allocation scheme\n"); + btrfs_clear_opt(info->mount_opt, SSD); + break; case Opt_nobarrier: printk(KERN_INFO "btrfs: turning off barriers\n"); btrfs_set_opt(info->mount_opt, NOBARRIER); -- cgit v1.1 From c604480171c510c1beeb81b82418e5bc4de8f1ae Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Jun 2009 18:35:15 -0400 Subject: Btrfs: avoid allocation clusters that are too spread out In SSD mode for data, and all the time for metadata the allocator will try to find a cluster of nearby blocks for allocations. This commit adds extra checks to make sure that each free block in the cluster is close to the last one. Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2801655..ac23476 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -645,7 +645,8 @@ again: * we haven't filled the empty size and the window is * very large. reset and try again */ - if (next->offset - window_start > (bytes + empty_size) * 2) { + if (next->offset - (last->offset + last->bytes) > 128 * 1024 || + next->offset - window_start > (bytes + empty_size) * 2) { entry = next; window_start = entry->offset; window_free = entry->bytes; -- cgit v1.1 From 451d7585a8bb1b9bec0d676ce3dece1923164e55 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 9 Jun 2009 20:28:34 -0400 Subject: Btrfs: add mount -o ssd_spread to spread allocations out Some SSDs perform best when reusing block numbers often, while others perform much better when clustering strictly allocates big chunks of unused space. The default mount -o ssd will find rough groupings of blocks where there are a bunch of free blocks that might have some allocated blocks mixed in. mount -o ssd_spread will make sure there are no allocated blocks mixed in. It should perform better on lower end SSDs. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/extent-tree.c | 2 +- fs/btrfs/free-space-cache.c | 5 ++++- fs/btrfs/free-space-cache.h | 1 + fs/btrfs/super.c | 19 +++++++++++++++---- 5 files changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ce3ab4e..b9d8788 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1100,6 +1100,7 @@ struct btrfs_root { #define BTRFS_MOUNT_COMPRESS (1 << 5) #define BTRFS_MOUNT_NOTREELOG (1 << 6) #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) +#define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a42419c..3355d7e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3607,7 +3607,7 @@ refill_cluster: last_ptr_loop = 0; /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(trans, + ret = btrfs_find_space_cluster(trans, root, block_group, last_ptr, offset, num_bytes, empty_cluster + empty_size); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index ac23476..4538e48 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -579,6 +579,7 @@ out: * it returns -enospc */ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -595,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, int ret; /* for metadata, allow allocates with more holes */ - if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + if (btrfs_test_opt(root, SSD_SPREAD)) { + min_bytes = bytes + empty_size; + } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { /* * we want to do larger allocations when we are * flushing out the delayed refs, it helps prevent diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ab0bdc0..266fb87 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 22855a1..7f5b288 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_nossd, Opt_thread_pool, Opt_noacl, Opt_compress, - Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, + Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, + Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,6 +83,7 @@ static match_table_t tokens = { {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, + {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, {Opt_noacl, "noacl"}, {Opt_notreelog, "notreelog"}, @@ -174,9 +175,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); break; + case Opt_ssd_spread: + printk(KERN_INFO "btrfs: use spread ssd " + "allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + btrfs_set_opt(info->mount_opt, SSD_SPREAD); + break; case Opt_nossd: - printk(KERN_INFO "btrfs: not using ssd allocation scheme\n"); + printk(KERN_INFO "btrfs: not using ssd allocation " + "scheme\n"); btrfs_clear_opt(info->mount_opt, SSD); + btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; case Opt_nobarrier: printk(KERN_INFO "btrfs: turning off barriers\n"); @@ -429,7 +438,9 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(root, COMPRESS)) seq_puts(seq, ",compress"); - if (btrfs_test_opt(root, SSD)) + if (btrfs_test_opt(root, SSD_SPREAD)) + seq_puts(seq, ",ssd_spread"); + else if (btrfs_test_opt(root, SSD)) seq_puts(seq, ",ssd"); if (btrfs_test_opt(root, NOTREELOG)) seq_puts(seq, ",notreelog"); -- cgit v1.1 From c289811cc096c57ff35550ee8132793a4f9b5b59 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 10 Jun 2009 09:51:32 -0400 Subject: Btrfs: autodetect SSD devices During mount, btrfs will check the queue nonrot flag for all the devices found in the FS. If they are all non-rotating, SSD mode is enabled by default. If the FS was mounted with -o nossd, the non-rotating flag is ignored. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 9 +++++++++ fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 6 ++++++ fs/btrfs/volumes.h | 5 +++++ 5 files changed, 24 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b9d8788..5fa7d7d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1101,6 +1101,7 @@ struct btrfs_root { #define BTRFS_MOUNT_NOTREELOG (1 << 6) #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) +#define BTRFS_MOUNT_NOSSD (1 << 9) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e572cf4..f4dfbb7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1850,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; + if (!btrfs_test_opt(tree_root, SSD) && + !btrfs_test_opt(tree_root, NOSSD) && + !fs_info->fs_devices->rotating) { + printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD " + "mode\n"); + btrfs_set_opt(fs_info->mount_opt, SSD); + } + if (btrfs_super_log_root(disk_super) != 0) { u64 bytenr = btrfs_super_log_root(disk_super); @@ -1893,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); if (!fs_info->fs_root) goto fail_trans_kthread; + return tree_root; fail_trans_kthread: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7f5b288..3427db2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -184,6 +184,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_nossd: printk(KERN_INFO "btrfs: not using ssd allocation " "scheme\n"); + btrfs_set_opt(info->mount_opt, NOSSD); btrfs_clear_opt(info->mount_opt, SSD); btrfs_clear_opt(info->mount_opt, SSD_SPREAD); break; @@ -438,6 +439,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); if (btrfs_test_opt(root, COMPRESS)) seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, NOSSD)) + seq_puts(seq, ",nossd"); if (btrfs_test_opt(root, SSD_SPREAD)) seq_puts(seq, ",ssd_spread"); else if (btrfs_test_opt(root, SSD)) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 27d5f37..3f4a593 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -605,6 +605,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, device->in_fs_metadata = 0; device->mode = flags; + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + fs_devices->rotating = 1; + fs_devices->open_devices++; if (device->writeable) { fs_devices->rw_devices++; @@ -1473,6 +1476,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + if (!blk_queue_nonrot(bdev_get_queue(bdev))) + root->fs_info->fs_devices->rotating = 1; + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); btrfs_set_super_total_bytes(&root->fs_info->super_copy, total_bytes + device->total_bytes); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 5c3ff6d..3c1f731 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -107,6 +107,11 @@ struct btrfs_fs_devices { int seeding; int opened; + + /* set when we find or add a device that doesn't have the + * nonrot flag set + */ + int rotating; }; struct btrfs_bio_stripe { -- cgit v1.1 From 6cbff00f4632c8060b06bfc9585805217f11e12e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Apr 2009 10:37:41 +0200 Subject: Btrfs: implement FS_IOC_GETFLAGS/SETFLAGS/GETVERSION Add support for the standard attributes set via chattr and read via lsattr. Currently we store the attributes in the flags value in the btrfs inode, but I wonder whether we should split it into two so that we don't have to keep converting between the two formats. Remove the btrfs_clear_flag/btrfs_set_flag/btrfs_test_flag macros as they were confusing the existing code and got in the way of the new additions. Also add the FS_IOC_GETVERSION ioctl for getting i_generation as it's trivial. Signed-off-by: Christoph Hellwig Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 1 - fs/btrfs/compression.c | 6 +- fs/btrfs/ctree.h | 16 +++-- fs/btrfs/inode.c | 27 ++++---- fs/btrfs/ioctl.c | 171 +++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 200 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ecf5f7d..acb4f35 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -157,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size) BTRFS_I(inode)->disk_i_size = size; } - #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index ab07627..de1e2fd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode, u32 csum; u32 *cb_sum = &cb->sums; - if (btrfs_test_flag(inode, NODATASUM)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; for (i = 0; i < cb->nr_pages; i++) { @@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ atomic_inc(&cb->pending_bios); - if (!btrfs_test_flag(inode, NODATASUM)) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_lookup_bio_sums(root, inode, comp_bio, sums); } @@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); BUG_ON(ret); - if (!btrfs_test_flag(inode, NODATASUM)) + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) btrfs_lookup_bio_sums(root, inode, comp_bio, sums); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5fa7d7d..4d6e0b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1115,12 +1115,14 @@ struct btrfs_root { #define BTRFS_INODE_READONLY (1 << 2) #define BTRFS_INODE_NOCOMPRESS (1 << 3) #define BTRFS_INODE_PREALLOC (1 << 4) -#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ - ~BTRFS_INODE_##flag) -#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ - BTRFS_INODE_##flag) -#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ - BTRFS_INODE_##flag) +#define BTRFS_INODE_SYNC (1 << 5) +#define BTRFS_INODE_IMMUTABLE (1 << 6) +#define BTRFS_INODE_APPEND (1 << 7) +#define BTRFS_INODE_NODUMP (1 << 8) +#define BTRFS_INODE_NOATIME (1 << 9) +#define BTRFS_INODE_DIRSYNC (1 << 10) + + /* some macros to generate set/get funcs for the struct fields. This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: @@ -2260,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void btrfs_update_iflags(struct inode *inode); +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); /* file.c */ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 917bf10..5b68330 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -368,7 +368,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (!btrfs_test_flag(inode, NOCOMPRESS) && + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && btrfs_test_opt(root, COMPRESS)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); @@ -469,7 +469,7 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - btrfs_set_flag(inode, NOCOMPRESS); + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -862,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (btrfs_test_flag(inode, NOCOMPRESS)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -1133,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, int ret; struct btrfs_root *root = BTRFS_I(inode)->root; - if (btrfs_test_flag(inode, NODATACOW)) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - else if (btrfs_test_flag(inode, PREALLOC)) + else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); else if (!btrfs_test_opt(root, COMPRESS)) @@ -1290,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, int ret = 0; int skip_sum; - skip_sum = btrfs_test_flag(inode, NODATASUM); + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); @@ -1790,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, ClearPageChecked(page); goto good; } - if (btrfs_test_flag(inode, NODATASUM)) + + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && @@ -2156,6 +2157,8 @@ static void btrfs_read_locked_inode(struct inode *inode) init_special_inode(inode, inode->i_mode, rdev); break; } + + btrfs_update_iflags(inode); return; make_bad: @@ -3586,9 +3589,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_find_block_group(root, 0, alloc_hint, owner); if ((mode & S_IFREG)) { if (btrfs_test_opt(root, NODATASUM)) - btrfs_set_flag(inode, NODATASUM); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(root, NODATACOW)) - btrfs_set_flag(inode, NODATACOW); + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; } key[0].objectid = objectid; @@ -3642,6 +3645,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location->offset = 0; btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + btrfs_inherit_iflags(inode, dir); + insert_inode_hash(inode); inode_tree_add(inode); return inode; @@ -5075,7 +5080,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, out: if (cur_offset > start) { inode->i_ctime = CURRENT_TIME; - btrfs_set_flag(inode, PREALLOC); + BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && cur_offset > i_size_read(inode)) btrfs_i_size_write(inode, cur_offset); @@ -5196,7 +5201,7 @@ static int btrfs_set_page_dirty(struct page *page) static int btrfs_permission(struct inode *inode, int mask) { - if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) return -EACCES; return generic_permission(inode, mask, btrfs_check_acl); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 54dfd45..926332a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,7 +50,172 @@ #include "volumes.h" #include "locking.h" +/* Mask out flags that are inappropriate for the given type of inode. */ +static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & ~FS_DIRSYNC_FL; + else + return flags & (FS_NODUMP_FL | FS_NOATIME_FL); +} + +/* + * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl. + */ +static unsigned int btrfs_flags_to_ioctl(unsigned int flags) +{ + unsigned int iflags = 0; + + if (flags & BTRFS_INODE_SYNC) + iflags |= FS_SYNC_FL; + if (flags & BTRFS_INODE_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (flags & BTRFS_INODE_APPEND) + iflags |= FS_APPEND_FL; + if (flags & BTRFS_INODE_NODUMP) + iflags |= FS_NODUMP_FL; + if (flags & BTRFS_INODE_NOATIME) + iflags |= FS_NOATIME_FL; + if (flags & BTRFS_INODE_DIRSYNC) + iflags |= FS_DIRSYNC_FL; + + return iflags; +} + +/* + * Update inode->i_flags based on the btrfs internal flags. + */ +void btrfs_update_iflags(struct inode *inode) +{ + struct btrfs_inode *ip = BTRFS_I(inode); + + inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + + if (ip->flags & BTRFS_INODE_SYNC) + inode->i_flags |= S_SYNC; + if (ip->flags & BTRFS_INODE_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + if (ip->flags & BTRFS_INODE_APPEND) + inode->i_flags |= S_APPEND; + if (ip->flags & BTRFS_INODE_NOATIME) + inode->i_flags |= S_NOATIME; + if (ip->flags & BTRFS_INODE_DIRSYNC) + inode->i_flags |= S_DIRSYNC; +} + +/* + * Inherit flags from the parent inode. + * + * Unlike extN we don't have any flags we don't want to inherit currently. + */ +void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags = BTRFS_I(dir)->flags; + + if (S_ISREG(inode->i_mode)) + flags &= ~BTRFS_INODE_DIRSYNC; + else if (!S_ISDIR(inode->i_mode)) + flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); + + BTRFS_I(inode)->flags = flags; + btrfs_update_iflags(inode); +} + +static int btrfs_ioctl_getflags(struct file *file, void __user *arg) +{ + struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode); + unsigned int flags = btrfs_flags_to_ioctl(ip->flags); + + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct btrfs_inode *ip = BTRFS_I(inode); + struct btrfs_root *root = ip->root; + struct btrfs_trans_handle *trans; + unsigned int flags, oldflags; + int ret; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL | FS_DIRSYNC_FL)) + return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EACCES; + + mutex_lock(&inode->i_mutex); + + flags = btrfs_mask_flags(inode->i_mode, flags); + oldflags = btrfs_flags_to_ioctl(ip->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + ret = -EPERM; + goto out_unlock; + } + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out_unlock; + + if (flags & FS_SYNC_FL) + ip->flags |= BTRFS_INODE_SYNC; + else + ip->flags &= ~BTRFS_INODE_SYNC; + if (flags & FS_IMMUTABLE_FL) + ip->flags |= BTRFS_INODE_IMMUTABLE; + else + ip->flags &= ~BTRFS_INODE_IMMUTABLE; + if (flags & FS_APPEND_FL) + ip->flags |= BTRFS_INODE_APPEND; + else + ip->flags &= ~BTRFS_INODE_APPEND; + if (flags & FS_NODUMP_FL) + ip->flags |= BTRFS_INODE_NODUMP; + else + ip->flags &= ~BTRFS_INODE_NODUMP; + if (flags & FS_NOATIME_FL) + ip->flags |= BTRFS_INODE_NOATIME; + else + ip->flags &= ~BTRFS_INODE_NOATIME; + if (flags & FS_DIRSYNC_FL) + ip->flags |= BTRFS_INODE_DIRSYNC; + else + ip->flags &= ~BTRFS_INODE_DIRSYNC; + + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + + btrfs_update_iflags(inode); + inode->i_ctime = CURRENT_TIME; + btrfs_end_transaction(trans, root); + + mnt_drop_write(file->f_path.mnt); + out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +static int btrfs_ioctl_getversion(struct file *file, int __user *arg) +{ + struct inode *inode = file->f_path.dentry->d_inode; + + return put_user(inode->i_generation, arg); +} static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, @@ -1077,6 +1242,12 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { + case FS_IOC_GETFLAGS: + return btrfs_ioctl_getflags(file, argp); + case FS_IOC_SETFLAGS: + return btrfs_ioctl_setflags(file, argp); + case FS_IOC_GETVERSION: + return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0); case BTRFS_IOC_SUBVOL_CREATE: -- cgit v1.1 From 163e783e6a8b1e8bcb4c9084d438091386b589df Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sun, 19 Apr 2009 13:02:41 +0100 Subject: Btrfs: remove crc32c.h and use libcrc32c directly. There's no need to preserve this abstraction; it used to let us use hardware crc32c support directly, but libcrc32c is already doing that for us through the crypto API -- so we're already using the Intel crc32c acceleration where appropriate. Signed-off-by: David Woodhouse Signed-off-by: Chris Mason --- fs/btrfs/crc32c.h | 29 ----------------------------- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 7 +++---- fs/btrfs/hash.h | 4 ++-- 4 files changed, 7 insertions(+), 37 deletions(-) delete mode 100644 fs/btrfs/crc32c.h (limited to 'fs') diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h deleted file mode 100644 index 6e1b3de..0000000 --- a/fs/btrfs/crc32c.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License v2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef __BTRFS_CRC32C__ -#define __BTRFS_CRC32C__ -#include - -/* - * this file used to do more for selecting the HW version of crc32c, - * perhaps it will one day again soon. - */ -#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) -#endif - diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f4dfbb7..6c54c21 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -26,8 +26,8 @@ #include #include #include +#include #include "compat.h" -#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -171,7 +171,7 @@ out: u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) { - return btrfs_crc32c(seed, data, len); + return crc32c(seed, data, len); } void btrfs_csum_final(u32 crc, char *result) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3355d7e..33a65f2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -23,7 +23,6 @@ #include #include "compat.h" #include "hash.h" -#include "crc32c.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" @@ -625,11 +624,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h index 2a020b2..db2ff97 100644 --- a/fs/btrfs/hash.h +++ b/fs/btrfs/hash.h @@ -19,9 +19,9 @@ #ifndef __HASH__ #define __HASH__ -#include "crc32c.h" +#include static inline u64 btrfs_name_hash(const char *name, int len) { - return btrfs_crc32c((u32)~1, name, len); + return crc32c((u32)~1, name, len); } #endif -- cgit v1.1 From 524724ed1f224875a117be593540591ed050c73d Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Wed, 10 Jun 2009 11:13:17 -0400 Subject: Btrfs: fdatasync should skip metadata writeout In btrfs, fdatasync and fsync are identical, but fdatasync should skip committing transaction when inode->i_state is set just I_DIRTY_SYNC and this indicates only atime or/and mtime updates. Following patch improves fdatasync throughput. --file-block-size=4K --file-total-size=16G --file-test-mode=rndwr --file-fsync-mode=fdatasync run Results: -2.6.30-rc8 Test execution summary: total time: 1980.6540s total number of events: 10001 total time taken by event execution: 1192.9804 per-request statistics: min: 0.0000s avg: 0.1193s max: 15.3720s approx. 95 percentile: 0.7257s Threads fairness: events (avg/stddev): 625.0625/151.32 execution time (avg/stddev): 74.5613/9.46 -2.6.30-rc8-patched Test execution summary: total time: 1695.9118s total number of events: 10000 total time taken by event execution: 871.3214 per-request statistics: min: 0.0000s avg: 0.0871s max: 10.4644s approx. 95 percentile: 0.4787s Threads fairness: events (avg/stddev): 625.0000/131.86 execution time (avg/stddev): 54.4576/8.98 Signed-off-by: Hisashi Hifumi Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0726a73..126477e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1157,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_wait_ordered_range(inode, 0, (u64)-1); root->log_batch++; + if (datasync && !(inode->i_state & I_DIRTY_PAGES)) + goto out; /* * ok we haven't committed the transaction yet, lets do a commit */ -- cgit v1.1 From 7df336ec1266dccbb253bac52c529d3dcc7c22d0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Jun 2009 11:36:43 -0400 Subject: Fix btrfs when ACLs are configured out ... otherwise generic_permission() will allow *anything* for all files you don't own and that have some group permissions. Signed-off-by: Al Viro Signed-off-by: Chris Mason --- fs/btrfs/acl.c | 5 ----- fs/btrfs/ctree.h | 4 ++++ 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index cbba000..6039725 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -351,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) return 0; } -int btrfs_check_acl(struct inode *inode, int mask) -{ - return 0; -} - #endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4d6e0b6..03441a9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2301,7 +2301,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options); int btrfs_sync_fs(struct super_block *sb, int wait); /* acl.c */ +#ifdef CONFIG_FS_POSIX_ACL int btrfs_check_acl(struct inode *inode, int mask); +#else +#define btrfs_check_acl NULL +#endif int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); -- cgit v1.1 From 58f7f68f228c3aba2ba4468d92e2cec35724ba2e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Jun 2009 09:57:55 -0400 Subject: cifs: add addr= mount option alias for ip= When you look in /proc/mounts, the address of the server gets displayed as "addr=". That's really a better option to use anyway since it's more generic. What if we eventually want to support non-IP transports? It also makes CIFS option consistent with the NFS option of the same name. Begin the migration to that option name by adding an alias for ip= called addr=. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 10151f8..6298dc3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -958,7 +958,8 @@ cifs_parse_mount_options(char *options, const char *devname, } strcpy(vol->password, value); } - } else if (strnicmp(data, "ip", 2) == 0) { + } else if (!strnicmp(data, "ip", 2) || + !strnicmp(data, "addr", 4)) { if (!value || !*value) { vol->UNCip = NULL; } else if (strnlen(value, INET6_ADDRSTRLEN) < -- cgit v1.1 From a41f20716975910d9beb90b7efc61107901492b8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 10 Jun 2009 14:22:55 -0400 Subject: ext4: Avoid corrupting the uninitialized bit in the extent during truncate The unitialized bit was not properly getting preserved in in an extent which is partially truncated because the it was geting set to the value of the first extent to be removed or truncated as part of the truncate operation, and if there are multiple extents are getting removed or modified as part of the truncate operation, it is only the last extent which will might be partially truncated, and its uninitalized bit is not necessarily the same as the first extent to be truncated. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9c35a7b..2593f74 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2083,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex = EXT_LAST_EXTENT(eh); ex_ee_block = le32_to_cpu(ex->ee_block); - if (ext4_ext_is_uninitialized(ex)) - uninitialized = 1; ex_ee_len = ext4_ext_get_actual_len(ex); while (ex >= EXT_FIRST_EXTENT(eh) && ex_ee_block + ex_ee_len > start) { + + if (ext4_ext_is_uninitialized(ex)) + uninitialized = 1; + else + uninitialized = 0; + ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); path[depth].p_ext = ex; -- cgit v1.1 From 61b6bc525a34931bb73e4c95bfe009cd9572a288 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 10 Jun 2009 10:04:58 -0400 Subject: cifs: remove never-used in6_addr option This option was never used to my knowledge. Remove it before someone does... Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6298dc3..97f4311 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -62,7 +62,6 @@ struct smb_vol { char *domainname; char *UNC; char *UNCip; - char *in6_addr; /* ipv6 address as human readable form of in6_addr */ char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[16]; /* netbios name of client */ char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */ @@ -1320,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname, vol->direct_io = 1; } else if (strnicmp(data, "forcedirectio", 13) == 0) { vol->direct_io = 1; - } else if (strnicmp(data, "in6_addr", 8) == 0) { - if (!value || !*value) { - vol->in6_addr = NULL; - } else if (strnlen(value, 49) == 48) { - vol->in6_addr = value; - } else { - printk(KERN_WARNING "CIFS: ip v6 address not " - "48 characters long\n"); - return 1; - } } else if (strnicmp(data, "noac", 4) == 0) { printk(KERN_WARNING "CIFS: Mount option noac not " "supported. Instead set " -- cgit v1.1 From e5e9a5206a171b2c467e494aebcdcf70c47289bc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 10 Jun 2009 15:17:02 -0400 Subject: Btrfs: avoid races between super writeout and device list updates On multi-device filesystems, btrfs writes supers to all of the devices before considering a sync complete. There wasn't any additional locking between super writeout and the device list management code because device management was done inside a transaction and super writeout only happened with no transation writers running. With the btrfs fsync log and other async transaction updates, this has been racey for some time. This adds a mutex to protect the device list. The existing volume mutex could not be reused due to transaction lock ordering requirements. Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 6 +++++- fs/btrfs/volumes.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/volumes.h | 7 ++++++- 3 files changed, 45 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6c54c21..b7ddc77 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2111,7 +2111,7 @@ static int write_dev_supers(struct btrfs_device *device, int write_all_supers(struct btrfs_root *root, int max_mirrors) { - struct list_head *head = &root->fs_info->fs_devices->devices; + struct list_head *head; struct btrfs_device *dev; struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; @@ -2126,6 +2126,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) sb = &root->fs_info->super_for_commit; dev_item = &sb->dev_item; + + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + head = &root->fs_info->fs_devices->devices; list_for_each_entry(dev, head, dev_list) { if (!dev->bdev) { total_errors++; @@ -2169,6 +2172,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors) if (ret) total_errors++; } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (total_errors > max_errors) { printk(KERN_ERR "btrfs: %d errors while writing supers\n", total_errors); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3f4a593..3ab80e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -377,6 +377,7 @@ static noinline int device_list_add(const char *path, memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->latest_devid = devid; fs_devices->latest_trans = found_transid; + mutex_init(&fs_devices->device_list_mutex); device = NULL; } else { device = __find_device(&fs_devices->devices, devid, @@ -403,7 +404,11 @@ static noinline int device_list_add(const char *path, return -ENOMEM; } INIT_LIST_HEAD(&device->dev_alloc_list); + + mutex_lock(&fs_devices->device_list_mutex); list_add(&device->dev_list, &fs_devices->devices); + mutex_unlock(&fs_devices->device_list_mutex); + device->fs_devices = fs_devices; fs_devices->num_devices++; } @@ -429,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) INIT_LIST_HEAD(&fs_devices->devices); INIT_LIST_HEAD(&fs_devices->alloc_list); INIT_LIST_HEAD(&fs_devices->list); + mutex_init(&fs_devices->device_list_mutex); fs_devices->latest_devid = orig->latest_devid; fs_devices->latest_trans = orig->latest_trans; memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + mutex_lock(&orig->device_list_mutex); list_for_each_entry(orig_dev, &orig->devices, dev_list) { device = kzalloc(sizeof(*device), GFP_NOFS); if (!device) @@ -454,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } + mutex_unlock(&orig->device_list_mutex); return fs_devices; error: + mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -466,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) mutex_lock(&uuid_mutex); again: + mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) continue; @@ -485,6 +495,7 @@ again: kfree(device->name); kfree(device); } + mutex_unlock(&fs_devices->device_list_mutex); if (fs_devices->seed) { fs_devices = fs_devices->seed; @@ -1135,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device = NULL; devices = &root->fs_info->fs_devices->devices; + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_for_each_entry(tmp, devices, dev_list) { if (tmp->in_fs_metadata && !tmp->bdev) { device = tmp; break; } } + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); bdev = NULL; bh = NULL; disk_super = NULL; @@ -1195,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto error_brelse; device->in_fs_metadata = 0; + + /* + * the device list mutex makes sure that we don't change + * the device list while someone else is writing out all + * the device supers. + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_del_init(&device->dev_list); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + device->fs_devices->num_devices--; next_device = list_entry(root->fs_info->fs_devices->devices.next, @@ -1289,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, seed_devices->opened = 1; INIT_LIST_HEAD(&seed_devices->devices); INIT_LIST_HEAD(&seed_devices->alloc_list); + mutex_init(&seed_devices->device_list_mutex); list_splice_init(&fs_devices->devices, &seed_devices->devices); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); list_for_each_entry(device, &seed_devices->devices, dev_list) { @@ -1414,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) mutex_lock(&root->fs_info->volume_mutex); devices = &root->fs_info->fs_devices->devices; + /* + * we have the volume lock, so we don't need the extra + * device list mutex while reading the list here. + */ list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; @@ -1468,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) } device->fs_devices = root->fs_info->fs_devices; + + /* + * we don't want write_supers to jump in here with our device + * half setup + */ + mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_add(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -1486,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); btrfs_set_super_num_devices(&root->fs_info->super_copy, total_bytes + 1); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { ret = init_first_rw_device(trans, root, device); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 3c1f731..5139a83 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -96,7 +96,12 @@ struct btrfs_fs_devices { u64 rw_devices; u64 total_rw_bytes; struct block_device *latest_bdev; - /* all of the devices in the FS */ + + /* all of the devices in the FS, protected by a mutex + * so we can safely walk it to write out the supers without + * worrying about add/remove by the multi-device code + */ + struct mutex device_list_mutex; struct list_head devices; /* devices not currently being allocated */ -- cgit v1.1 From 4eedeb75e7f15ffdb12d1ad559b565e7505bdbaf Mon Sep 17 00:00:00 2001 From: Hisashi Hifumi Date: Wed, 10 Jun 2009 15:28:55 -0400 Subject: Btrfs: pin buffers during write_dev_supers write_dev_supers is called in sequence. First is it called with wait == 0, which starts IO on all of the super blocks for a given device. Then it is called with wait == 1 to make sure they all reach the disk. It doesn't currently pin the buffers between the two calls, and it also assumes the buffers won't go away between the two calls, leading to an oops if the VM manages to free the buffers in the middle of the sync. This fixes that assumption and updates the code to return an error if things are not up to date when the wait == 1 run is done. Signed-off-by: Hisashi Hifumi Signed-off-by: Chris Mason --- fs/btrfs/disk-io.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b7ddc77..0d50d49 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2020,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) return latest; } +/* + * this should be called twice, once with wait == 0 and + * once with wait == 1. When wait == 0 is done, all the buffer heads + * we write are pinned. + * + * They are released when wait == 1 is done. + * max_mirrors must be the same for both runs, and it indicates how + * many supers on this one device should be written. + * + * max_mirrors == 0 means to write them all. + */ static int write_dev_supers(struct btrfs_device *device, struct btrfs_super_block *sb, int do_barriers, int wait, int max_mirrors) @@ -2055,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device, bh = __find_get_block(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); BUG_ON(!bh); - brelse(bh); wait_on_buffer(bh); - if (buffer_uptodate(bh)) { - brelse(bh); - continue; - } + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the wait == 0 run */ + brelse(bh); + continue; } else { btrfs_set_super_bytenr(sb, bytenr); @@ -2071,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device, BTRFS_CSUM_SIZE); btrfs_csum_final(crc, sb->csum); + /* + * one reference for us, and we leave it for the + * caller + */ bh = __getblk(device->bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); - set_buffer_uptodate(bh); + /* one reference for submit_bh */ get_bh(bh); + + set_buffer_uptodate(bh); lock_buffer(bh); bh->b_end_io = btrfs_end_buffer_write_sync; } @@ -2088,6 +2109,7 @@ static int write_dev_supers(struct btrfs_device *device, device->name); set_buffer_uptodate(bh); device->barriers = 0; + /* one reference for submit_bh */ get_bh(bh); lock_buffer(bh); ret = submit_bh(WRITE_SYNC, bh); @@ -2096,15 +2118,8 @@ static int write_dev_supers(struct btrfs_device *device, ret = submit_bh(WRITE_SYNC, bh); } - if (!ret && wait) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - errors++; - } else if (ret) { + if (ret) errors++; - } - if (wait) - brelse(bh); } return errors < i ? 0 : -1; } -- cgit v1.1 From 0e0c62123b517d2b3c26922342c0cc5bb63a93f8 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 10 Jun 2009 12:57:07 -0700 Subject: fs/bio.c: add missing __user annotation As reported by sparse: fs/bio.c:720:13: warning: incorrect type in assignment (different address spaces) fs/bio.c:720:13: expected char *iov_addr fs/bio.c:720:13: got void [noderef] * fs/bio.c:724:36: warning: incorrect type in argument 2 (different address spaces) fs/bio.c:724:36: expected void const [noderef] *from fs/bio.c:724:36: got char *iov_addr Signed-off-by: Michal Simek Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index ab423a1..533266a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -722,7 +722,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, while (bv_len && iov_idx < iov_count) { unsigned int bytes; - char *iov_addr; + char __user *iov_addr; bytes = min_t(unsigned int, iov[iov_idx].iov_len - iov_off, bv_len); -- cgit v1.1 From fd0fb038d5a308c7faddd1701be5e70aaffec98b Mon Sep 17 00:00:00 2001 From: Shin Hong Date: Wed, 10 Jun 2009 20:11:29 -0400 Subject: Btrfs: init worker struct fields before kthread-run This patch fixes a bug which may result race condition between btrfs_start_workers() and worker_loop(). btrfs_start_workers() executed in a parent thread writes on workers->worker and worker_loop() in a child thread reads workers->worker. However, there is no synchronization enforcing the order of two operations. This patch makes btrfs_start_workers() fill workers->worker before it starts a child thread with worker_loop() Signed-off-by: Chris Mason --- fs/btrfs/async-thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 502c3d6..7f88628 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); + worker->workers = workers; worker->task = kthread_run(worker_loop, worker, "btrfs-%s-%d", workers->name, workers->num_workers + i); - worker->workers = workers; if (IS_ERR(worker->task)) { kfree(worker); ret = PTR_ERR(worker->task); -- cgit v1.1 From 85d4198e40c289dd623cecd16601fa613559bed7 Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 11 Jun 2009 08:51:10 -0400 Subject: Btrfs: check duplicate backrefs for both data and metadata lookup_inline_extent_backref only checks for duplicate backref for data extents. It assumes backrefs for tree block never conflict. This patch makes lookup_inline_extent_backref check for duplicate backrefs for both data and tree block, so that we can detect potential bug earlier. This is a safety check, strictly speaking it is not required. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 33a65f2..edc7d20 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1056,8 +1056,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, want = extent_ref_type(parent, owner); if (insert) { extra_size = btrfs_extent_inline_ref_size(want); - if (owner >= BTRFS_FIRST_FREE_OBJECTID) - path->keep_locks = 1; + path->keep_locks = 1; } else extra_size = -1; ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); @@ -1087,12 +1086,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, #endif BUG_ON(item_size < sizeof(*ei)); - if (owner < BTRFS_FIRST_FREE_OBJECTID && insert && - item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; - goto out; - } - ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); flags = btrfs_extent_flags(leaf, ei); @@ -1165,15 +1158,15 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, * For simplicity, we just do not add new inline back * ref if there is any kind of item for this block */ - if (owner >= BTRFS_FIRST_FREE_OBJECTID && - find_next_key(path, &key) == 0 && key.objectid == bytenr) { + if (find_next_key(path, &key) == 0 && key.objectid == bytenr && + key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { err = -EAGAIN; goto out; } } *ref_ret = (struct btrfs_extent_inline_ref *)ptr; out: - if (insert && owner >= BTRFS_FIRST_FREE_OBJECTID) { + if (insert) { path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); } -- cgit v1.1 From 067c28adc53807514ac0c6ebb6af3243cbd071fa Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 11 Jun 2009 09:30:13 -0400 Subject: Btrfs: fix -o nodatasum printk spelling It was printing nodatacsum, which was not the correct option name. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3427db2..708ac06 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -159,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ break; case Opt_nodatasum: - printk(KERN_INFO "btrfs: setting nodatacsum\n"); + printk(KERN_INFO "btrfs: setting nodatasum\n"); btrfs_set_opt(info->mount_opt, NODATASUM); break; case Opt_nodatacow: -- cgit v1.1 From 0b4dcea579a1b6f4d249d61f5bc8adeaa7c895d8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 11 Jun 2009 11:13:35 -0400 Subject: Btrfs: fix oops when btrfs_inherit_iflags called with a NULL dir This happens during subvol creation. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 926332a..eff18f5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -112,7 +112,12 @@ void btrfs_update_iflags(struct inode *inode) */ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { - unsigned int flags = BTRFS_I(dir)->flags; + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; if (S_ISREG(inode->i_mode)) flags &= ~BTRFS_INODE_DIRSYNC; -- cgit v1.1 From b263c2c8bf13c273485bd99dbbeba79c844409dd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 11 Jun 2009 11:24:47 -0400 Subject: Btrfs: fix extent_buffer leak during tree log replay During tree log replay, we read in the tree log roots, process them and then free them. A recent change takes an extra reference on the root node of the tree when the root is read in, and stores that reference in root->commit_root. This reference was not being freed, leaving us with one buffer pinned in ram for each subvol with a tree log root after a crash. Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 2b41fc0..c139222 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3019,6 +3019,7 @@ again: key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); + free_extent_buffer(log->commit_root); kfree(log); if (found_key.offset == 0) -- cgit v1.1 From 93d5581e20600593ec3236921b6620225fb76034 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Jun 2009 14:03:55 +0100 Subject: devpts: unregister the file system on error Closes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13429 Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- fs/devpts/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c68edb9..9b1d285 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -557,8 +557,10 @@ static int __init init_devpts_fs(void) int err = register_filesystem(&devpts_fs_type); if (!err) { devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) + if (IS_ERR(devpts_mnt)) { err = PTR_ERR(devpts_mnt); + unregister_filesystem(&devpts_fs_type); + } } return err; } -- cgit v1.1 From 2e1483c995bbd0fa6cbd055ad76088a520799ba4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 11 Jun 2009 13:24:13 +0100 Subject: kmemleak: Remove some of the kmemleak false positives There are allocations for which the main pointer cannot be found but they are not memory leaks. This patch fixes some of them. For more information on false positives, see Documentation/kmemleak.txt. Signed-off-by: Catalin Marinas --- fs/block_dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index f45dbc1..d250f80 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "internal.h" @@ -492,6 +493,11 @@ void __init bdev_cache_init(void) bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); + /* + * This vfsmount structure is only used to obtain the + * blockdev_superblock, so tell kmemleak not to report it. + */ + kmemleak_not_leak(bd_mnt); blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } -- cgit v1.1 From 90586523eb4b349806887c62ee70685a49415124 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:20 -0400 Subject: fsnotify: unified filesystem notification backend fsnotify is a backend for filesystem notification. fsnotify does not provide any userspace interface but does provide the basis needed for other notification schemes such as dnotify. fsnotify can be extended to be the backend for inotify or the upcoming fanotify. fsnotify provides a mechanism for "groups" to register for some set of filesystem events and to then deliver those events to those groups for processing. fsnotify has a number of benefits, the first being actually shrinking the size of an inode. Before fsnotify to support both dnotify and inotify an inode had unsigned long i_dnotify_mask; /* Directory notify events */ struct dnotify_struct *i_dnotify; /* for directory notifications */ struct list_head inotify_watches; /* watches on this inode */ struct mutex inotify_mutex; /* protects the watches list But with fsnotify this same functionallity (and more) is done with just __u32 i_fsnotify_mask; /* all events for this inode */ struct hlist_head i_fsnotify_mark_entries; /* marks on this inode */ That's right, inotify, dnotify, and fanotify all in 64 bits. We used that much space just in inotify_watches alone, before this patch set. fsnotify object lifetime and locking is MUCH better than what we have today. inotify locking is incredibly complex. See 8f7b0ba1c8539 as an example of what's been busted since inception. inotify needs to know internal semantics of superblock destruction and unmounting to function. The inode pinning and vfs contortions are horrible. no fsnotify implementers do allocation under locks. This means things like f04b30de3 which (due to an overabundance of caution) changes GFP_KERNEL to GFP_NOFS can be reverted. There are no longer any allocation rules when using or implementing your own fsnotify listener. fsnotify paves the way for fanotify. In brief fanotify is a notification mechanism that delivers the lisener both an 'event' and an open file descriptor to the object in question. This means that fanotify is pathname agnostic. Some on lkml may not care for the original companies or users that pushed for TALPA, but fanotify was designed with flexibility and input for other users in mind. The readahead group expressed interest in fanotify as it could be used to profile disk access on boot without breaking the audit system. The desktop search groups have also expressed interest in fanotify as it solves a number of the race conditions and problems present with managing inotify when more than a limited number of specific files are of interest. fanotify can provide for a userspace access control system which makes it a clean interface for AV vendors to hook without trying to do binary patching on the syscall table, LSM, and everywhere else they do their things today. With this patch series fanotify can be implemented in less than 1200 lines of easy to review code. Almost all of which is the socket based user interface. This patch series builds fsnotify to the point that it can implement dnotify and inotify_user. Patches exist and will be sent soon after acceptance to finish the in kernel inotify conversion (audit) and implement fanotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/Kconfig | 13 +++ fs/notify/Makefile | 2 + fs/notify/fsnotify.c | 79 ++++++++++++++++++ fs/notify/fsnotify.h | 15 ++++ fs/notify/group.c | 198 ++++++++++++++++++++++++++++++++++++++++++++ fs/notify/inotify/inotify.c | 20 +++++ fs/notify/notification.c | 121 +++++++++++++++++++++++++++ 7 files changed, 448 insertions(+) create mode 100644 fs/notify/fsnotify.c create mode 100644 fs/notify/fsnotify.h create mode 100644 fs/notify/group.c create mode 100644 fs/notify/notification.c (limited to 'fs') diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 50914d7..31dac7e 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,2 +1,15 @@ +config FSNOTIFY + bool "Filesystem notification backend" + default y + ---help--- + fsnotify is a backend for filesystem notification. fsnotify does + not provide any userspace interface but does provide the basis + needed for other notification schemes such as dnotify, inotify, + and fanotify. + + Say Y here to enable fsnotify suport. + + If unsure, say Y. + source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 5a95b60..db5467b 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,2 +1,4 @@ +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o + obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c new file mode 100644 index 0000000..56bee0f --- /dev/null +++ b/fs/notify/fsnotify.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +/* + * This is the main call to fsnotify. The VFS calls into hook specific functions + * in linux/fsnotify.h. Those functions then in turn call here. Here will call + * out to all of the registered fsnotify_group. Those groups can then use the + * notification event in whatever means they feel necessary. + */ +void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) +{ + struct fsnotify_group *group; + struct fsnotify_event *event = NULL; + int idx; + + if (list_empty(&fsnotify_groups)) + return; + + if (!(mask & fsnotify_mask)) + return; + + /* + * SRCU!! the groups list is very very much read only and the path is + * very hot. The VAST majority of events are not going to need to do + * anything other than walk the list so it's crazy to pre-allocate. + */ + idx = srcu_read_lock(&fsnotify_grp_srcu); + list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { + if (mask & group->mask) { + if (!event) { + event = fsnotify_create_event(to_tell, mask, data, data_is); + /* shit, we OOM'd and now we can't tell, maybe + * someday someone else will want to do something + * here */ + if (!event) + break; + } + group->ops->handle_event(group, event); + } + } + srcu_read_unlock(&fsnotify_grp_srcu, idx); + /* + * fsnotify_create_event() took a reference so the event can't be cleaned + * up while we are still trying to add it to lists, drop that one. + */ + if (event) + fsnotify_put_event(event); +} +EXPORT_SYMBOL_GPL(fsnotify); + +static __init int fsnotify_init(void) +{ + return init_srcu_struct(&fsnotify_grp_srcu); +} +subsys_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h new file mode 100644 index 0000000..c6a8bd4 --- /dev/null +++ b/fs/notify/fsnotify.h @@ -0,0 +1,15 @@ +#ifndef __FS_NOTIFY_FSNOTIFY_H_ +#define __FS_NOTIFY_FSNOTIFY_H_ + +#include +#include +#include +#include + +/* protects reads of fsnotify_groups */ +extern struct srcu_struct fsnotify_grp_srcu; +/* all groups which receive fsnotify events */ +extern struct list_head fsnotify_groups; +/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ +extern __u32 fsnotify_mask; +#endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c new file mode 100644 index 0000000..c681295 --- /dev/null +++ b/fs/notify/group.c @@ -0,0 +1,198 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include "fsnotify.h" + +#include + +/* protects writes to fsnotify_groups and fsnotify_mask */ +static DEFINE_MUTEX(fsnotify_grp_mutex); +/* protects reads while running the fsnotify_groups list */ +struct srcu_struct fsnotify_grp_srcu; +/* all groups registered to receive filesystem notifications */ +LIST_HEAD(fsnotify_groups); +/* bitwise OR of all events (FS_*) interesting to some group on this system */ +__u32 fsnotify_mask; + +/* + * When a new group registers or changes it's set of interesting events + * this function updates the fsnotify_mask to contain all interesting events + */ +void fsnotify_recalc_global_mask(void) +{ + struct fsnotify_group *group; + __u32 mask = 0; + int idx; + + idx = srcu_read_lock(&fsnotify_grp_srcu); + list_for_each_entry_rcu(group, &fsnotify_groups, group_list) + mask |= group->mask; + srcu_read_unlock(&fsnotify_grp_srcu, idx); + fsnotify_mask = mask; +} + +/* + * Take a reference to a group so things found under the fsnotify_grp_mutex + * can't get freed under us + */ +static void fsnotify_get_group(struct fsnotify_group *group) +{ + atomic_inc(&group->refcnt); +} + +/* + * Final freeing of a group + */ +static void fsnotify_destroy_group(struct fsnotify_group *group) +{ + if (group->ops->free_group_priv) + group->ops->free_group_priv(group); + + kfree(group); +} + +/* + * Remove this group from the global list of groups that will get events + * this can be done even if there are still references and things still using + * this group. This just stops the group from getting new events. + */ +static void __fsnotify_evict_group(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); + + if (group->on_group_list) + list_del_rcu(&group->group_list); + group->on_group_list = 0; +} + +/* + * Called when a group is no longer interested in getting events. This can be + * used if a group is misbehaving or if for some reason a group should no longer + * get any filesystem events. + */ +void fsnotify_evict_group(struct fsnotify_group *group) +{ + mutex_lock(&fsnotify_grp_mutex); + __fsnotify_evict_group(group); + mutex_unlock(&fsnotify_grp_mutex); +} + +/* + * Drop a reference to a group. Free it if it's through. + */ +void fsnotify_put_group(struct fsnotify_group *group) +{ + if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex)) + return; + + /* + * OK, now we know that there's no other users *and* we hold mutex, + * so no new references will appear + */ + __fsnotify_evict_group(group); + + /* + * now it's off the list, so the only thing we might care about is + * srcu access.... + */ + mutex_unlock(&fsnotify_grp_mutex); + synchronize_srcu(&fsnotify_grp_srcu); + + /* and now it is really dead. _Nothing_ could be seeing it */ + fsnotify_recalc_global_mask(); + fsnotify_destroy_group(group); +} + +/* + * Simply run the fsnotify_groups list and find a group which matches + * the given parameters. If a group is found we take a reference to that + * group. + */ +static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask, + const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group_iter; + struct fsnotify_group *group = NULL; + + BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex)); + + list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) { + if (group_iter->group_num == group_num) { + if ((group_iter->mask == mask) && + (group_iter->ops == ops)) { + fsnotify_get_group(group_iter); + group = group_iter; + } else + group = ERR_PTR(-EEXIST); + } + } + return group; +} + +/* + * Either finds an existing group which matches the group_num, mask, and ops or + * creates a new group and adds it to the global group list. In either case we + * take a reference for the group returned. + */ +struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, + const struct fsnotify_ops *ops) +{ + struct fsnotify_group *group, *tgroup; + + /* very low use, simpler locking if we just always alloc */ + group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + atomic_set(&group->refcnt, 1); + + group->on_group_list = 0; + group->group_num = group_num; + group->mask = mask; + + group->ops = ops; + + mutex_lock(&fsnotify_grp_mutex); + tgroup = fsnotify_find_group(group_num, mask, ops); + if (tgroup) { + /* group already exists */ + mutex_unlock(&fsnotify_grp_mutex); + /* destroy the new one we made */ + fsnotify_put_group(group); + return tgroup; + } + + /* group not found, add a new one */ + list_add_rcu(&group->group_list, &fsnotify_groups); + group->on_group_list = 1; + + mutex_unlock(&fsnotify_grp_mutex); + + if (mask) + fsnotify_recalc_global_mask(); + + return group; +} diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index 220c13f..40b1cf9 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -32,6 +32,7 @@ #include #include #include +#include static atomic_t inotify_cookie; @@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch); */ static int __init inotify_setup(void) { + BUILD_BUG_ON(IN_ACCESS != FS_ACCESS); + BUILD_BUG_ON(IN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB); + BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE); + BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); + BUILD_BUG_ON(IN_OPEN != FS_OPEN); + BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(IN_CREATE != FS_CREATE); + BUILD_BUG_ON(IN_DELETE != FS_DELETE); + BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF); + BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); + + BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT); + BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); + BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); + BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); + atomic_set(&inotify_cookie, 0); return 0; diff --git a/fs/notify/notification.c b/fs/notify/notification.c new file mode 100644 index 0000000..b8e9a87 --- /dev/null +++ b/fs/notify/notification.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +static struct kmem_cache *fsnotify_event_cachep; + +void fsnotify_get_event(struct fsnotify_event *event) +{ + atomic_inc(&event->refcnt); +} + +void fsnotify_put_event(struct fsnotify_event *event) +{ + if (!event) + return; + + if (atomic_dec_and_test(&event->refcnt)) { + if (event->data_type == FSNOTIFY_EVENT_PATH) + path_put(&event->path); + + kmem_cache_free(fsnotify_event_cachep, event); + } +} + +/* + * Allocate a new event which will be sent to each group's handle_event function + * if the group was interested in this particular event. + */ +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, + void *data, int data_type) +{ + struct fsnotify_event *event; + + event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; + + atomic_set(&event->refcnt, 1); + + spin_lock_init(&event->lock); + + event->path.dentry = NULL; + event->path.mnt = NULL; + event->inode = NULL; + + event->to_tell = to_tell; + + switch (data_type) { + case FSNOTIFY_EVENT_FILE: { + struct file *file = data; + struct path *path = &file->f_path; + event->path.dentry = path->dentry; + event->path.mnt = path->mnt; + path_get(&event->path); + event->data_type = FSNOTIFY_EVENT_PATH; + break; + } + case FSNOTIFY_EVENT_PATH: { + struct path *path = data; + event->path.dentry = path->dentry; + event->path.mnt = path->mnt; + path_get(&event->path); + event->data_type = FSNOTIFY_EVENT_PATH; + break; + } + case FSNOTIFY_EVENT_INODE: + event->inode = data; + event->data_type = FSNOTIFY_EVENT_INODE; + break; + case FSNOTIFY_EVENT_NONE: + event->inode = NULL; + event->path.dentry = NULL; + event->path.mnt = NULL; + break; + default: + BUG(); + } + + event->mask = mask; + + return event; +} + +__init int fsnotify_notification_init(void) +{ + fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); + + return 0; +} +subsys_initcall(fsnotify_notification_init); + -- cgit v1.1 From 3be25f49b9d6a97eae9bcb96d3292072b7658bd8 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:26 -0400 Subject: fsnotify: add marks to inodes so groups can interpret how to handle those inodes This patch creates a way for fsnotify groups to attach marks to inodes. These marks have little meaning to the generic fsnotify infrastructure and thus their meaning should be interpreted by the group that attached them to the inode's list. dnotify and inotify will make use of these markings to indicate which inodes are of interest to their respective groups. But this implementation has the useful property that in the future other listeners could actually use the marks for the exact opposite reason, aka to indicate which inodes it had NO interest in. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/inode.c | 9 ++ fs/notify/Makefile | 2 +- fs/notify/fsnotify.c | 13 ++ fs/notify/fsnotify.h | 5 + fs/notify/group.c | 49 +++++++- fs/notify/inode_mark.c | 329 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 405 insertions(+), 2 deletions(-) create mode 100644 fs/notify/inode_mark.c (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index bca0c61..54c63ce 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_private = NULL; inode->i_mapping = mapping; +#ifdef CONFIG_FSNOTIFY + inode->i_fsnotify_mask = 0; +#endif + return inode; out_free_security: @@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); security_inode_free(inode); + fsnotify_inode_delete(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else @@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->inotify_watches); mutex_init(&inode->inotify_mutex); #endif +#ifdef CONFIG_FSNOTIFY + INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries); +#endif } EXPORT_SYMBOL(inode_init_once); diff --git a/fs/notify/Makefile b/fs/notify/Makefile index db5467b..0922cc8 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 56bee0f..d565462 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -26,6 +26,15 @@ #include "fsnotify.h" /* + * Clear all of the marks on an inode when it is being evicted from core + */ +void __fsnotify_inode_delete(struct inode *inode) +{ + fsnotify_clear_marks_by_inode(inode); +} +EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); + +/* * This is the main call to fsnotify. The VFS calls into hook specific functions * in linux/fsnotify.h. Those functions then in turn call here. Here will call * out to all of the registered fsnotify_group. Those groups can then use the @@ -43,6 +52,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) if (!(mask & fsnotify_mask)) return; + if (!(mask & to_tell->i_fsnotify_mask)) + return; /* * SRCU!! the groups list is very very much read only and the path is * very hot. The VAST majority of events are not going to need to do @@ -51,6 +62,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) idx = srcu_read_lock(&fsnotify_grp_srcu); list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { if (mask & group->mask) { + if (!group->ops->should_send_event(group, to_tell, mask)) + continue; if (!event) { event = fsnotify_create_event(to_tell, mask, data, data_is); /* shit, we OOM'd and now we can't tell, maybe diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index c6a8bd4..8ebcbe8 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -12,4 +12,9 @@ extern struct srcu_struct fsnotify_grp_srcu; extern struct list_head fsnotify_groups; /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ extern __u32 fsnotify_mask; + +/* final kfree of a group */ +extern void fsnotify_final_destroy_group(struct fsnotify_group *group); +/* run the list of all marks associated with inode and flag them to be freed */ +extern void fsnotify_clear_marks_by_inode(struct inode *inode); #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c index c681295..a29d2fa 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -55,6 +55,29 @@ void fsnotify_recalc_global_mask(void) } /* + * Update the group->mask by running all of the marks associated with this + * group and finding the bitwise | of all of the mark->mask. If we change + * the group->mask we need to update the global mask of events interesting + * to the system. + */ +void fsnotify_recalc_group_mask(struct fsnotify_group *group) +{ + __u32 mask = 0; + __u32 old_mask = group->mask; + struct fsnotify_mark_entry *entry; + + spin_lock(&group->mark_lock); + list_for_each_entry(entry, &group->mark_entries, g_list) + mask |= entry->mask; + spin_unlock(&group->mark_lock); + + group->mask = mask; + + if (old_mask != mask) + fsnotify_recalc_global_mask(); +} + +/* * Take a reference to a group so things found under the fsnotify_grp_mutex * can't get freed under us */ @@ -66,7 +89,7 @@ static void fsnotify_get_group(struct fsnotify_group *group) /* * Final freeing of a group */ -static void fsnotify_destroy_group(struct fsnotify_group *group) +void fsnotify_final_destroy_group(struct fsnotify_group *group) { if (group->ops->free_group_priv) group->ops->free_group_priv(group); @@ -75,6 +98,24 @@ static void fsnotify_destroy_group(struct fsnotify_group *group) } /* + * Trying to get rid of a group. We need to first get rid of any outstanding + * allocations and then free the group. Remember that fsnotify_clear_marks_by_group + * could miss marks that are being freed by inode and those marks could still + * hold a reference to this group (via group->num_marks) If we get into that + * situtation, the fsnotify_final_destroy_group will get called when that final + * mark is freed. + */ +static void fsnotify_destroy_group(struct fsnotify_group *group) +{ + /* clear all inode mark entries for this group */ + fsnotify_clear_marks_by_group(group); + + /* past the point of no return, matches the initial value of 1 */ + if (atomic_dec_and_test(&group->num_marks)) + fsnotify_final_destroy_group(group); +} + +/* * Remove this group from the global list of groups that will get events * this can be done even if there are still references and things still using * this group. This just stops the group from getting new events. @@ -173,6 +214,10 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, group->group_num = group_num; group->mask = mask; + spin_lock_init(&group->mark_lock); + atomic_set(&group->num_marks, 0); + INIT_LIST_HEAD(&group->mark_entries); + group->ops = ops; mutex_lock(&fsnotify_grp_mutex); @@ -188,6 +233,8 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, /* group not found, add a new one */ list_add_rcu(&group->group_list, &fsnotify_groups); group->on_group_list = 1; + /* being on the fsnotify_groups list holds one num_marks */ + atomic_inc(&group->num_marks); mutex_unlock(&fsnotify_grp_mutex); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c new file mode 100644 index 0000000..cdc1541 --- /dev/null +++ b/fs/notify/inode_mark.c @@ -0,0 +1,329 @@ +/* + * Copyright (C) 2008 Red Hat, Inc., Eric Paris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * fsnotify inode mark locking/lifetime/and refcnting + * + * REFCNT: + * The mark->refcnt tells how many "things" in the kernel currently are + * referencing this object. The object typically will live inside the kernel + * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task + * which can find this object holding the appropriete locks, can take a reference + * and the object itself is guarenteed to survive until the reference is dropped. + * + * LOCKING: + * There are 3 spinlocks involved with fsnotify inode marks and they MUST + * be taken in order as follows: + * + * entry->lock + * group->mark_lock + * inode->i_lock + * + * entry->lock protects 2 things, entry->group and entry->inode. You must hold + * that lock to dereference either of these things (they could be NULL even with + * the lock) + * + * group->mark_lock protects the mark_entries list anchored inside a given group + * and each entry is hooked via the g_list. It also sorta protects the + * free_g_list, which when used is anchored by a private list on the stack of the + * task which held the group->mark_lock. + * + * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a + * given inode and each entry is hooked via the i_list. (and sorta the + * free_i_list) + * + * + * LIFETIME: + * Inode marks survive between when they are added to an inode and when their + * refcnt==0. + * + * The inode mark can be cleared for a number of different reasons including: + * - The inode is unlinked for the last time. (fsnotify_inode_remove) + * - The inode is being evicted from cache. (fsnotify_inode_delete) + * - The fs the inode is on is unmounted. (fsnotify_inode_delete/fsnotify_unmount_inodes) + * - Something explicitly requests that it be removed. (fsnotify_destroy_mark_by_entry) + * - The fsnotify_group associated with the mark is going away and all such marks + * need to be cleaned up. (fsnotify_clear_marks_by_group) + * + * Worst case we are given an inode and need to clean up all the marks on that + * inode. We take i_lock and walk the i_fsnotify_mark_entries safely. For each + * mark on the list we take a reference (so the mark can't disappear under us). + * We remove that mark form the inode's list of marks and we add this mark to a + * private list anchored on the stack using i_free_list; At this point we no + * longer fear anything finding the mark using the inode's list of marks. + * + * We can safely and locklessly run the private list on the stack of everything + * we just unattached from the original inode. For each mark on the private list + * we grab the mark-> and can thus dereference mark->group and mark->inode. If + * we see the group and inode are not NULL we take those locks. Now holding all + * 3 locks we can completely remove the mark from other tasks finding it in the + * future. Remember, 10 things might already be referencing this mark, but they + * better be holding a ref. We drop our reference we took before we unhooked it + * from the inode. When the ref hits 0 we can free the mark. + * + * Very similarly for freeing by group, except we use free_g_list. + * + * This has the very interesting property of being able to run concurrently with + * any (or all) other directions. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include "fsnotify.h" + +void fsnotify_get_mark(struct fsnotify_mark_entry *entry) +{ + atomic_inc(&entry->refcnt); +} + +void fsnotify_put_mark(struct fsnotify_mark_entry *entry) +{ + if (atomic_dec_and_test(&entry->refcnt)) + entry->free_mark(entry); +} + +/* + * Recalculate the mask of events relevant to a given inode locked. + */ +static void fsnotify_recalc_inode_mask_locked(struct inode *inode) +{ + struct fsnotify_mark_entry *entry; + struct hlist_node *pos; + __u32 new_mask = 0; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) + new_mask |= entry->mask; + inode->i_fsnotify_mask = new_mask; +} + +/* + * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types + * any notifier is interested in hearing for this inode. + */ +void fsnotify_recalc_inode_mask(struct inode *inode) +{ + spin_lock(&inode->i_lock); + fsnotify_recalc_inode_mask_locked(inode); + spin_unlock(&inode->i_lock); +} + +/* + * Any time a mark is getting freed we end up here. + * The caller had better be holding a reference to this mark so we don't actually + * do the final put under the entry->lock + */ +void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) +{ + struct fsnotify_group *group; + struct inode *inode; + + spin_lock(&entry->lock); + + group = entry->group; + inode = entry->inode; + + BUG_ON(group && !inode); + BUG_ON(!group && inode); + + /* if !group something else already marked this to die */ + if (!group) { + spin_unlock(&entry->lock); + return; + } + + /* 1 from caller and 1 for being on i_list/g_list */ + BUG_ON(atomic_read(&entry->refcnt) < 2); + + spin_lock(&group->mark_lock); + spin_lock(&inode->i_lock); + + hlist_del_init(&entry->i_list); + entry->inode = NULL; + + list_del_init(&entry->g_list); + entry->group = NULL; + + fsnotify_put_mark(entry); /* for i_list and g_list */ + + /* + * this mark is now off the inode->i_fsnotify_mark_entries list and we + * hold the inode->i_lock, so this is the perfect time to update the + * inode->i_fsnotify_mask + */ + fsnotify_recalc_inode_mask_locked(inode); + + spin_unlock(&inode->i_lock); + spin_unlock(&group->mark_lock); + spin_unlock(&entry->lock); + + /* + * Some groups like to know that marks are being freed. This is a + * callback to the group function to let it know that this entry + * is being freed. + */ + group->ops->freeing_mark(entry, group); + + /* + * it's possible that this group tried to destroy itself, but this + * this mark was simultaneously being freed by inode. If that's the + * case, we finish freeing the group here. + */ + if (unlikely(atomic_dec_and_test(&group->num_marks))) + fsnotify_final_destroy_group(group); +} + +/* + * Given a group, destroy all of the marks associated with that group. + */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group) +{ + struct fsnotify_mark_entry *lentry, *entry; + LIST_HEAD(free_list); + + spin_lock(&group->mark_lock); + list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) { + list_add(&entry->free_g_list, &free_list); + list_del_init(&entry->g_list); + fsnotify_get_mark(entry); + } + spin_unlock(&group->mark_lock); + + list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) { + fsnotify_destroy_mark_by_entry(entry); + fsnotify_put_mark(entry); + } +} + +/* + * Given an inode, destroy all of the marks associated with that inode. + */ +void fsnotify_clear_marks_by_inode(struct inode *inode) +{ + struct fsnotify_mark_entry *entry, *lentry; + struct hlist_node *pos, *n; + LIST_HEAD(free_list); + + spin_lock(&inode->i_lock); + hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) { + list_add(&entry->free_i_list, &free_list); + hlist_del_init(&entry->i_list); + fsnotify_get_mark(entry); + } + spin_unlock(&inode->i_lock); + + list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) { + fsnotify_destroy_mark_by_entry(entry); + fsnotify_put_mark(entry); + } +} + +/* + * given a group and inode, find the mark associated with that combination. + * if found take a reference to that mark and return it, else return NULL + */ +struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, + struct inode *inode) +{ + struct fsnotify_mark_entry *entry; + struct hlist_node *pos; + + assert_spin_locked(&inode->i_lock); + + hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) { + if (entry->group == group) { + fsnotify_get_mark(entry); + return entry; + } + } + return NULL; +} + +/* + * Nothing fancy, just initialize lists and locks and counters. + */ +void fsnotify_init_mark(struct fsnotify_mark_entry *entry, + void (*free_mark)(struct fsnotify_mark_entry *entry)) + +{ + spin_lock_init(&entry->lock); + atomic_set(&entry->refcnt, 1); + INIT_HLIST_NODE(&entry->i_list); + entry->group = NULL; + entry->mask = 0; + entry->inode = NULL; + entry->free_mark = free_mark; +} + +/* + * Attach an initialized mark entry to a given group and inode. + * These marks may be used for the fsnotify backend to determine which + * event types should be delivered to which group and for which inodes. + */ +int fsnotify_add_mark(struct fsnotify_mark_entry *entry, + struct fsnotify_group *group, struct inode *inode) +{ + struct fsnotify_mark_entry *lentry; + int ret = 0; + + /* + * LOCKING ORDER!!!! + * entry->lock + * group->mark_lock + * inode->i_lock + */ + spin_lock(&entry->lock); + spin_lock(&group->mark_lock); + spin_lock(&inode->i_lock); + + entry->group = group; + entry->inode = inode; + + lentry = fsnotify_find_mark_entry(group, inode); + if (!lentry) { + hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries); + list_add(&entry->g_list, &group->mark_entries); + + fsnotify_get_mark(entry); /* for i_list and g_list */ + + atomic_inc(&group->num_marks); + + fsnotify_recalc_inode_mask_locked(inode); + } + + spin_unlock(&inode->i_lock); + spin_unlock(&group->mark_lock); + spin_unlock(&entry->lock); + + if (lentry) { + ret = -EEXIST; + fsnotify_put_mark(lentry); + } + + return ret; +} -- cgit v1.1 From c28f7e56e9d95fb531dc3be8df2e7f52bee76d21 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:29 -0400 Subject: fsnotify: parent event notification inotify and dnotify both use a similar parent notification mechanism. We add a generic parent notification mechanism to fsnotify for both of these to use. This new machanism also adds the dentry flag optimization which exists for inotify to dnotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/notify/fsnotify.h | 5 +++ fs/notify/inode_mark.c | 17 ++++++++++ 3 files changed, 113 insertions(+) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index d565462..7fc7600 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -35,6 +35,97 @@ void __fsnotify_inode_delete(struct inode *inode) EXPORT_SYMBOL_GPL(__fsnotify_inode_delete); /* + * Given an inode, first check if we care what happens to our children. Inotify + * and dnotify both tell their parents about events. If we care about any event + * on a child we run all of our children and set a dentry flag saying that the + * parent cares. Thus when an event happens on a child it can quickly tell if + * if there is a need to find a parent and send the event to the parent. + */ +void __fsnotify_update_child_dentry_flags(struct inode *inode) +{ + struct dentry *alias; + int watched; + + if (!S_ISDIR(inode->i_mode)) + return; + + /* determine if the children should tell inode about their events */ + watched = fsnotify_inode_watches_children(inode); + + spin_lock(&dcache_lock); + /* run all of the dentries associated with this inode. Since this is a + * directory, there damn well better only be one item on this list */ + list_for_each_entry(alias, &inode->i_dentry, d_alias) { + struct dentry *child; + + /* run all of the children of the original inode and fix their + * d_flags to indicate parental interest (their parent is the + * original inode) */ + list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) { + if (!child->d_inode) + continue; + + spin_lock(&child->d_lock); + if (watched) + child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; + else + child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; + spin_unlock(&child->d_lock); + } + } + spin_unlock(&dcache_lock); +} + +/* Notify this dentry's parent about a child's events. */ +void __fsnotify_parent(struct dentry *dentry, __u32 mask) +{ + struct dentry *parent; + struct inode *p_inode; + bool send = false; + bool should_update_children = false; + + if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) + return; + + spin_lock(&dentry->d_lock); + parent = dentry->d_parent; + p_inode = parent->d_inode; + + if (fsnotify_inode_watches_children(p_inode)) { + if (p_inode->i_fsnotify_mask & mask) { + dget(parent); + send = true; + } + } else { + /* + * The parent doesn't care about events on it's children but + * at least one child thought it did. We need to run all the + * children and update their d_flags to let them know p_inode + * doesn't care about them any more. + */ + dget(parent); + should_update_children = true; + } + + spin_unlock(&dentry->d_lock); + + if (send) { + /* we are notifying a parent so come up with the new mask which + * specifies these are events which came from a child. */ + mask |= FS_EVENT_ON_CHILD; + + fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE); + dput(parent); + } + + if (unlikely(should_update_children)) { + __fsnotify_update_child_dentry_flags(p_inode); + dput(parent); + } +} +EXPORT_SYMBOL_GPL(__fsnotify_parent); + +/* * This is the main call to fsnotify. The VFS calls into hook specific functions * in linux/fsnotify.h. Those functions then in turn call here. Here will call * out to all of the registered fsnotify_group. Those groups can then use the diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 8ebcbe8..83b8ec0 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -17,4 +17,9 @@ extern __u32 fsnotify_mask; extern void fsnotify_final_destroy_group(struct fsnotify_group *group); /* run the list of all marks associated with inode and flag them to be freed */ extern void fsnotify_clear_marks_by_inode(struct inode *inode); +/* + * update the dentry->d_flags of all of inode's children to indicate if inode cares + * about events that happen to its children. + */ +extern void __fsnotify_update_child_dentry_flags(struct inode *inode); #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index cdc1541..a395348 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -131,6 +131,8 @@ void fsnotify_recalc_inode_mask(struct inode *inode) spin_lock(&inode->i_lock); fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); + + __fsnotify_update_child_dentry_flags(inode); } /* @@ -190,6 +192,19 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) group->ops->freeing_mark(entry, group); /* + * __fsnotify_update_child_dentry_flags(inode); + * + * I really want to call that, but we can't, we have no idea if the inode + * still exists the second we drop the entry->lock. + * + * The next time an event arrive to this inode from one of it's children + * __fsnotify_parent will see that the inode doesn't care about it's + * children and will update all of these flags then. So really this + * is just a lazy update (and could be a perf win...) + */ + + + /* * it's possible that this group tried to destroy itself, but this * this mark was simultaneously being freed by inode. If that's the * case, we finish freeing the group here. @@ -323,6 +338,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, if (lentry) { ret = -EEXIST; fsnotify_put_mark(lentry); + } else { + __fsnotify_update_child_dentry_flags(inode); } return ret; -- cgit v1.1 From 3c5119c05d624f95f4967d16b38c9624b816bdb9 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:33 -0400 Subject: dnotify: reimplement dnotify using fsnotify Reimplement dnotify using fsnotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/dnotify/Kconfig | 1 + fs/notify/dnotify/dnotify.c | 469 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 363 insertions(+), 107 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 26adf5d..904ff8d 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,5 +1,6 @@ config DNOTIFY bool "Dnotify support" + depends on FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index b0aa2cd..d9d80f5 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -3,6 +3,9 @@ * * Copyright (C) 2000,2001,2002 Stephen Rothwell * + * Copyright (C) 2009 Eric Paris + * dnotify was largly rewritten to use the new fsnotify infrastructure + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -21,24 +24,178 @@ #include #include #include +#include int dir_notify_enable __read_mostly = 1; -static struct kmem_cache *dn_cache __read_mostly; +static struct kmem_cache *dnotify_struct_cache __read_mostly; +static struct kmem_cache *dnotify_mark_entry_cache __read_mostly; +static struct fsnotify_group *dnotify_group __read_mostly; +static DEFINE_MUTEX(dnotify_mark_mutex); + +/* + * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which + * is being watched by dnotify. If multiple userspace applications are watching + * the same directory with dnotify their information is chained in dn + */ +struct dnotify_mark_entry { + struct fsnotify_mark_entry fsn_entry; + struct dnotify_struct *dn; +}; -static void redo_inode_mask(struct inode *inode) +/* + * When a process starts or stops watching an inode the set of events which + * dnotify cares about for that inode may change. This function runs the + * list of everything receiving dnotify events about this directory and calculates + * the set of all those events. After it updates what dnotify is interested in + * it calls the fsnotify function so it can update the set of all events relevant + * to this inode. + */ +static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry) { - unsigned long new_mask; + __u32 new_mask, old_mask; struct dnotify_struct *dn; + struct dnotify_mark_entry *dnentry = container_of(entry, + struct dnotify_mark_entry, + fsn_entry); + + assert_spin_locked(&entry->lock); + old_mask = entry->mask; new_mask = 0; - for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next) - new_mask |= dn->dn_mask & ~DN_MULTISHOT; - inode->i_dnotify_mask = new_mask; + for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next) + new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); + entry->mask = new_mask; + + if (old_mask == new_mask) + return; + + if (entry->inode) + fsnotify_recalc_inode_mask(entry->inode); } +/* + * Mains fsnotify call where events are delivered to dnotify. + * Find the dnotify mark on the relevant inode, run the list of dnotify structs + * on that mark and determine which of them has expressed interest in receiving + * events of this type. When found send the correct process and signal and + * destroy the dnotify struct if it was not registered to receive multiple + * events. + */ +static int dnotify_handle_event(struct fsnotify_group *group, + struct fsnotify_event *event) +{ + struct fsnotify_mark_entry *entry = NULL; + struct dnotify_mark_entry *dnentry; + struct inode *to_tell; + struct dnotify_struct *dn; + struct dnotify_struct **prev; + struct fown_struct *fown; + + to_tell = event->to_tell; + + spin_lock(&to_tell->i_lock); + entry = fsnotify_find_mark_entry(group, to_tell); + spin_unlock(&to_tell->i_lock); + + /* unlikely since we alreay passed dnotify_should_send_event() */ + if (unlikely(!entry)) + return 0; + dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); + + spin_lock(&entry->lock); + prev = &dnentry->dn; + while ((dn = *prev) != NULL) { + if ((dn->dn_mask & event->mask) == 0) { + prev = &dn->dn_next; + continue; + } + fown = &dn->dn_filp->f_owner; + send_sigio(fown, dn->dn_fd, POLL_MSG); + if (dn->dn_mask & FS_DN_MULTISHOT) + prev = &dn->dn_next; + else { + *prev = dn->dn_next; + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(entry); + } + } + + spin_unlock(&entry->lock); + fsnotify_put_mark(entry); + + return 0; +} + +/* + * Given an inode and mask determine if dnotify would be interested in sending + * userspace notification for that pair. + */ +static bool dnotify_should_send_event(struct fsnotify_group *group, + struct inode *inode, __u32 mask) +{ + struct fsnotify_mark_entry *entry; + bool send; + + /* !dir_notify_enable should never get here, don't waste time checking + if (!dir_notify_enable) + return 0; */ + + /* not a dir, dnotify doesn't care */ + if (!S_ISDIR(inode->i_mode)) + return false; + + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(group, inode); + spin_unlock(&inode->i_lock); + + /* no mark means no dnotify watch */ + if (!entry) + return false; + + spin_lock(&entry->lock); + send = (mask & entry->mask) ? true : false; + spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ + + return send; +} + +static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry, + struct fsnotify_group *group) +{ + /* dnotify doesn't care than an inode is on the way out */ +} + +static void dnotify_free_mark(struct fsnotify_mark_entry *entry) +{ + struct dnotify_mark_entry *dnentry = container_of(entry, + struct dnotify_mark_entry, + fsn_entry); + + BUG_ON(dnentry->dn); + + kmem_cache_free(dnotify_mark_entry_cache, dnentry); +} + +static struct fsnotify_ops dnotify_fsnotify_ops = { + .handle_event = dnotify_handle_event, + .should_send_event = dnotify_should_send_event, + .free_group_priv = NULL, + .freeing_mark = dnotify_freeing_mark, +}; + +/* + * Called every time a file is closed. Looks first for a dnotify mark on the + * inode. If one is found run all of the ->dn entries attached to that + * mark for one relevant to this process closing the file and remove that + * dnotify_struct. If that was the last dnotify_struct also remove the + * fsnotify_mark_entry. + */ void dnotify_flush(struct file *filp, fl_owner_t id) { + struct fsnotify_mark_entry *entry; + struct dnotify_mark_entry *dnentry; struct dnotify_struct *dn; struct dnotify_struct **prev; struct inode *inode; @@ -46,145 +203,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id) inode = filp->f_path.dentry->d_inode; if (!S_ISDIR(inode->i_mode)) return; + spin_lock(&inode->i_lock); - prev = &inode->i_dnotify; + entry = fsnotify_find_mark_entry(dnotify_group, inode); + spin_unlock(&inode->i_lock); + if (!entry) + return; + dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); + + mutex_lock(&dnotify_mark_mutex); + + spin_lock(&entry->lock); + prev = &dnentry->dn; while ((dn = *prev) != NULL) { if ((dn->dn_owner == id) && (dn->dn_filp == filp)) { *prev = dn->dn_next; - redo_inode_mask(inode); - kmem_cache_free(dn_cache, dn); + kmem_cache_free(dnotify_struct_cache, dn); + dnotify_recalc_inode_mask(entry); break; } prev = &dn->dn_next; } - spin_unlock(&inode->i_lock); + + spin_unlock(&entry->lock); + + /* nothing else could have found us thanks to the dnotify_mark_mutex */ + if (dnentry->dn == NULL) + fsnotify_destroy_mark_by_entry(entry); + + fsnotify_recalc_group_mask(dnotify_group); + + mutex_unlock(&dnotify_mark_mutex); + + fsnotify_put_mark(entry); +} + +/* this conversion is done only at watch creation */ +static __u32 convert_arg(unsigned long arg) +{ + __u32 new_mask = FS_EVENT_ON_CHILD; + + if (arg & DN_MULTISHOT) + new_mask |= FS_DN_MULTISHOT; + if (arg & DN_DELETE) + new_mask |= (FS_DELETE | FS_MOVED_FROM); + if (arg & DN_MODIFY) + new_mask |= FS_MODIFY; + if (arg & DN_ACCESS) + new_mask |= FS_ACCESS; + if (arg & DN_ATTRIB) + new_mask |= FS_ATTRIB; + if (arg & DN_RENAME) + new_mask |= FS_DN_RENAME; + if (arg & DN_CREATE) + new_mask |= (FS_CREATE | FS_MOVED_TO); + + return new_mask; } +/* + * If multiple processes watch the same inode with dnotify there is only one + * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct + * onto that mark. This function either attaches the new dnotify_struct onto + * that list, or it |= the mask onto an existing dnofiy_struct. + */ +static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry, + fl_owner_t id, int fd, struct file *filp, __u32 mask) +{ + struct dnotify_struct *odn; + + odn = dnentry->dn; + while (odn != NULL) { + /* adding more events to existing dnofiy_struct? */ + if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { + odn->dn_fd = fd; + odn->dn_mask |= mask; + return -EEXIST; + } + odn = odn->dn_next; + } + + dn->dn_mask = mask; + dn->dn_fd = fd; + dn->dn_filp = filp; + dn->dn_owner = id; + dn->dn_next = dnentry->dn; + dnentry->dn = dn; + + return 0; +} + +/* + * When a process calls fcntl to attach a dnotify watch to a directory it ends + * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be + * attached to the fsnotify_mark. + */ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) { + struct dnotify_mark_entry *new_dnentry, *dnentry; + struct fsnotify_mark_entry *new_entry, *entry; struct dnotify_struct *dn; - struct dnotify_struct *odn; - struct dnotify_struct **prev; struct inode *inode; fl_owner_t id = current->files; struct file *f; - int error = 0; + int destroy = 0, error = 0; + __u32 mask; + + /* we use these to tell if we need to kfree */ + new_entry = NULL; + dn = NULL; + if (!dir_notify_enable) { + error = -EINVAL; + goto out_err; + } + + /* a 0 mask means we are explicitly removing the watch */ if ((arg & ~DN_MULTISHOT) == 0) { dnotify_flush(filp, id); - return 0; + error = 0; + goto out_err; } - if (!dir_notify_enable) - return -EINVAL; + + /* dnotify only works on directories */ inode = filp->f_path.dentry->d_inode; - if (!S_ISDIR(inode->i_mode)) - return -ENOTDIR; - dn = kmem_cache_alloc(dn_cache, GFP_KERNEL); - if (dn == NULL) - return -ENOMEM; - spin_lock(&inode->i_lock); - prev = &inode->i_dnotify; - while ((odn = *prev) != NULL) { - if ((odn->dn_owner == id) && (odn->dn_filp == filp)) { - odn->dn_fd = fd; - odn->dn_mask |= arg; - inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; - goto out_free; - } - prev = &odn->dn_next; + if (!S_ISDIR(inode->i_mode)) { + error = -ENOTDIR; + goto out_err; } - rcu_read_lock(); - f = fcheck(fd); - rcu_read_unlock(); - /* we'd lost the race with close(), sod off silently */ - /* note that inode->i_lock prevents reordering problems - * between accesses to descriptor table and ->i_dnotify */ - if (f != filp) - goto out_free; + /* expect most fcntl to add new rather than augment old */ + dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL); + if (!dn) { + error = -ENOMEM; + goto out_err; + } - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) - goto out_free; + /* new fsnotify mark, we expect most fcntl calls to add a new mark */ + new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL); + if (!new_dnentry) { + error = -ENOMEM; + goto out_err; + } - dn->dn_mask = arg; - dn->dn_fd = fd; - dn->dn_filp = filp; - dn->dn_owner = id; - inode->i_dnotify_mask |= arg & ~DN_MULTISHOT; - dn->dn_next = inode->i_dnotify; - inode->i_dnotify = dn; - spin_unlock(&inode->i_lock); - return 0; + /* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */ + mask = convert_arg(arg); -out_free: - spin_unlock(&inode->i_lock); - kmem_cache_free(dn_cache, dn); - return error; -} + /* set up the new_entry and new_dnentry */ + new_entry = &new_dnentry->fsn_entry; + fsnotify_init_mark(new_entry, dnotify_free_mark); + new_entry->mask = mask; + new_dnentry->dn = NULL; -void __inode_dir_notify(struct inode *inode, unsigned long event) -{ - struct dnotify_struct * dn; - struct dnotify_struct **prev; - struct fown_struct * fown; - int changed = 0; + /* this is needed to prevent the fcntl/close race described below */ + mutex_lock(&dnotify_mark_mutex); + /* add the new_entry or find an old one. */ spin_lock(&inode->i_lock); - prev = &inode->i_dnotify; - while ((dn = *prev) != NULL) { - if ((dn->dn_mask & event) == 0) { - prev = &dn->dn_next; - continue; - } - fown = &dn->dn_filp->f_owner; - send_sigio(fown, dn->dn_fd, POLL_MSG); - if (dn->dn_mask & DN_MULTISHOT) - prev = &dn->dn_next; - else { - *prev = dn->dn_next; - changed = 1; - kmem_cache_free(dn_cache, dn); - } - } - if (changed) - redo_inode_mask(inode); + entry = fsnotify_find_mark_entry(dnotify_group, inode); spin_unlock(&inode->i_lock); -} - -EXPORT_SYMBOL(__inode_dir_notify); + if (entry) { + dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry); + spin_lock(&entry->lock); + } else { + fsnotify_add_mark(new_entry, dnotify_group, inode); + spin_lock(&new_entry->lock); + entry = new_entry; + dnentry = new_dnentry; + /* we used new_entry, so don't free it */ + new_entry = NULL; + } -/* - * This is hopelessly wrong, but unfixable without API changes. At - * least it doesn't oops the kernel... - * - * To safely access ->d_parent we need to keep d_move away from it. Use the - * dentry's d_lock for this. - */ -void dnotify_parent(struct dentry *dentry, unsigned long event) -{ - struct dentry *parent; + rcu_read_lock(); + f = fcheck(fd); + rcu_read_unlock(); - if (!dir_notify_enable) - return; + /* if (f != filp) means that we lost a race and another task/thread + * actually closed the fd we are still playing with before we grabbed + * the dnotify_mark_mutex and entry->lock. Since closing the fd is the + * only time we clean up the mark entries we need to get our mark off + * the list. */ + if (f != filp) { + /* if we added ourselves, shoot ourselves, it's possible that + * the flush actually did shoot this entry. That's fine too + * since multiple calls to destroy_mark is perfectly safe, if + * we found a dnentry already attached to the inode, just sod + * off silently as the flush at close time dealt with it. + */ + if (dnentry == new_dnentry) + destroy = 1; + goto out; + } - spin_lock(&dentry->d_lock); - parent = dentry->d_parent; - if (parent->d_inode->i_dnotify_mask & event) { - dget(parent); - spin_unlock(&dentry->d_lock); - __inode_dir_notify(parent->d_inode, event); - dput(parent); - } else { - spin_unlock(&dentry->d_lock); + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); + if (error) { + /* if we added, we must shoot */ + if (dnentry == new_dnentry) + destroy = 1; + goto out; } + + error = attach_dn(dn, dnentry, id, fd, filp, mask); + /* !error means that we attached the dn to the dnentry, so don't free it */ + if (!error) + dn = NULL; + /* -EEXIST means that we didn't add this new dn and used an old one. + * that isn't an error (and the unused dn should be freed) */ + else if (error == -EEXIST) + error = 0; + + dnotify_recalc_inode_mask(entry); +out: + spin_unlock(&entry->lock); + + if (destroy) + fsnotify_destroy_mark_by_entry(entry); + + fsnotify_recalc_group_mask(dnotify_group); + + mutex_unlock(&dnotify_mark_mutex); + fsnotify_put_mark(entry); +out_err: + if (new_entry) + fsnotify_put_mark(new_entry); + if (dn) + kmem_cache_free(dnotify_struct_cache, dn); + return error; } -EXPORT_SYMBOL_GPL(dnotify_parent); static int __init dnotify_init(void) { - dn_cache = kmem_cache_create("dnotify_cache", - sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL); + dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC); + dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC); + + dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM, + 0, &dnotify_fsnotify_ops); + if (IS_ERR(dnotify_group)) + panic("unable to allocate fsnotify group for dnotify\n"); return 0; } -- cgit v1.1 From a2d8bc6cb4a3024661baf877242f123787d0c054 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:37 -0400 Subject: fsnotify: generic notification queue and waitq inotify needs to do asyc notification in which event information is stored on a queue until the listener is ready to receive it. This patch implements a generic notification queue for inotify (and later fanotify) to store events to be sent at a later time. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.h | 9 ++ fs/notify/group.c | 9 ++ fs/notify/notification.c | 230 +++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 241 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 83b8ec0..4dc2408 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -13,8 +13,12 @@ extern struct list_head fsnotify_groups; /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */ extern __u32 fsnotify_mask; +/* destroy all events sitting in this groups notification queue */ +extern void fsnotify_flush_notify(struct fsnotify_group *group); + /* final kfree of a group */ extern void fsnotify_final_destroy_group(struct fsnotify_group *group); + /* run the list of all marks associated with inode and flag them to be freed */ extern void fsnotify_clear_marks_by_inode(struct inode *inode); /* @@ -22,4 +26,9 @@ extern void fsnotify_clear_marks_by_inode(struct inode *inode); * about events that happen to its children. */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); + +/* allocate and destroy and event holder to attach events to notification/access queues */ +extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); +extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); + #endif /* __FS_NOTIFY_FSNOTIFY_H_ */ diff --git a/fs/notify/group.c b/fs/notify/group.c index a29d2fa..0e16771 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -91,6 +91,9 @@ static void fsnotify_get_group(struct fsnotify_group *group) */ void fsnotify_final_destroy_group(struct fsnotify_group *group) { + /* clear the notification queue of all events */ + fsnotify_flush_notify(group); + if (group->ops->free_group_priv) group->ops->free_group_priv(group); @@ -214,6 +217,12 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask, group->group_num = group_num; group->mask = mask; + mutex_init(&group->notification_mutex); + INIT_LIST_HEAD(&group->notification_list); + init_waitqueue_head(&group->notification_waitq); + group->q_len = 0; + group->max_events = UINT_MAX; + spin_lock_init(&group->mark_lock); atomic_set(&group->num_marks, 0); INIT_LIST_HEAD(&group->mark_entries); diff --git a/fs/notify/notification.c b/fs/notify/notification.c index b8e9a87..dddecc7 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -16,6 +16,21 @@ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */ +/* + * Basic idea behind the notification queue: An fsnotify group (like inotify) + * sends the userspace notification about events asyncronously some time after + * the event happened. When inotify gets an event it will need to add that + * event to the group notify queue. Since a single event might need to be on + * multiple group's notification queues we can't add the event directly to each + * queue and instead add a small "event_holder" to each queue. This event_holder + * has a pointer back to the original event. Since the majority of events are + * going to end up on one, and only one, notification queue we embed one + * event_holder into each event. This means we have a single allocation instead + * of always needing two. If the embedded event_holder is already in use by + * another group a new event_holder (from fsnotify_event_holder_cachep) will be + * allocated and used. + */ + #include #include #include @@ -33,6 +48,21 @@ #include "fsnotify.h" static struct kmem_cache *fsnotify_event_cachep; +static struct kmem_cache *fsnotify_event_holder_cachep; +/* + * This is a magic event we send when the q is too full. Since it doesn't + * hold real event information we just keep one system wide and use it any time + * it is needed. It's refcnt is set 1 at kernel init time and will never + * get set to 0 so it will never get 'freed' + */ +static struct fsnotify_event q_overflow_event; + +/* return true if the notify queue is empty, false otherwise */ +bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) +{ + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + return list_empty(&group->notification_list) ? true : false; +} void fsnotify_get_event(struct fsnotify_event *event) { @@ -52,19 +82,176 @@ void fsnotify_put_event(struct fsnotify_event *event) } } +struct fsnotify_event_holder *fsnotify_alloc_event_holder(void) +{ + return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL); +} + +void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) +{ + kmem_cache_free(fsnotify_event_holder_cachep, holder); +} + +/* + * check if 2 events contain the same information. + */ +static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) +{ + if ((old->mask == new->mask) && + (old->to_tell == new->to_tell) && + (old->data_type == new->data_type)) { + switch (old->data_type) { + case (FSNOTIFY_EVENT_INODE): + if (old->inode == new->inode) + return true; + break; + case (FSNOTIFY_EVENT_PATH): + if ((old->path.mnt == new->path.mnt) && + (old->path.dentry == new->path.dentry)) + return true; + case (FSNOTIFY_EVENT_NONE): + return true; + }; + } + return false; +} + /* - * Allocate a new event which will be sent to each group's handle_event function - * if the group was interested in this particular event. + * Add an event to the group notification queue. The group can later pull this + * event off the queue to deal with. If the event is successfully added to the + * group's notification queue, a reference is taken on event. */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_type) +int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_event_holder *holder = NULL; + struct list_head *list = &group->notification_list; + struct fsnotify_event_holder *last_holder; + struct fsnotify_event *last_event; + + /* + * There is one fsnotify_event_holder embedded inside each fsnotify_event. + * Check if we expect to be able to use that holder. If not alloc a new + * holder. + * For the overflow event it's possible that something will use the in + * event holder before we get the lock so we may need to jump back and + * alloc a new holder, this can't happen for most events... + */ + if (!list_empty(&event->holder.event_list)) { +alloc_holder: + holder = fsnotify_alloc_event_holder(); + if (!holder) + return -ENOMEM; + } + + mutex_lock(&group->notification_mutex); + + if (group->q_len >= group->max_events) + event = &q_overflow_event; + + spin_lock(&event->lock); + + if (list_empty(&event->holder.event_list)) { + if (unlikely(holder)) + fsnotify_destroy_event_holder(holder); + holder = &event->holder; + } else if (unlikely(!holder)) { + /* between the time we checked above and got the lock the in + * event holder was used, go back and get a new one */ + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + goto alloc_holder; + } + + if (!list_empty(list)) { + last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list); + last_event = last_holder->event; + if (event_compare(last_event, event)) { + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + return 0; + } + } + + group->q_len++; + holder->event = event; + + fsnotify_get_event(event); + list_add_tail(&holder->event_list, list); + spin_unlock(&event->lock); + mutex_unlock(&group->notification_mutex); + + wake_up(&group->notification_waitq); + return 0; +} + +/* + * Remove and return the first event from the notification list. There is a + * reference held on this event since it was on the list. It is the responsibility + * of the caller to drop this reference. + */ +struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) { struct fsnotify_event *event; + struct fsnotify_event_holder *holder; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); - if (!event) - return NULL; + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + + event = holder->event; + + spin_lock(&event->lock); + holder->event = NULL; + list_del_init(&holder->event_list); + spin_unlock(&event->lock); + + /* event == holder means we are referenced through the in event holder */ + if (holder != &event->holder) + fsnotify_destroy_event_holder(holder); + + group->q_len--; + + return event; +} + +/* + * This will not remove the event, that must be done with fsnotify_remove_notify_event() + */ +struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + struct fsnotify_event_holder *holder; + + BUG_ON(!mutex_is_locked(&group->notification_mutex)); + + holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list); + event = holder->event; + + return event; +} + +/* + * Called when a group is being torn down to clean up any outstanding + * event notifications. + */ +void fsnotify_flush_notify(struct fsnotify_group *group) +{ + struct fsnotify_event *event; + + mutex_lock(&group->notification_mutex); + while (!fsnotify_notify_queue_is_empty(group)) { + event = fsnotify_remove_notify_event(group); + fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ + } + mutex_unlock(&group->notification_mutex); +} + +static void initialize_event(struct fsnotify_event *event) +{ + event->holder.event = NULL; + INIT_LIST_HEAD(&event->holder.event_list); atomic_set(&event->refcnt, 1); spin_lock_init(&event->lock); @@ -72,7 +259,32 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, event->path.dentry = NULL; event->path.mnt = NULL; event->inode = NULL; + event->data_type = FSNOTIFY_EVENT_NONE; + event->to_tell = NULL; +} + +/* + * fsnotify_create_event - Allocate a new event which will be sent to each + * group's handle_event function if the group was interested in this + * particular event. + * + * @to_tell the inode which is supposed to receive the event (sometimes a + * parent of the inode to which the event happened. + * @mask what actually happened. + * @data pointer to the object which was actually affected + * @data_type flag indication if the data is a file, path, inode, nothing... + */ +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, + void *data, int data_type) +{ + struct fsnotify_event *event; + + event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + if (!event) + return NULL; + + initialize_event(event); event->to_tell = to_tell; switch (data_type) { @@ -114,6 +326,10 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, __init int fsnotify_notification_init(void) { fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC); + fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC); + + initialize_event(&q_overflow_event); + q_overflow_event.mask = FS_Q_OVERFLOW; return 0; } -- cgit v1.1 From 62ffe5dfba056f7ba81d710fee9f28c58a42fdd6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:43 -0400 Subject: fsnotify: include pathnames with entries when possible When inotify wants to send events to a directory about a child it includes the name of the original file. This patch collects that filename and makes it available for notification. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.c | 7 ++++--- fs/notify/notification.c | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 7fc7600..675129f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -114,7 +114,8 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask) * specifies these are events which came from a child. */ mask |= FS_EVENT_ON_CHILD; - fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE); + fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name); dput(parent); } @@ -131,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. */ -void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) +void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name) { struct fsnotify_group *group; struct fsnotify_event *event = NULL; @@ -156,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is) if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is); + event = fsnotify_create_event(to_tell, mask, data, data_is, file_name); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/notification.c b/fs/notify/notification.c index dddecc7..c69b18b 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -78,6 +78,7 @@ void fsnotify_put_event(struct fsnotify_event *event) if (event->data_type == FSNOTIFY_EVENT_PATH) path_put(&event->path); + kfree(event->file_name); kmem_cache_free(fsnotify_event_cachep, event); } } @@ -262,6 +263,9 @@ static void initialize_event(struct fsnotify_event *event) event->data_type = FSNOTIFY_EVENT_NONE; event->to_tell = NULL; + + event->file_name = NULL; + event->name_len = 0; } /* @@ -274,9 +278,10 @@ static void initialize_event(struct fsnotify_event *event) * @mask what actually happened. * @data pointer to the object which was actually affected * @data_type flag indication if the data is a file, path, inode, nothing... + * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_type) + void *data, int data_type, const char *name) { struct fsnotify_event *event; @@ -285,6 +290,15 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, return NULL; initialize_event(event); + + if (name) { + event->file_name = kstrdup(name, GFP_KERNEL); + if (!event->file_name) { + kmem_cache_free(fsnotify_event_cachep, event); + return NULL; + } + event->name_len = strlen(event->file_name); + } event->to_tell = to_tell; switch (data_type) { -- cgit v1.1 From 47882c6f51e8ef41fbbe2bbb746a1ea3228dd7ca Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:47 -0400 Subject: fsnotify: add correlations between events As part of the standard inotify events it includes a correlation cookie between two dentry move operations. This patch includes the same behaviour in fsnotify events. It is needed so that inotify userspace can be implemented on top of fsnotify. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/fsnotify.c | 6 +++--- fs/notify/notification.c | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 675129f..f11d75f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -115,7 +115,7 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask) mask |= FS_EVENT_ON_CHILD; fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name); + dentry->d_name.name, 0); dput(parent); } @@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. */ -void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name) +void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie) { struct fsnotify_group *group; struct fsnotify_event *event = NULL; @@ -157,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name); + event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/notification.c b/fs/notify/notification.c index c69b18b..346f6e5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,17 @@ static struct kmem_cache *fsnotify_event_holder_cachep; * get set to 0 so it will never get 'freed' */ static struct fsnotify_event q_overflow_event; +static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0); + +/** + * fsnotify_get_cookie - return a unique cookie for use in synchronizing events. + * Called from fsnotify_move, which is inlined into filesystem modules. + */ +u32 fsnotify_get_cookie(void) +{ + return atomic_inc_return(&fsnotify_sync_cookie); +} +EXPORT_SYMBOL_GPL(fsnotify_get_cookie); /* return true if the notify queue is empty, false otherwise */ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) @@ -266,6 +278,8 @@ static void initialize_event(struct fsnotify_event *event) event->file_name = NULL; event->name_len = 0; + + event->sync_cookie = 0; } /* @@ -280,8 +294,8 @@ static void initialize_event(struct fsnotify_event *event) * @data_type flag indication if the data is a file, path, inode, nothing... * @name the filename, if available */ -struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, - void *data, int data_type, const char *name) +struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, + int data_type, const char *name, u32 cookie) { struct fsnotify_event *event; @@ -299,6 +313,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, } event->name_len = strlen(event->file_name); } + + event->sync_cookie = cookie; event->to_tell = to_tell; switch (data_type) { -- cgit v1.1 From e4aff117368cfdd3567ee41844d216d079b55173 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:50 -0400 Subject: fsnotify: allow groups to add private data to events inotify needs per group information attached to events. This patch allows groups to attach private information and implements a callback so that information can be freed when an event is being destroyed. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/dnotify/dnotify.c | 1 + fs/notify/notification.c | 52 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 49 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d9d80f5..12f9e6b 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -183,6 +183,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = { .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, .freeing_mark = dnotify_freeing_mark, + .free_event_priv = NULL, }; /* diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 346f6e5..959b73e 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -90,6 +90,8 @@ void fsnotify_put_event(struct fsnotify_event *event) if (event->data_type == FSNOTIFY_EVENT_PATH) path_put(&event->path); + BUG_ON(!list_empty(&event->private_data_list)); + kfree(event->file_name); kmem_cache_free(fsnotify_event_cachep, event); } @@ -106,7 +108,29 @@ void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder) } /* - * check if 2 events contain the same information. + * Find the private data that the group previously attached to this event when + * the group added the event to the notification queue (fsnotify_add_notify_event) + */ +struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_event_private_data *lpriv; + struct fsnotify_event_private_data *priv = NULL; + + assert_spin_locked(&event->lock); + + list_for_each_entry(lpriv, &event->private_data_list, event_list) { + if (lpriv->group == group) { + priv = lpriv; + list_del(&priv->event_list); + break; + } + } + return priv; +} + +/* + * Check if 2 events contain the same information. We do not compare private data + * but at this moment that isn't a problem for any know fsnotify listeners. */ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new) { @@ -134,13 +158,17 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new * event off the queue to deal with. If the event is successfully added to the * group's notification queue, a reference is taken on event. */ -int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event) +int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event, + struct fsnotify_event_private_data *priv) { struct fsnotify_event_holder *holder = NULL; struct list_head *list = &group->notification_list; struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; + /* easy to tell if priv was attached to the event */ + INIT_LIST_HEAD(&priv->event_list); + /* * There is one fsnotify_event_holder embedded inside each fsnotify_event. * Check if we expect to be able to use that holder. If not alloc a new @@ -158,8 +186,11 @@ alloc_holder: mutex_lock(&group->notification_mutex); - if (group->q_len >= group->max_events) + if (group->q_len >= group->max_events) { event = &q_overflow_event; + /* sorry, no private data on the overflow event */ + priv = NULL; + } spin_lock(&event->lock); @@ -183,7 +214,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); if (holder != &event->holder) fsnotify_destroy_event_holder(holder); - return 0; + return -EEXIST; } } @@ -192,6 +223,8 @@ alloc_holder: fsnotify_get_event(event); list_add_tail(&holder->event_list, list); + if (priv) + list_add_tail(&priv->event_list, &event->private_data_list); spin_unlock(&event->lock); mutex_unlock(&group->notification_mutex); @@ -252,10 +285,19 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) void fsnotify_flush_notify(struct fsnotify_group *group) { struct fsnotify_event *event; + struct fsnotify_event_private_data *priv; mutex_lock(&group->notification_mutex); while (!fsnotify_notify_queue_is_empty(group)) { event = fsnotify_remove_notify_event(group); + /* if they don't implement free_event_priv they better not have attached any */ + if (group->ops->free_event_priv) { + spin_lock(&event->lock); + priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + if (priv) + group->ops->free_event_priv(priv); + } fsnotify_put_event(event); /* matches fsnotify_add_notify_event */ } mutex_unlock(&group->notification_mutex); @@ -274,6 +316,8 @@ static void initialize_event(struct fsnotify_event *event) event->inode = NULL; event->data_type = FSNOTIFY_EVENT_NONE; + INIT_LIST_HEAD(&event->private_data_list); + event->to_tell = NULL; event->file_name = NULL; -- cgit v1.1 From 1ef5f13c6c8acd3fd10db9f1743f3b4cf30a4abb Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:54 -0400 Subject: fsnotify: fsnotify marks on inodes pin them in core This patch pins any inodes with an fsnotify mark in core. The idea is that as soon as the mark is removed from the inode->fsnotify_mark_entries list the inode will be iput. In reality is doesn't quite work exactly this way. The igrab will happen when the mark is added to an inode, but the iput will happen when the inode pointer is NULL'd inside the mark. It's possible that 2 racing things will try to remove the mark from different directions. One may try to remove the mark because of an explicit request and one might try to remove it because the inode was deleted. It's possible that the removal because of inode deletion will remove the mark from the inode's list, but the removal by explicit request will actually set entry->inode == NULL; and call the iput. This is safe. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/inode_mark.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index a395348..282150f 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -204,6 +204,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) */ + iput(inode); + /* * it's possible that this group tried to destroy itself, but this * this mark was simultaneously being freed by inode. If that's the @@ -306,6 +308,10 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_mark_entry *lentry; int ret = 0; + inode = igrab(inode); + if (unlikely(!inode)) + return -EINVAL; + /* * LOCKING ORDER!!!! * entry->lock @@ -337,6 +343,7 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, if (lentry) { ret = -EEXIST; + iput(inode); fsnotify_put_mark(lentry); } else { __fsnotify_update_child_dentry_flags(inode); -- cgit v1.1 From 164bc6195139047faaf5ada1278332e99494803b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:01:58 -0400 Subject: fsnotify: handle filesystem unmounts with fsnotify marks When an fs is unmounted with an fsnotify mark entry attached to one of its inodes we need to destroy that mark entry and we also (like inotify) send an unmount event. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/inode.c | 1 + fs/notify/inode_mark.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 54c63ce..ca33701 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -407,6 +407,7 @@ int invalidate_inodes(struct super_block *sb) mutex_lock(&iprune_mutex); spin_lock(&inode_lock); inotify_unmount_inodes(&sb->s_inodes); + fsnotify_unmount_inodes(&sb->s_inodes); busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 282150f..0a499d2 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -89,6 +89,7 @@ #include #include #include +#include /* for inode_lock */ #include @@ -351,3 +352,74 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry, return ret; } + +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @list: list of inodes being unmounted (sb->s_inodes) + * + * Called with inode_lock held, protecting the unmounting super block's list + * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. + * We temporarily drop inode_lock, however, and CAN block. + */ +void fsnotify_unmount_inodes(struct list_head *list) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + struct inode *need_iput_tmp; + + /* + * We cannot __iget() an inode in state I_CLEAR, I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW)) + continue; + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) + continue; + + need_iput_tmp = need_iput; + need_iput = NULL; + + /* In case fsnotify_inode_delete() drops a reference. */ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + + /* In case the dropping of a reference would nuke next_i. */ + if ((&next_i->i_sb_list != list) && + atomic_read(&next_i->i_count) && + !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) { + __iget(next_i); + need_iput = next_i; + } + + /* + * We can safely drop inode_lock here because we hold + * references on both inode and next_i. Also no new inodes + * will be added since the umount has begun. Finally, + * iprune_mutex keeps shrink_icache_memory() away. + */ + spin_unlock(&inode_lock); + + if (need_iput_tmp) + iput(need_iput_tmp); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput(inode); + + spin_lock(&inode_lock); + } +} -- cgit v1.1 From 63c882a05416e18de6fb59f7dd6da48f3bbe8273 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 17:02:01 -0400 Subject: inotify: reimplement inotify using fsnotify Reimplement inotify_user using fsnotify. This should be feature for feature exactly the same as the original inotify_user. This does not make any changes to the in kernel inotify feature used by audit. Those patches (and the eventual removal of in kernel inotify) will come after the new inotify_user proves to be working correctly. Signed-off-by: Eric Paris Acked-by: Al Viro Cc: Christoph Hellwig --- fs/notify/inotify/Kconfig | 20 +- fs/notify/inotify/Makefile | 2 +- fs/notify/inotify/inotify.h | 21 + fs/notify/inotify/inotify_fsnotify.c | 137 ++++++ fs/notify/inotify/inotify_user.c | 837 +++++++++++++++++------------------ 5 files changed, 570 insertions(+), 447 deletions(-) create mode 100644 fs/notify/inotify/inotify.h create mode 100644 fs/notify/inotify/inotify_fsnotify.c (limited to 'fs') diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 4467928..5356884 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -1,26 +1,30 @@ config INOTIFY bool "Inotify file change notification support" - default y + default n ---help--- - Say Y here to enable inotify support. Inotify is a file change - notification system and a replacement for dnotify. Inotify fixes - numerous shortcomings in dnotify and introduces several new features - including multiple file events, one-shot support, and unmount - notification. + Say Y here to enable legacy in kernel inotify support. Inotify is a + file change notification system. It is a replacement for dnotify. + This option only provides the legacy inotify in kernel API. There + are no in tree kernel users of this interface since it is deprecated. + You only need this if you are loading an out of tree kernel module + that uses inotify. For more information, see - If unsure, say Y. + If unsure, say N. config INOTIFY_USER bool "Inotify support for userspace" - depends on INOTIFY + depends on FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the associated system calls. Inotify allows monitoring of both files and directories via a single open fd. Events are read from the file descriptor, which is also select()- and poll()-able. + Inotify fixes numerous shortcomings in dnotify and introduces several + new features including multiple file events, one-shot support, and + unmount notification. For more information, see diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile index e290f3b..9438281 100644 --- a/fs/notify/inotify/Makefile +++ b/fs/notify/inotify/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_INOTIFY) += inotify.o -obj-$(CONFIG_INOTIFY_USER) += inotify_user.o +obj-$(CONFIG_INOTIFY_USER) += inotify_fsnotify.o inotify_user.o diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h new file mode 100644 index 0000000..ea2605a --- /dev/null +++ b/fs/notify/inotify/inotify.h @@ -0,0 +1,21 @@ +#include +#include +#include /* struct kmem_cache */ + +extern struct kmem_cache *event_priv_cachep; + +struct inotify_event_private_data { + struct fsnotify_event_private_data fsnotify_event_priv_data; + int wd; +}; + +struct inotify_inode_mark_entry { + /* fsnotify_mark_entry MUST be the first thing */ + struct fsnotify_mark_entry fsn_entry; + int wd; +}; + +extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group); +extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv); + +extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c new file mode 100644 index 0000000..160da54 --- /dev/null +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -0,0 +1,137 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include /* struct inode */ +#include +#include +#include /* struct path */ +#include /* kmem_* */ +#include + +#include "inotify.h" + +static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event) +{ + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *ientry; + struct inode *to_tell; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + int wd, ret; + + to_tell = event->to_tell; + + spin_lock(&to_tell->i_lock); + entry = fsnotify_find_mark_entry(group, to_tell); + spin_unlock(&to_tell->i_lock); + /* race with watch removal? We already passes should_send */ + if (unlikely(!entry)) + return 0; + ientry = container_of(entry, struct inotify_inode_mark_entry, + fsn_entry); + wd = ientry->wd; + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + if (unlikely(!event_priv)) + return -ENOMEM; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = wd; + + ret = fsnotify_add_notify_event(group, event, fsn_event_priv); + /* EEXIST is not an error */ + if (ret == -EEXIST) + ret = 0; + + /* did event_priv get attached? */ + if (list_empty(&fsn_event_priv->event_list)) + inotify_free_event_priv(fsn_event_priv); + + /* + * If we hold the entry until after the event is on the queue + * IN_IGNORED won't be able to pass this event in the queue + */ + fsnotify_put_mark(entry); + + return ret; +} + +static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) +{ + inotify_destroy_mark_entry(entry, group); +} + +static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask) +{ + struct fsnotify_mark_entry *entry; + bool send; + + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(group, inode); + spin_unlock(&inode->i_lock); + if (!entry) + return false; + + send = (entry->mask & mask); + + /* find took a reference */ + fsnotify_put_mark(entry); + + return send; +} + +static int idr_callback(int id, void *p, void *data) +{ + BUG(); + return 0; +} + +static void inotify_free_group_priv(struct fsnotify_group *group) +{ + /* ideally the idr is empty and we won't hit the BUG in teh callback */ + idr_for_each(&group->inotify_data.idr, idr_callback, NULL); + idr_remove_all(&group->inotify_data.idr); + idr_destroy(&group->inotify_data.idr); +} + +void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) +{ + struct inotify_event_private_data *event_priv; + + + event_priv = container_of(fsn_event_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + + kmem_cache_free(event_priv_cachep, event_priv); +} + +const struct fsnotify_ops inotify_fsnotify_ops = { + .handle_event = inotify_handle_event, + .should_send_event = inotify_should_send_event, + .free_group_priv = inotify_free_group_priv, + .free_event_priv = inotify_free_event_priv, + .freeing_mark = inotify_freeing_mark, +}; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1634319..982a412 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -8,6 +8,9 @@ * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * + * Copyright (C) 2009 Eric Paris + * inotify was largely rewriten to make use of the fsnotify infrastructure + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2, or (at your option) any @@ -19,94 +22,48 @@ * General Public License for more details. */ -#include -#include -#include -#include #include -#include -#include -#include -#include -#include +#include /* struct inode */ +#include +#include +#include /* module_init */ #include +#include /* roundup() */ +#include /* superblock magic number */ +#include /* mntget */ +#include /* LOOKUP_FOLLOW */ +#include /* struct path */ +#include /* struct user */ +#include /* struct kmem_cache */ #include -#include +#include +#include +#include +#include -#include +#include "inotify.h" -static struct kmem_cache *watch_cachep __read_mostly; -static struct kmem_cache *event_cachep __read_mostly; +#include static struct vfsmount *inotify_mnt __read_mostly; +/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */ +static struct inotify_event nul_inotify_event; + /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; -static int inotify_max_user_watches __read_mostly; static int inotify_max_queued_events __read_mostly; +int inotify_max_user_watches __read_mostly; -/* - * Lock ordering: - * - * inotify_dev->up_mutex (ensures we don't re-add the same watch) - * inode->inotify_mutex (protects inode's watch list) - * inotify_handle->mutex (protects inotify_handle's watch list) - * inotify_dev->ev_mutex (protects device's event queue) - */ +static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *event_priv_cachep __read_mostly; +static struct fsnotify_event *inotify_ignored_event; /* - * Lifetimes of the main data structures: - * - * inotify_device: Lifetime is managed by reference count, from - * sys_inotify_init() until release. Additional references can bump the count - * via get_inotify_dev() and drop the count via put_inotify_dev(). - * - * inotify_user_watch: Lifetime is from create_watch() to the receipt of an - * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the - * first event, or to inotify_destroy(). + * When inotify registers a new group it increments this and uses that + * value as an offset to set the fsnotify group "name" and priority. */ - -/* - * struct inotify_device - represents an inotify instance - * - * This structure is protected by the mutex 'mutex'. - */ -struct inotify_device { - wait_queue_head_t wq; /* wait queue for i/o */ - struct mutex ev_mutex; /* protects event queue */ - struct mutex up_mutex; /* synchronizes watch updates */ - struct list_head events; /* list of queued events */ - struct user_struct *user; /* user who opened this dev */ - struct inotify_handle *ih; /* inotify handle */ - struct fasync_struct *fa; /* async notification */ - atomic_t count; /* reference count */ - unsigned int queue_size; /* size of the queue (bytes) */ - unsigned int event_count; /* number of pending events */ - unsigned int max_events; /* maximum number of events */ -}; - -/* - * struct inotify_kernel_event - An inotify event, originating from a watch and - * queued for user-space. A list of these is attached to each instance of the - * device. In read(), this list is walked and all events that can fit in the - * buffer are returned. - * - * Protected by dev->ev_mutex of the device in which we are queued. - */ -struct inotify_kernel_event { - struct inotify_event event; /* the user-space event */ - struct list_head list; /* entry in inotify_device's list */ - char *name; /* filename, if any */ -}; - -/* - * struct inotify_user_watch - our version of an inotify_watch, we add - * a reference to the associated inotify_device. - */ -struct inotify_user_watch { - struct inotify_device *dev; /* associated device */ - struct inotify_watch wdata; /* inotify watch data */ -}; +static atomic_t inotify_grp_num; #ifdef CONFIG_SYSCTL @@ -149,280 +106,36 @@ ctl_table inotify_table[] = { }; #endif /* CONFIG_SYSCTL */ -static inline void get_inotify_dev(struct inotify_device *dev) -{ - atomic_inc(&dev->count); -} - -static inline void put_inotify_dev(struct inotify_device *dev) -{ - if (atomic_dec_and_test(&dev->count)) { - atomic_dec(&dev->user->inotify_devs); - free_uid(dev->user); - kfree(dev); - } -} - -/* - * free_inotify_user_watch - cleans up the watch and its references - */ -static void free_inotify_user_watch(struct inotify_watch *w) -{ - struct inotify_user_watch *watch; - struct inotify_device *dev; - - watch = container_of(w, struct inotify_user_watch, wdata); - dev = watch->dev; - - atomic_dec(&dev->user->inotify_watches); - put_inotify_dev(dev); - kmem_cache_free(watch_cachep, watch); -} - -/* - * kernel_event - create a new kernel event with the given parameters - * - * This function can sleep. - */ -static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, - const char *name) -{ - struct inotify_kernel_event *kevent; - - kevent = kmem_cache_alloc(event_cachep, GFP_NOFS); - if (unlikely(!kevent)) - return NULL; - - /* we hand this out to user-space, so zero it just in case */ - memset(&kevent->event, 0, sizeof(struct inotify_event)); - - kevent->event.wd = wd; - kevent->event.mask = mask; - kevent->event.cookie = cookie; - - INIT_LIST_HEAD(&kevent->list); - - if (name) { - size_t len, rem, event_size = sizeof(struct inotify_event); - - /* - * We need to pad the filename so as to properly align an - * array of inotify_event structures. Because the structure is - * small and the common case is a small filename, we just round - * up to the next multiple of the structure's sizeof. This is - * simple and safe for all architectures. - */ - len = strlen(name) + 1; - rem = event_size - len; - if (len > event_size) { - rem = event_size - (len % event_size); - if (len % event_size == 0) - rem = 0; - } - - kevent->name = kmalloc(len + rem, GFP_NOFS); - if (unlikely(!kevent->name)) { - kmem_cache_free(event_cachep, kevent); - return NULL; - } - memcpy(kevent->name, name, len); - if (rem) - memset(kevent->name + len, 0, rem); - kevent->event.len = len + rem; - } else { - kevent->event.len = 0; - kevent->name = NULL; - } - - return kevent; -} - -/* - * inotify_dev_get_event - return the next event in the given dev's queue - * - * Caller must hold dev->ev_mutex. - */ -static inline struct inotify_kernel_event * -inotify_dev_get_event(struct inotify_device *dev) -{ - return list_entry(dev->events.next, struct inotify_kernel_event, list); -} - -/* - * inotify_dev_get_last_event - return the last event in the given dev's queue - * - * Caller must hold dev->ev_mutex. - */ -static inline struct inotify_kernel_event * -inotify_dev_get_last_event(struct inotify_device *dev) +static inline __u32 inotify_arg_to_mask(u32 arg) { - if (list_empty(&dev->events)) - return NULL; - return list_entry(dev->events.prev, struct inotify_kernel_event, list); -} + __u32 mask; -/* - * inotify_dev_queue_event - event handler registered with core inotify, adds - * a new event to the given device - * - * Can sleep (calls kernel_event()). - */ -static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, - u32 cookie, const char *name, - struct inode *ignored) -{ - struct inotify_user_watch *watch; - struct inotify_device *dev; - struct inotify_kernel_event *kevent, *last; + /* everything should accept their own ignored and cares about children */ + mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD); - watch = container_of(w, struct inotify_user_watch, wdata); - dev = watch->dev; + /* mask off the flags used to open the fd */ + mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT)); - mutex_lock(&dev->ev_mutex); - - /* we can safely put the watch as we don't reference it while - * generating the event - */ - if (mask & IN_IGNORED || w->mask & IN_ONESHOT) - put_inotify_watch(w); /* final put */ - - /* coalescing: drop this event if it is a dupe of the previous */ - last = inotify_dev_get_last_event(dev); - if (last && last->event.mask == mask && last->event.wd == wd && - last->event.cookie == cookie) { - const char *lastname = last->name; - - if (!name && !lastname) - goto out; - if (name && lastname && !strcmp(lastname, name)) - goto out; - } - - /* the queue overflowed and we already sent the Q_OVERFLOW event */ - if (unlikely(dev->event_count > dev->max_events)) - goto out; - - /* if the queue overflows, we need to notify user space */ - if (unlikely(dev->event_count == dev->max_events)) - kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); - else - kevent = kernel_event(wd, mask, cookie, name); - - if (unlikely(!kevent)) - goto out; - - /* queue the event and wake up anyone waiting */ - dev->event_count++; - dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; - list_add_tail(&kevent->list, &dev->events); - wake_up_interruptible(&dev->wq); - kill_fasync(&dev->fa, SIGIO, POLL_IN); - -out: - mutex_unlock(&dev->ev_mutex); -} - -/* - * remove_kevent - cleans up the given kevent - * - * Caller must hold dev->ev_mutex. - */ -static void remove_kevent(struct inotify_device *dev, - struct inotify_kernel_event *kevent) -{ - list_del(&kevent->list); - - dev->event_count--; - dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; -} - -/* - * free_kevent - frees the given kevent. - */ -static void free_kevent(struct inotify_kernel_event *kevent) -{ - kfree(kevent->name); - kmem_cache_free(event_cachep, kevent); -} - -/* - * inotify_dev_event_dequeue - destroy an event on the given device - * - * Caller must hold dev->ev_mutex. - */ -static void inotify_dev_event_dequeue(struct inotify_device *dev) -{ - if (!list_empty(&dev->events)) { - struct inotify_kernel_event *kevent; - kevent = inotify_dev_get_event(dev); - remove_kevent(dev, kevent); - free_kevent(kevent); - } -} - -/* - * find_inode - resolve a user-given path to a specific inode - */ -static int find_inode(const char __user *dirname, struct path *path, - unsigned flags) -{ - int error; - - error = user_path_at(AT_FDCWD, dirname, flags, path); - if (error) - return error; - /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); - if (error) - path_put(path); - return error; + return mask; } -/* - * create_watch - creates a watch on the given device. - * - * Callers must hold dev->up_mutex. - */ -static int create_watch(struct inotify_device *dev, struct inode *inode, - u32 mask) +static inline u32 inotify_mask_to_arg(__u32 mask) { - struct inotify_user_watch *watch; - int ret; - - if (atomic_read(&dev->user->inotify_watches) >= - inotify_max_user_watches) - return -ENOSPC; - - watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); - if (unlikely(!watch)) - return -ENOMEM; - - /* save a reference to device and bump the count to make it official */ - get_inotify_dev(dev); - watch->dev = dev; - - atomic_inc(&dev->user->inotify_watches); - - inotify_init_watch(&watch->wdata); - ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); - if (ret < 0) - free_inotify_user_watch(&watch->wdata); - - return ret; + return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED | + IN_Q_OVERFLOW); } -/* Device Interface */ - +/* intofiy userspace file descriptor functions */ static unsigned int inotify_poll(struct file *file, poll_table *wait) { - struct inotify_device *dev = file->private_data; + struct fsnotify_group *group = file->private_data; int ret = 0; - poll_wait(file, &dev->wq, wait); - mutex_lock(&dev->ev_mutex); - if (!list_empty(&dev->events)) + poll_wait(file, &group->notification_waitq, wait); + mutex_lock(&group->notification_mutex); + if (!fsnotify_notify_queue_is_empty(group)) ret = POLLIN | POLLRDNORM; - mutex_unlock(&dev->ev_mutex); + mutex_unlock(&group->notification_mutex); return ret; } @@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait) * enough to fit in "count". Return an error pointer if * not large enough. * - * Called with the device ev_mutex held. + * Called with the group->notification_mutex held. */ -static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, - size_t count) +static struct fsnotify_event *get_one_event(struct fsnotify_group *group, + size_t count) { size_t event_size = sizeof(struct inotify_event); - struct inotify_kernel_event *kevent; + struct fsnotify_event *event; - if (list_empty(&dev->events)) + if (fsnotify_notify_queue_is_empty(group)) return NULL; - kevent = inotify_dev_get_event(dev); - if (kevent->name) - event_size += kevent->event.len; + event = fsnotify_peek_notify_event(group); + + event_size += roundup(event->name_len, event_size); if (event_size > count) return ERR_PTR(-EINVAL); - remove_kevent(dev, kevent); - return kevent; + /* held the notification_mutex the whole time, so this is the + * same event we peeked above */ + fsnotify_remove_notify_event(group); + + return event; } /* @@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev, * We already checked that the event size is smaller than the * buffer we had in "get_one_event()" above. */ -static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent, +static ssize_t copy_event_to_user(struct fsnotify_group *group, + struct fsnotify_event *event, char __user *buf) { + struct inotify_event inotify_event; + struct fsnotify_event_private_data *fsn_priv; + struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); + size_t name_len; + + /* we get the inotify watch descriptor from the event private data */ + spin_lock(&event->lock); + fsn_priv = fsnotify_remove_priv_from_event(group, event); + spin_unlock(&event->lock); + + if (!fsn_priv) + inotify_event.wd = -1; + else { + priv = container_of(fsn_priv, struct inotify_event_private_data, + fsnotify_event_priv_data); + inotify_event.wd = priv->wd; + inotify_free_event_priv(fsn_priv); + } + + /* round up event->name_len so it is a multiple of event_size */ + name_len = roundup(event->name_len, event_size); + inotify_event.len = name_len; + + inotify_event.mask = inotify_mask_to_arg(event->mask); + inotify_event.cookie = event->sync_cookie; - if (copy_to_user(buf, &kevent->event, event_size)) + /* send the main event */ + if (copy_to_user(buf, &inotify_event, event_size)) return -EFAULT; - if (kevent->name) { - buf += event_size; + buf += event_size; - if (copy_to_user(buf, kevent->name, kevent->event.len)) + /* + * fsnotify only stores the pathname, so here we have to send the pathname + * and then pad that pathname out to a multiple of sizeof(inotify_event) + * with zeros. I get my zeros from the nul_inotify_event. + */ + if (name_len) { + unsigned int len_to_zero = name_len - event->name_len; + /* copy the path name */ + if (copy_to_user(buf, event->file_name, event->name_len)) return -EFAULT; + buf += event->name_len; - event_size += kevent->event.len; + /* fill userspace with 0's from nul_inotify_event */ + if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) + return -EFAULT; + buf += len_to_zero; + event_size += name_len; } + return event_size; } static ssize_t inotify_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { - struct inotify_device *dev; + struct fsnotify_group *group; + struct fsnotify_event *kevent; char __user *start; int ret; DEFINE_WAIT(wait); start = buf; - dev = file->private_data; + group = file->private_data; while (1) { - struct inotify_kernel_event *kevent; + prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE); - prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); - - mutex_lock(&dev->ev_mutex); - kevent = get_one_event(dev, count); - mutex_unlock(&dev->ev_mutex); + mutex_lock(&group->notification_mutex); + kevent = get_one_event(group, count); + mutex_unlock(&group->notification_mutex); if (kevent) { ret = PTR_ERR(kevent); if (IS_ERR(kevent)) break; - ret = copy_event_to_user(kevent, buf); - free_kevent(kevent); + ret = copy_event_to_user(group, kevent, buf); + fsnotify_put_event(kevent); if (ret < 0) break; buf += ret; @@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf, schedule(); } - finish_wait(&dev->wq, &wait); + finish_wait(&group->notification_waitq, &wait); if (start != buf && ret != -EFAULT) ret = buf - start; return ret; @@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf, static int inotify_fasync(int fd, struct file *file, int on) { - struct inotify_device *dev = file->private_data; + struct fsnotify_group *group = file->private_data; - return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO; + return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO; } static int inotify_release(struct inode *ignored, struct file *file) { - struct inotify_device *dev = file->private_data; - - inotify_destroy(dev->ih); + struct fsnotify_group *group = file->private_data; - /* destroy all of the events on this device */ - mutex_lock(&dev->ev_mutex); - while (!list_empty(&dev->events)) - inotify_dev_event_dequeue(dev); - mutex_unlock(&dev->ev_mutex); + fsnotify_clear_marks_by_group(group); - /* free this device: the put matching the get in inotify_init() */ - put_inotify_dev(dev); + /* free this group, matching get was inotify_init->fsnotify_obtain_group */ + fsnotify_put_group(group); return 0; } @@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file) static long inotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inotify_device *dev; + struct fsnotify_group *group; + struct fsnotify_event_holder *holder; + struct fsnotify_event *event; void __user *p; int ret = -ENOTTY; + size_t send_len = 0; - dev = file->private_data; + group = file->private_data; p = (void __user *) arg; switch (cmd) { case FIONREAD: - ret = put_user(dev->queue_size, (int __user *) p); + mutex_lock(&group->notification_mutex); + list_for_each_entry(holder, &group->notification_list, event_list) { + event = holder->event; + send_len += sizeof(struct inotify_event); + send_len += roundup(event->name_len, + sizeof(struct inotify_event)); + } + mutex_unlock(&group->notification_mutex); + ret = put_user(send_len, (int __user *) p); break; } @@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, } static const struct file_operations inotify_fops = { - .poll = inotify_poll, - .read = inotify_read, - .fasync = inotify_fasync, - .release = inotify_release, - .unlocked_ioctl = inotify_ioctl, + .poll = inotify_poll, + .read = inotify_read, + .fasync = inotify_fasync, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, .compat_ioctl = inotify_ioctl, }; -static const struct inotify_operations inotify_user_ops = { - .handle_event = inotify_dev_queue_event, - .destroy_watch = free_inotify_user_watch, -}; +/* + * find_inode - resolve a user-given path to a specific inode + */ +static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags) +{ + int error; + + error = user_path_at(AT_FDCWD, dirname, flags, path); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = inode_permission(path->dentry->d_inode, MAY_READ); + if (error) + path_put(path); + return error; +} + +/* + * When, for whatever reason, inotify is done with a mark (or what used to be a + * watch) we need to remove that watch from the idr and we need to send IN_IGNORED + * for the given wd. + * + * There is a bit of recursion here. The loop looks like: + * inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry -> + * inotify_freeing_mark -> inotify_destory_mark_entry -> restart + * But the loop is broken in 2 places. fsnotify_destroy_mark_by_entry sets + * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup) + * test below will not call back to fsnotify again. But even if that test wasn't + * there this would still be safe since fsnotify_destroy_mark_by_entry() is + * safe from recursion. + */ +void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) +{ + struct inotify_inode_mark_entry *ientry; + struct inotify_event_private_data *event_priv; + struct fsnotify_event_private_data *fsn_event_priv; + struct fsnotify_group *egroup; + struct idr *idr; + + spin_lock(&entry->lock); + egroup = entry->group; + + /* if egroup we aren't really done and something might still send events + * for this inode, on the callback we'll send the IN_IGNORED */ + if (egroup) { + spin_unlock(&entry->lock); + fsnotify_destroy_mark_by_entry(entry); + return; + } + spin_unlock(&entry->lock); + + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + if (unlikely(!event_priv)) + goto skip_send_ignore; + + fsn_event_priv = &event_priv->fsnotify_event_priv_data; + + fsn_event_priv->group = group; + event_priv->wd = ientry->wd; + + fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); + + /* did the private data get added? */ + if (list_empty(&fsn_event_priv->event_list)) + inotify_free_event_priv(fsn_event_priv); + +skip_send_ignore: + + /* remove this entry from the idr */ + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + idr_remove(idr, ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + + /* removed from idr, drop that reference */ + fsnotify_put_mark(entry); +} + +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark_entry *entry) +{ + struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry; + + kmem_cache_free(inotify_inode_mark_cachep, ientry); +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + struct fsnotify_mark_entry *entry = NULL; + struct inotify_inode_mark_entry *ientry; + int ret = 0; + int add = (arg & IN_MASK_ADD); + __u32 mask; + __u32 old_mask, new_mask; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!mask)) + return -EINVAL; + + ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!ientry)) + return -ENOMEM; + /* we set the mask at the end after attaching it */ + fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); + ientry->wd = 0; + +find_entry: + spin_lock(&inode->i_lock); + entry = fsnotify_find_mark_entry(group, inode); + spin_unlock(&inode->i_lock); + if (entry) { + kmem_cache_free(inotify_inode_mark_cachep, ientry); + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + } else { + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { + ret = -ENOSPC; + goto out_err; + } + + ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); + if (ret == -EEXIST) + goto find_entry; + else if (ret) + goto out_err; + + entry = &ientry->fsn_entry; +retry: + ret = -ENOMEM; + if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) + goto out_err; + + spin_lock(&group->inotify_data.idr_lock); + /* if entry is added to the idr we keep the reference obtained + * through fsnotify_mark_add. remember to drop this reference + * when entry is removed from idr */ + ret = idr_get_new_above(&group->inotify_data.idr, entry, + ++group->inotify_data.last_wd, + &ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + if (ret) { + if (ret == -EAGAIN) + goto retry; + goto out_err; + } + atomic_inc(&group->inotify_data.user->inotify_watches); + } + + spin_lock(&entry->lock); + + old_mask = entry->mask; + if (add) { + entry->mask |= mask; + new_mask = entry->mask; + } else { + entry->mask = mask; + new_mask = entry->mask; + } + + spin_unlock(&entry->lock); + + if (old_mask != new_mask) { + /* more bits in old than in new? */ + int dropped = (old_mask & ~new_mask); + /* more bits in this entry than the inode's mask? */ + int do_inode = (new_mask & ~inode->i_fsnotify_mask); + /* more bits in this entry than the group? */ + int do_group = (new_mask & ~group->mask); + + /* update the inode with this new entry */ + if (dropped || do_inode) + fsnotify_recalc_inode_mask(inode); + + /* update the group mask with the new mask */ + if (dropped || do_group) + fsnotify_recalc_group_mask(group); + } + + return ientry->wd; + +out_err: + /* see this isn't supposed to happen, just kill the watch */ + if (entry) { + fsnotify_destroy_mark_by_entry(entry); + fsnotify_put_mark(entry); + } + return ret; +} + +static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) +{ + struct fsnotify_group *group; + unsigned int grp_num; + + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num)); + group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops); + if (IS_ERR(group)) + return group; + + group->max_events = max_events; + + spin_lock_init(&group->inotify_data.idr_lock); + idr_init(&group->inotify_data.idr); + group->inotify_data.last_wd = 0; + group->inotify_data.user = user; + group->inotify_data.fa = NULL; + + return group; +} + + +/* inotify syscalls */ SYSCALL_DEFINE1(inotify_init1, int, flags) { - struct inotify_device *dev; - struct inotify_handle *ih; + struct fsnotify_group *group; struct user_struct *user; struct file *filp; int fd, ret; @@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) goto out_free_uid; } - dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); - if (unlikely(!dev)) { - ret = -ENOMEM; + /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ + group = inotify_new_group(user, inotify_max_queued_events); + if (IS_ERR(group)) { + ret = PTR_ERR(group); goto out_free_uid; } - ih = inotify_init(&inotify_user_ops); - if (IS_ERR(ih)) { - ret = PTR_ERR(ih); - goto out_free_dev; - } - dev->ih = ih; - dev->fa = NULL; - filp->f_op = &inotify_fops; filp->f_path.mnt = mntget(inotify_mnt); filp->f_path.dentry = dget(inotify_mnt->mnt_root); filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; filp->f_mode = FMODE_READ; filp->f_flags = O_RDONLY | (flags & O_NONBLOCK); - filp->private_data = dev; - - INIT_LIST_HEAD(&dev->events); - init_waitqueue_head(&dev->wq); - mutex_init(&dev->ev_mutex); - mutex_init(&dev->up_mutex); - dev->event_count = 0; - dev->queue_size = 0; - dev->max_events = inotify_max_queued_events; - dev->user = user; - atomic_set(&dev->count, 0); - - get_inotify_dev(dev); + filp->private_data = group; + atomic_inc(&user->inotify_devs); + fd_install(fd, filp); return fd; -out_free_dev: - kfree(dev); + out_free_uid: free_uid(user); put_filp(filp); @@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init) SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, u32, mask) { + struct fsnotify_group *group; struct inode *inode; - struct inotify_device *dev; struct path path; struct file *filp; int ret, fput_needed; @@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, if (mask & IN_ONLYDIR) flags |= LOOKUP_DIRECTORY; - ret = find_inode(pathname, &path, flags); - if (unlikely(ret)) + ret = inotify_find_inode(pathname, &path, flags); + if (ret) goto fput_and_out; - /* inode held in place by reference to path; dev by fget on fd */ + /* inode held in place by reference to path; group by fget on fd */ inode = path.dentry->d_inode; - dev = filp->private_data; + group = filp->private_data; - mutex_lock(&dev->up_mutex); - ret = inotify_find_update_watch(dev->ih, inode, mask); - if (ret == -ENOENT) - ret = create_watch(dev, inode, mask); - mutex_unlock(&dev->up_mutex); + /* create/update an inode mark */ + ret = inotify_update_watch(group, inode, mask); + if (unlikely(ret)) + goto path_put_and_out; +path_put_and_out: path_put(&path); fput_and_out: fput_light(filp, fput_needed); @@ -720,9 +672,10 @@ fput_and_out: SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) { + struct fsnotify_group *group; + struct fsnotify_mark_entry *entry; struct file *filp; - struct inotify_device *dev; - int ret, fput_needed; + int ret = 0, fput_needed; filp = fget_light(fd, &fput_needed); if (unlikely(!filp)) @@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd) goto out; } - dev = filp->private_data; + group = filp->private_data; - /* we free our watch data when we get IN_IGNORED */ - ret = inotify_rm_wd(dev->ih, wd); + spin_lock(&group->inotify_data.idr_lock); + entry = idr_find(&group->inotify_data.idr, wd); + if (unlikely(!entry)) { + spin_unlock(&group->inotify_data.idr_lock); + ret = -EINVAL; + goto out; + } + fsnotify_get_mark(entry); + spin_unlock(&group->inotify_data.idr_lock); + + inotify_destroy_mark_entry(entry, group); + fsnotify_put_mark(entry); out: fput_light(filp, fput_needed); @@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags, } static struct file_system_type inotify_fs_type = { - .name = "inotifyfs", - .get_sb = inotify_get_sb, - .kill_sb = kill_anon_super, + .name = "inotifyfs", + .get_sb = inotify_get_sb, + .kill_sb = kill_anon_super, }; /* @@ -775,18 +738,16 @@ static int __init inotify_user_setup(void) if (IS_ERR(inotify_mnt)) panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); + inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); + event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); + inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + if (!inotify_ignored_event) + panic("unable to allocate the inotify ignored event\n"); + inotify_max_queued_events = 16384; inotify_max_user_instances = 128; inotify_max_user_watches = 8192; - watch_cachep = kmem_cache_create("inotify_watch_cache", - sizeof(struct inotify_user_watch), - 0, SLAB_PANIC, NULL); - event_cachep = kmem_cache_create("inotify_event_cache", - sizeof(struct inotify_kernel_event), - 0, SLAB_PANIC, NULL); - return 0; } - module_init(inotify_user_setup); -- cgit v1.1 From 5ac697b793a3c45005c568df692518da6e690390 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 11:09:47 -0400 Subject: dnotify: do not use ?true:false when assigning to a bool dnotify_should send event assigned a bool using ?true:false when computing a bit operation. This is poitless and the bool type does this for us. Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 12f9e6b..5134e89 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -154,7 +154,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group, return false; spin_lock(&entry->lock); - send = (mask & entry->mask) ? true : false; + send = (mask & entry->mask); spin_unlock(&entry->lock); fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ -- cgit v1.1 From ce61856bd2aadb064f595e5c0444376a2b117c41 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 11:09:47 -0400 Subject: dnotify: do not bother to lock entry->lock when reading mask entry->lock is needed to make sure entry->mask does not change while manipulating it. In dnotify_should_send_event() we don't care if we get an old or a new mask value out of this entry so there is no point it taking the lock. Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5134e89..ec459b6 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -153,9 +153,8 @@ static bool dnotify_should_send_event(struct fsnotify_group *group, if (!entry) return false; - spin_lock(&entry->lock); send = (mask & entry->mask); - spin_unlock(&entry->lock); + fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ return send; -- cgit v1.1 From e42e27736de80045f925564ea27a1d32957219e7 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 11:09:47 -0400 Subject: inotify/dnotify: should_send_event shouldn't match on FS_EVENT_ON_CHILD inotify and dnotify will both indicate that they want any event which came from a child inode. The fix is to mask off FS_EVENT_ON_CHILD when deciding if inotify or dnotify is interested in a given event. Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 1 + fs/notify/fsnotify.c | 8 +++++--- fs/notify/inotify/inotify_fsnotify.c | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ec459b6..98a7516 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -153,6 +153,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group, if (!entry) return false; + mask = (mask & ~FS_EVENT_ON_CHILD); send = (mask & entry->mask); fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */ diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index f11d75f..ec2f7bd 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -137,14 +137,16 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const struct fsnotify_group *group; struct fsnotify_event *event = NULL; int idx; + /* global tests shouldn't care about events on child only the specific event */ + __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (list_empty(&fsnotify_groups)) return; - if (!(mask & fsnotify_mask)) + if (!(test_mask & fsnotify_mask)) return; - if (!(mask & to_tell->i_fsnotify_mask)) + if (!(test_mask & to_tell->i_fsnotify_mask)) return; /* * SRCU!! the groups list is very very much read only and the path is @@ -153,7 +155,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const */ idx = srcu_read_lock(&fsnotify_grp_srcu); list_for_each_entry_rcu(group, &fsnotify_groups, group_list) { - if (mask & group->mask) { + if (test_mask & group->mask) { if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 160da54..7ef75b8 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -95,6 +95,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode if (!entry) return false; + mask = (mask & ~FS_EVENT_ON_CHILD); send = (entry->mask & mask); /* find took a reference */ -- cgit v1.1 From a092ee20fd33d2df0990dcbf2235afc181612818 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 11 Jun 2009 11:09:48 -0400 Subject: fsnotify: allow groups to set freeing_mark to null Most fsnotify listeners (all but inotify) do not care about marks being freed. Allow groups to set freeing_mark to null and do not call any function if it is set that way. Signed-off-by: Eric Paris --- fs/notify/dnotify/dnotify.c | 8 +------- fs/notify/inode_mark.c | 3 ++- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 98a7516..828a889 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -161,12 +161,6 @@ static bool dnotify_should_send_event(struct fsnotify_group *group, return send; } -static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry, - struct fsnotify_group *group) -{ - /* dnotify doesn't care than an inode is on the way out */ -} - static void dnotify_free_mark(struct fsnotify_mark_entry *entry) { struct dnotify_mark_entry *dnentry = container_of(entry, @@ -182,7 +176,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, .should_send_event = dnotify_should_send_event, .free_group_priv = NULL, - .freeing_mark = dnotify_freeing_mark, + .freeing_mark = NULL, .free_event_priv = NULL, }; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 0a499d2..c8a07c6 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -190,7 +190,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry) * callback to the group function to let it know that this entry * is being freed. */ - group->ops->freeing_mark(entry, group); + if (group->ops->freeing_mark) + group->ops->freeing_mark(entry, group); /* * __fsnotify_update_child_dentry_flags(inode); -- cgit v1.1 From 73422811d290c628b4ddbf6830e5cd6fa42e84f1 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Sun, 10 May 2009 16:05:39 -0400 Subject: reiserfs: allow exposing privroot w/ xattrs enabled This patch adds an -oexpose_privroot option to allow access to the privroot. Signed-off-by: Jeff Mahoney Signed-off-by: Al Viro --- fs/reiserfs/dir.c | 10 ++++------ fs/reiserfs/super.c | 1 + fs/reiserfs/xattr.c | 3 ++- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45ee3d3..6d2668f 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, static inline bool is_privroot_deh(struct dentry *dir, struct reiserfs_de_head *deh) { - int ret = 0; -#ifdef CONFIG_REISERFS_FS_XATTR struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - ret = (dir == dir->d_parent && privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); -#endif - return ret; + if (reiserfs_expose_privroot(dir->d_sb)) + return 0; + return (dir == dir->d_parent && privroot->d_inode && + deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); } int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3567fb9..9dbdcfb 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -898,6 +898,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin {"conv",.setmask = 1 << REISERFS_CONVERT}, {"attrs",.setmask = 1 << REISERFS_ATTRS}, {"noattrs",.clrmask = 1 << REISERFS_ATTRS}, + {"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT}, #ifdef CONFIG_REISERFS_FS_XATTR {"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER}, {"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER}, diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8e7deb0..f3d47d8 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - s->s_root->d_op = &xattr_lookup_poison_ops; + if (!reiserfs_expose_privroot(s)) + s->s_root->d_op = &xattr_lookup_poison_ops; if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else -- cgit v1.1 From 4e44b6852e03c915618ca6776b6697b436246b00 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 11:08:56 -0400 Subject: Get rid of path_lookup in autofs4 Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 195 +++++++++++++++---------------------------------- 1 file changed, 60 insertions(+), 135 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 84168c0..f71dac9 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -192,77 +192,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, return 0; } -/* - * Walk down the mount stack looking for an autofs mount that - * has the requested device number (aka. new_encode_dev(sb->s_dev). - */ -static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) +static int find_autofs_mount(const char *pathname, + struct path *res, + int test(struct path *path, void *data), + void *data) { - struct dentry *dentry; - struct inode *inode; - struct super_block *sb; - dev_t s_dev; - unsigned int err; - + struct path path; + int err = kern_path(pathname, 0, &path); + if (err) + return err; err = -ENOENT; - - /* Lookup the dentry name at the base of our mount point */ - dentry = d_lookup(nd->path.dentry, &nd->last); - if (!dentry) - goto out; - - dput(nd->path.dentry); - nd->path.dentry = dentry; - - /* And follow the mount stack looking for our autofs mount */ - while (follow_down(&nd->path.mnt, &nd->path.dentry)) { - inode = nd->path.dentry->d_inode; - if (!inode) - break; - - sb = inode->i_sb; - s_dev = new_encode_dev(sb->s_dev); - if (devno == s_dev) { - if (sb->s_magic == AUTOFS_SUPER_MAGIC) { + while (path.dentry == path.mnt->mnt_root) { + if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (test(&path, data)) { + path_get(&path); + if (!err) /* already found some */ + path_put(res); + *res = path; err = 0; - break; } } + if (!follow_up(&path.mnt, &path.dentry)) + break; } -out: + path_put(&path); return err; } -/* - * Walk down the mount stack looking for an autofs mount that - * has the requested mount type (ie. indirect, direct or offset). - */ -static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) +static int test_by_dev(struct path *path, void *p) { - struct dentry *dentry; - struct autofs_info *ino; - unsigned int err; - - err = -ENOENT; - - /* Lookup the dentry name at the base of our mount point */ - dentry = d_lookup(nd->path.dentry, &nd->last); - if (!dentry) - goto out; - - dput(nd->path.dentry); - nd->path.dentry = dentry; + return path->mnt->mnt_sb->s_dev == *(dev_t *)p; +} - /* And follow the mount stack looking for our autofs mount */ - while (follow_down(&nd->path.mnt, &nd->path.dentry)) { - ino = autofs4_dentry_ino(nd->path.dentry); - if (ino && ino->sbi->type & type) { - err = 0; - break; - } - } -out: - return err; +static int test_by_type(struct path *path, void *p) +{ + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + return ino && ino->sbi->type & *(unsigned *)p; } static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) @@ -283,31 +248,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). */ -static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) +static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { - struct file *filp; - struct nameidata nd; int err, fd; fd = get_unused_fd(); if (likely(fd >= 0)) { - /* Get nameidata of the parent directory */ - err = path_lookup(path, LOOKUP_PARENT, &nd); + struct file *filp; + struct path path; + + err = find_autofs_mount(name, &path, test_by_dev, &devid); if (err) goto out; /* - * Search down, within the parent, looking for an - * autofs super block that has the device number + * Find autofs super block that has the device number * corresponding to the autofs fs we want to open. */ - err = autofs_dev_ioctl_find_super(&nd, devid); - if (err) { - path_put(&nd.path); - goto out; - } - filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY, + filp = dentry_open(path.dentry, path.mnt, O_RDONLY, current_cred()); if (IS_ERR(filp)) { err = PTR_ERR(filp); @@ -340,7 +299,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp, param->ioctlfd = -1; path = param->path; - devid = param->openmount.devid; + devid = new_decode_dev(param->openmount.devid); err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); @@ -475,8 +434,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_dev_ioctl *param) { struct autofs_info *ino; - struct nameidata nd; - const char *path; + struct path path; dev_t devid; int err = -ENOENT; @@ -485,32 +443,24 @@ static int autofs_dev_ioctl_requester(struct file *fp, goto out; } - path = param->path; - devid = new_encode_dev(sbi->sb->s_dev); + devid = sbi->sb->s_dev; param->requester.uid = param->requester.gid = -1; - /* Get nameidata of the parent directory */ - err = path_lookup(path, LOOKUP_PARENT, &nd); + err = find_autofs_mount(param->path, &path, test_by_dev, &devid); if (err) goto out; - err = autofs_dev_ioctl_find_super(&nd, devid); - if (err) - goto out_release; - - ino = autofs4_dentry_ino(nd.path.dentry); + ino = autofs4_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(nd.path.dentry); + autofs4_expire_wait(path.dentry); spin_lock(&sbi->fs_lock); param->requester.uid = ino->uid; param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } - -out_release: - path_put(&nd.path); + path_put(&path); out: return err; } @@ -569,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - struct nameidata nd; - const char *path; + struct path path; + const char *name; unsigned int type; unsigned int devid, magic; int err = -ENOENT; @@ -580,71 +530,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; } - path = param->path; + name = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { - if (autofs_type_any(type)) { - struct super_block *sb; - - err = path_lookup(path, LOOKUP_FOLLOW, &nd); - if (err) - goto out; - - sb = nd.path.dentry->d_sb; - devid = new_encode_dev(sb->s_dev); - } else { - struct autofs_info *ino; - - err = path_lookup(path, LOOKUP_PARENT, &nd); - if (err) - goto out; - - err = autofs_dev_ioctl_find_sbi_type(&nd, type); - if (err) - goto out_release; - - ino = autofs4_dentry_ino(nd.path.dentry); - devid = autofs4_get_dev(ino->sbi); - } - + if (autofs_type_any(type)) + err = kern_path(name, LOOKUP_FOLLOW, &path); + else + err = find_autofs_mount(name, &path, test_by_type, &type); + if (err) + goto out; + devid = new_encode_dev(path.mnt->mnt_sb->s_dev); err = 0; - if (nd.path.dentry->d_inode && - nd.path.mnt->mnt_root == nd.path.dentry) { + if (path.dentry->d_inode && + path.mnt->mnt_root == path.dentry) { err = 1; - magic = nd.path.dentry->d_inode->i_sb->s_magic; + magic = path.dentry->d_inode->i_sb->s_magic; } } else { - dev_t dev = autofs4_get_dev(sbi); + dev_t dev = sbi->sb->s_dev; - err = path_lookup(path, LOOKUP_PARENT, &nd); + err = find_autofs_mount(name, &path, test_by_dev, &dev); if (err) goto out; - err = autofs_dev_ioctl_find_super(&nd, dev); - if (err) - goto out_release; - - devid = dev; + devid = new_encode_dev(dev); - err = have_submounts(nd.path.dentry); + err = have_submounts(path.dentry); - if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { - if (follow_down(&nd.path.mnt, &nd.path.dentry)) { - struct inode *inode = nd.path.dentry->d_inode; - magic = inode->i_sb->s_magic; - } + if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { + if (follow_down(&path.mnt, &path.dentry)) + magic = path.mnt->mnt_sb->s_magic; } } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; - -out_release: - path_put(&nd.path); + path_put(&path); out: return err; } -- cgit v1.1 From 9b4a9b14a793bc69b505ed916051f6f32db13bb8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 11:44:16 -0400 Subject: Preparations to caching root in path_walk() Split do_path_lookup(), opencode the call from do_filp_open() do_filp_open() is the only caller of do_path_lookup() that cares about root afterwards (it keeps resolving symlinks on O_CREAT path after it'd done LOOKUP_PARENT walk). So when we start caching fs->root in path_walk(), it'll need a different treatment. Signed-off-by: Al Viro --- fs/namei.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c82805d..895733e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1017,9 +1017,7 @@ static int path_walk(const char *name, struct nameidata *nd) return link_path_walk(name, nd); } -/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int do_path_lookup(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; int fput_needed; @@ -1063,17 +1061,25 @@ static int do_path_lookup(int dfd, const char *name, fput_light(file, fput_needed); } + return 0; - retval = path_walk(name, nd); +fput_fail: + fput_light(file, fput_needed); +out_fail: + return retval; +} + +/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_init(dfd, name, flags, nd); + if (!retval) + retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); -out_fail: return retval; - -fput_fail: - fput_light(file, fput_needed); - goto out_fail; } int path_lookup(const char *name, unsigned int flags, @@ -1676,9 +1682,14 @@ struct file *do_filp_open(int dfd, const char *pathname, /* * Create - we need to know the parent. */ - error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); + error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); + error = path_walk(pathname, &nd); + if (error) + return ERR_PTR(error); + if (unlikely(!audit_dummy_context())) + audit_inode(pathname, nd.path.dentry); /* * We have the parent and last component. First of all, check -- cgit v1.1 From 2a737871108de9ba8930f7650d549f1383767f8b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 11:49:53 -0400 Subject: Cache root in nameidata New field: nd->root. When pathname resolution wants to know the root, check if nd->root.mnt is non-NULL; use nd->root if it is, otherwise copy current->fs->root there. After path_walk() is finished, we check if we'd got a cached value in nd->root and drop it. Before calling path_walk() we should either set nd->root.mnt to NULL *or* copy (and pin down) some path to nd->root. In the latter case we won't be looking at current->fs->root at all. Signed-off-by: Al Viro --- fs/namei.c | 53 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 895733e..88baaf2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd return result; } +static __always_inline void set_root(struct nameidata *nd) +{ + if (!nd->root.mnt) { + struct fs_struct *fs = current->fs; + read_lock(&fs->lock); + nd->root = fs->root; + path_get(&nd->root); + read_unlock(&fs->lock); + } +} + static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link) { int res = 0; @@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l goto fail; if (*link == '/') { - struct fs_struct *fs = current->fs; - + set_root(nd); path_put(&nd->path); - - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); + nd->path = nd->root; + path_get(&nd->root); } res = link_path_walk(link, nd); @@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry) static __always_inline void follow_dotdot(struct nameidata *nd) { - struct fs_struct *fs = current->fs; + set_root(nd); while(1) { struct vfsmount *parent; struct dentry *old = nd->path.dentry; - read_lock(&fs->lock); - if (nd->path.dentry == fs->root.dentry && - nd->path.mnt == fs->root.mnt) { - read_unlock(&fs->lock); + if (nd->path.dentry == nd->root.dentry && + nd->path.mnt == nd->root.mnt) { break; } - read_unlock(&fs->lock); spin_lock(&dcache_lock); if (nd->path.dentry != nd->path.mnt->mnt_root) { nd->path.dentry = dget(nd->path.dentry->d_parent); @@ -1022,18 +1026,18 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei int retval = 0; int fput_needed; struct file *file; - struct fs_struct *fs = current->fs; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags; nd->depth = 0; + nd->root.mnt = NULL; if (*name=='/') { - read_lock(&fs->lock); - nd->path = fs->root; - path_get(&fs->root); - read_unlock(&fs->lock); + set_root(nd); + nd->path = nd->root; + path_get(&nd->root); } else if (dfd == AT_FDCWD) { + struct fs_struct *fs = current->fs; read_lock(&fs->lock); nd->path = fs->pwd; path_get(&fs->pwd); @@ -1079,6 +1083,10 @@ static int do_path_lookup(int dfd, const char *name, if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } return retval; } @@ -1115,6 +1123,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->last_type = LAST_ROOT; nd->flags = flags; nd->depth = 0; + nd->root.mnt = NULL; nd->path.dentry = dentry; nd->path.mnt = mnt; @@ -1125,8 +1134,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); - return retval; + if (nd->root.mnt) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return retval; } /** @@ -1817,6 +1830,8 @@ exit: if (!IS_ERR(nd.intent.open.file)) release_open_intent(&nd); exit_parent: + if (nd.root.mnt) + path_put(&nd.root); path_put(&nd.path); return ERR_PTR(error); -- cgit v1.1 From 5b857119538daac7118c1364d7ff3613f12b84d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 11:53:49 -0400 Subject: Make vfs_path_lookup() use starting point as root Signed-off-by: Al Viro --- fs/namei.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 88baaf2..4379ef9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1123,21 +1123,20 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, nd->last_type = LAST_ROOT; nd->flags = flags; nd->depth = 0; - nd->root.mnt = NULL; nd->path.dentry = dentry; nd->path.mnt = mnt; path_get(&nd->path); + nd->root = nd->path; + path_get(&nd->root); retval = path_walk(name, nd); if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && nd->path.dentry->d_inode)) audit_inode(name, nd->path.dentry); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; - } + path_put(&nd->root); + nd->root.mnt = NULL; return retval; } -- cgit v1.1 From dd5cae6e9772ecc62fd374f7a8ec10eb51c96c4d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 12:21:18 -0400 Subject: Don't bother with check_mnt() in do_add_mount() on shrinkable ones These guys are what we add as submounts; checks for "is that attached in our namespace" are simply irrelevant for those and counterproductive for use of private vfsmount trees a-la what NFS folks want. Signed-off-by: Al Viro --- fs/namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 134d494..88a904d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1698,7 +1698,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, follow_down(&path->mnt, &path->dentry)) ; err = -EINVAL; - if (!check_mnt(path->mnt)) + if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) goto unlock; /* Refuse the same filesystem on the same mount point */ -- cgit v1.1 From 55430e2ecee574e729c12d4063b3ecabfa98fa82 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 02:04:46 -0400 Subject: nfsd struct path use: exp_get_by_name() Signed-off-by: Al Viro --- fs/nfsd/export.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5839b22..3f6d51b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -847,9 +847,8 @@ exp_get_fsid_key(svc_client *clp, int fsid) return exp_find_key(clp, FSID_NUM, fsidv, NULL); } -static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, - struct dentry *dentry, - struct cache_req *reqp) +static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, + struct cache_req *reqp) { struct svc_export *exp, key; int err; @@ -858,8 +857,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt, return ERR_PTR(-ENOENT); key.ex_client = clp; - key.ex_path.mnt = mnt; - key.ex_path.dentry = dentry; + key.ex_path = *path; exp = svc_export_lookup(&key); if (exp == NULL) @@ -877,20 +875,19 @@ static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, struct dentry *dentry, struct cache_req *reqp) { + struct path path = {.mnt = mnt, .dentry = dentry}; svc_export *exp; - dget(dentry); - exp = exp_get_by_name(clp, mnt, dentry, reqp); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { - struct dentry *parent; + dget(path.dentry); + exp = exp_get_by_name(clp, &path, reqp); - parent = dget_parent(dentry); - dput(dentry); - dentry = parent; - exp = exp_get_by_name(clp, mnt, dentry, reqp); + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path.dentry)) { + struct dentry *parent = dget_parent(path.dentry); + dput(path.dentry); + path.dentry = parent; + exp = exp_get_by_name(clp, &path, reqp); } - dput(dentry); + dput(path.dentry); return exp; } @@ -1018,7 +1015,7 @@ exp_export(struct nfsctl_export *nxp) goto out_put_clp; err = -EINVAL; - exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL); + exp = exp_get_by_name(clp, &path, NULL); memset(&new, 0, sizeof(new)); @@ -1135,7 +1132,7 @@ exp_unexport(struct nfsctl_export *nxp) goto out_domain; err = -EINVAL; - exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL); + exp = exp_get_by_name(dom, &path, NULL); path_put(&path); if (IS_ERR(exp)) goto out_domain; @@ -1207,7 +1204,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type, if (IS_ERR(ek)) return ERR_CAST(ek); - exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp); + exp = exp_get_by_name(clp, &ek->ek_path, reqp); cache_put(&ek->h, &svc_expkey_cache); if (IS_ERR(exp)) @@ -1251,12 +1248,13 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); + struct path path = {.mnt = mnt, .dentry = dentry}; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ - exp = exp_get_by_name(rqstp->rq_client, mnt, dentry, + exp = exp_get_by_name(rqstp->rq_client, &path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; @@ -1269,7 +1267,7 @@ gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; - gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry, + gssexp = exp_get_by_name(rqstp->rq_gssclient, &path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; -- cgit v1.1 From 5bf3bd2b5cb68ba43c91f5bd0ac043543fba2558 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 02:14:32 -0400 Subject: switch exp_parent() to struct path ... and lose the always-NULL last argument (non-NULL case had been split off a while ago). Signed-off-by: Al Viro --- fs/nfsd/export.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 3f6d51b..5149dab 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -871,23 +871,19 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path, /* * Find the export entry for a given dentry. */ -static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt, - struct dentry *dentry, - struct cache_req *reqp) +static struct svc_export *exp_parent(svc_client *clp, struct path *path) { - struct path path = {.mnt = mnt, .dentry = dentry}; - svc_export *exp; - - dget(path.dentry); - exp = exp_get_by_name(clp, &path, reqp); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path.dentry)) { - struct dentry *parent = dget_parent(path.dentry); - dput(path.dentry); - path.dentry = parent; - exp = exp_get_by_name(clp, &path, reqp); + struct dentry *saved = dget(path->dentry); + svc_export *exp = exp_get_by_name(clp, path, NULL); + + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = exp_get_by_name(clp, path, NULL); } - dput(path.dentry); + dput(path->dentry); + path->dentry = saved; return exp; } @@ -1174,7 +1170,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize) dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, inode->i_sb->s_id, inode->i_ino); - exp = exp_parent(clp, path.mnt, path.dentry, NULL); + exp = exp_parent(clp, &path); if (IS_ERR(exp)) { err = PTR_ERR(exp); goto out; -- cgit v1.1 From 91c9fa8f75877c0c1e455c23e8f8206c91c8f77f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 02:42:05 -0400 Subject: switch rqst_exp_get_by_name() Signed-off-by: Al Viro --- fs/nfsd/export.c | 15 ++++++--------- fs/nfsd/vfs.c | 32 ++++++++++++++++---------------- 2 files changed, 22 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 5149dab..84f5e5c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1240,18 +1240,15 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp) * use exp_get_by_name() or exp_find(). */ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt, - struct dentry *dentry) +rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); - struct path path = {.mnt = mnt, .dentry = dentry}; if (rqstp->rq_client == NULL) goto gss; /* First try the auth_unix client: */ - exp = exp_get_by_name(rqstp->rq_client, &path, - &rqstp->rq_chandle); + exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle); if (PTR_ERR(exp) == -ENOENT) goto gss; if (IS_ERR(exp)) @@ -1263,8 +1260,7 @@ gss: /* Otherwise, try falling back on gss client */ if (rqstp->rq_gssclient == NULL) return exp; - gssexp = exp_get_by_name(rqstp->rq_gssclient, &path, - &rqstp->rq_chandle); + gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle); if (PTR_ERR(gssexp) == -ENOENT) return exp; if (!IS_ERR(exp)) @@ -1307,9 +1303,10 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, struct dentry *dentry) { struct svc_export *exp; + struct path path = {.mnt = mnt, .dentry = dentry}; dget(dentry); - exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + exp = rqst_exp_get_by_name(rqstp, &path); while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { struct dentry *parent; @@ -1317,7 +1314,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, parent = dget_parent(dentry); dput(dentry); dentry = parent; - exp = rqst_exp_get_by_name(rqstp, mnt, dentry); + exp = rqst_exp_get_by_name(rqstp, &path); } dput(dentry); return exp; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index bd584bc..d84c4ea 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -101,36 +101,36 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, { struct svc_export *exp = *expp, *exp2 = NULL; struct dentry *dentry = *dpp; - struct vfsmount *mnt = mntget(exp->ex_path.mnt); - struct dentry *mounts = dget(dentry); + struct path path = {.mnt = mntget(exp->ex_path.mnt), + .dentry = dget(dentry)}; int err = 0; - while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)); + while (follow_down(&path.mnt, &path.dentry) && + d_mountpoint(path.dentry)) + ; - exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts); + exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { if (PTR_ERR(exp2) != -ENOENT) err = PTR_ERR(exp2); - dput(mounts); - mntput(mnt); + path_put(&path); goto out; } if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) { /* successfully crossed mount point */ /* - * This is subtle: dentry is *not* under mnt at this point. - * The only reason we are safe is that original mnt is pinned - * down by exp, so we should dput before putting exp. + * This is subtle: path.dentry is *not* on path.mnt + * at this point. The only reason we are safe is that + * original mnt is pinned down by exp, so we should + * put path *before* putting exp */ - dput(dentry); - *dpp = mounts; - exp_put(exp); + *dpp = path.dentry; + path.dentry = dentry; *expp = exp2; - } else { - exp_put(exp2); - dput(mounts); + exp2 = exp; } - mntput(mnt); + path_put(&path); + exp_put(exp2); out: return err; } -- cgit v1.1 From e64c390ca0b60fd2119331ef1fa888d7ea27e424 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:00:46 -0400 Subject: switch rqst_exp_parent() Signed-off-by: Al Viro --- fs/nfsd/export.c | 25 ++++++++++--------------- fs/nfsd/vfs.c | 23 ++++++++++++----------- 2 files changed, 22 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 84f5e5c..8b1f8ef 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1299,24 +1299,19 @@ gss: } struct svc_export * -rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt, - struct dentry *dentry) +rqst_exp_parent(struct svc_rqst *rqstp, struct path *path) { - struct svc_export *exp; - struct path path = {.mnt = mnt, .dentry = dentry}; - - dget(dentry); - exp = rqst_exp_get_by_name(rqstp, &path); - - while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) { - struct dentry *parent; + struct dentry *saved = dget(path->dentry); + struct svc_export *exp = rqst_exp_get_by_name(rqstp, path); - parent = dget_parent(dentry); - dput(dentry); - dentry = parent; - exp = rqst_exp_get_by_name(rqstp, &path); + while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) { + struct dentry *parent = dget_parent(path->dentry); + dput(path->dentry); + path->dentry = parent; + exp = rqst_exp_get_by_name(rqstp, path); } - dput(dentry); + dput(path->dentry); + path->dentry = saved; return exp; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d84c4ea..9f1ea31 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -169,28 +169,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, /* checking mountpoint crossing is very different when stepping up */ struct svc_export *exp2 = NULL; struct dentry *dp; - struct vfsmount *mnt = mntget(exp->ex_path.mnt); - dentry = dget(dparent); - while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry)) + struct path path = {.mnt = mntget(exp->ex_path.mnt), + .dentry = dget(dparent)}; + + while (path.dentry == path.mnt->mnt_root && + follow_up(&path.mnt, &path.dentry)) ; - dp = dget_parent(dentry); - dput(dentry); - dentry = dp; + dp = dget_parent(path.dentry); + dput(path.dentry); + path.dentry = dp; - exp2 = rqst_exp_parent(rqstp, mnt, dentry); + exp2 = rqst_exp_parent(rqstp, &path); if (PTR_ERR(exp2) == -ENOENT) { - dput(dentry); dentry = dget(dparent); } else if (IS_ERR(exp2)) { host_err = PTR_ERR(exp2); - dput(dentry); - mntput(mnt); + path_put(&path); goto out_nfserr; } else { + dentry = dget(path.dentry); exp_put(exp); exp = exp2; } - mntput(mnt); + path_put(&path); } } else { fh_lock(fhp); -- cgit v1.1 From bab77ebf51e3902f608ecf08c9d34a0a52ac35a9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:26:48 -0400 Subject: switch follow_up() to struct path Signed-off-by: Al Viro --- fs/autofs4/dev-ioctl.c | 2 +- fs/namei.c | 16 ++++++++-------- fs/nfsd/vfs.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index f71dac9..6704075 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname, err = 0; } } - if (!follow_up(&path.mnt, &path.dentry)) + if (!follow_up(&path)) break; } path_put(&path); diff --git a/fs/namei.c b/fs/namei.c index 4379ef9..8c1f48a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -675,23 +675,23 @@ loop: return err; } -int follow_up(struct vfsmount **mnt, struct dentry **dentry) +int follow_up(struct path *path) { struct vfsmount *parent; struct dentry *mountpoint; spin_lock(&vfsmount_lock); - parent=(*mnt)->mnt_parent; - if (parent == *mnt) { + parent = path->mnt->mnt_parent; + if (parent == path->mnt) { spin_unlock(&vfsmount_lock); return 0; } mntget(parent); - mountpoint=dget((*mnt)->mnt_mountpoint); + mountpoint = dget(path->mnt->mnt_mountpoint); spin_unlock(&vfsmount_lock); - dput(*dentry); - *dentry = mountpoint; - mntput(*mnt); - *mnt = parent; + dput(path->dentry); + path->dentry = mountpoint; + mntput(path->mnt); + path->mnt = parent; return 1; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 9f1ea31..7b2b3f7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -173,7 +173,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, .dentry = dget(dparent)}; while (path.dentry == path.mnt->mnt_root && - follow_up(&path.mnt, &path.dentry)) + follow_up(&path)) ; dp = dget_parent(path.dentry); dput(path.dentry); -- cgit v1.1 From 589ff870ed60a9ebdd5ec99ec3f5afe1282fe151 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 03:28:19 -0400 Subject: Switch collect_mounts() to struct path Signed-off-by: Al Viro --- fs/namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 88a904d..c859622 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1253,11 +1253,11 @@ Enomem: return NULL; } -struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *collect_mounts(struct path *path) { struct vfsmount *tree; down_write(&namespace_sem); - tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); + tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE); up_write(&namespace_sem); return tree; } -- cgit v1.1 From 9393bd07cf218ca51d0e627653f906a9d76a9131 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 13:58:15 -0400 Subject: switch follow_down() Signed-off-by: Al Viro --- fs/afs/mntpt.c | 2 +- fs/autofs/dirhash.c | 5 ++--- fs/autofs4/autofs_i.h | 6 +++--- fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/expire.c | 15 +++++++-------- fs/autofs4/root.c | 7 +++---- fs/cifs/cifs_dfs_ref.c | 2 +- fs/namei.c | 12 ++++++------ fs/namespace.c | 4 ++-- fs/nfs/namespace.c | 2 +- fs/nfsd/vfs.c | 3 +-- 11 files changed, 28 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 2b9e2d0..c52be53 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd) case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + follow_down(&nd->path)) ; err = 0; default: diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c index 4eb4d8d..2316e94 100644 --- a/fs/autofs/dirhash.c +++ b/fs/autofs/dirhash.c @@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb, } path.mnt = mnt; path_get(&path); - if (!follow_down(&path.mnt, &path.dentry)) { + if (!follow_down(&path)) { path_put(&path); DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name)); continue; } - while (d_mountpoint(path.dentry) && - follow_down(&path.mnt, &path.dentry)) + while (d_mountpoint(path.dentry) && follow_down(&path)); ; umount_ok = may_umount(path.mnt); path_put(&path); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index b7ff33c..8f7cdde 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int); void autofs4_catatonic_mode(struct autofs_sb_info *); -static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry) +static inline int autofs4_follow_mount(struct path *path) { int res = 0; - while (d_mountpoint(*dentry)) { - int followed = follow_down(mnt, dentry); + while (d_mountpoint(path->dentry)) { + int followed = follow_down(path); if (!followed) break; res = 1; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 6704075..f3da2eb 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -562,7 +562,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, err = have_submounts(path.dentry); if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) { - if (follow_down(&path.mnt, &path.dentry)) + if (follow_down(&path)) magic = path.mnt->mnt_sb->s_magic; } } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 3077d8f..aa39ae8 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry, static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; + struct path path = {.mnt = mnt, .dentry = dentry}; int status = 1; DPRINTK("dentry %p %.*s", dentry, (int)dentry->d_name.len, dentry->d_name.name); - mntget(mnt); - dget(dentry); + path_get(&path); - if (!follow_down(&mnt, &dentry)) + if (!follow_down(&path)) goto done; - if (is_autofs4_dentry(dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + if (is_autofs4_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) * Otherwise it's an offset mount and we need to check * if we can umount its mount, if there is one. */ - if (!d_mountpoint(dentry)) { + if (!d_mountpoint(path.dentry)) { status = 0; goto done; } @@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: DPRINTK("returning = %d", status); - dput(dentry); - mntput(mnt); + path_put(&path); return status; } diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index e383bf0..b96a3c5 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) nd->flags); /* * For an expire of a covered direct or offset mount we need - * to beeak out of follow_down() at the autofs mount trigger + * to break out of follow_down() at the autofs mount trigger * (d_mounted--), so we can see the expiring flag, and manage * the blocking and following here until the expire is completed. */ @@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); /* Follow down to our covering mount. */ - if (!follow_down(&nd->path.mnt, &nd->path.dentry)) + if (!follow_down(&nd->path)) goto done; goto follow; } @@ -230,8 +230,7 @@ follow: * to follow it. */ if (d_mountpoint(dentry)) { - if (!autofs4_follow_mount(&nd->path.mnt, - &nd->path.dentry)) { + if (!autofs4_follow_mount(&nd->path)) { status = -ENOENT; goto out_error; } diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 83d6275..3bb11be 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, case -EBUSY: /* someone else made a mount here whilst we were busy */ while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + follow_down(&nd->path)) ; err = 0; default: diff --git a/fs/namei.c b/fs/namei.c index 8c1f48a..4d49a3e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -731,16 +731,16 @@ static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) /* no need for dcache_lock, as serialization is taken care in * namespace.c */ -int follow_down(struct vfsmount **mnt, struct dentry **dentry) +int follow_down(struct path *path) { struct vfsmount *mounted; - mounted = lookup_mnt(*mnt, *dentry); + mounted = lookup_mnt(path->mnt, path->dentry); if (mounted) { - dput(*dentry); - mntput(*mnt); - *mnt = mounted; - *dentry = dget(mounted->mnt_root); + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); return 1; } return 0; diff --git a/fs/namespace.c b/fs/namespace.c index c859622..ba5237b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1601,7 +1601,7 @@ static int do_move_mount(struct path *path, char *old_name) down_write(&namespace_sem); while (d_mountpoint(path->dentry) && - follow_down(&path->mnt, &path->dentry)) + follow_down(path)) ; err = -EINVAL; if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) @@ -1695,7 +1695,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, down_write(&namespace_sem); /* Something was mounted here while we slept */ while (d_mountpoint(path->dentry) && - follow_down(&path->mnt, &path->dentry)) + follow_down(path)) ; err = -EINVAL; if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt)) diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 64a288e..f01caec 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -154,7 +154,7 @@ out_err: goto out; out_follow: while (d_mountpoint(nd->path.dentry) && - follow_down(&nd->path.mnt, &nd->path.dentry)) + follow_down(&nd->path)) ; err = 0; goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7b2b3f7..99f8357 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -105,8 +105,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, .dentry = dget(dentry)}; int err = 0; - while (follow_down(&path.mnt, &path.dentry) && - d_mountpoint(path.dentry)) + while (d_mountpoint(path.dentry) && follow_down(&path)) ; exp2 = rqst_exp_get_by_name(rqstp, &path); -- cgit v1.1 From 79ed0226198c628133530b179a90dbf42b1c2eba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 13:59:41 -0400 Subject: switch follow_mount() Signed-off-by: Al Viro --- fs/namei.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 4d49a3e..c006bc6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -715,16 +715,16 @@ static int __follow_mount(struct path *path) return res; } -static void follow_mount(struct vfsmount **mnt, struct dentry **dentry) +static void follow_mount(struct path *path) { - while (d_mountpoint(*dentry)) { - struct vfsmount *mounted = lookup_mnt(*mnt, *dentry); + while (d_mountpoint(path->dentry)) { + struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); if (!mounted) break; - dput(*dentry); - mntput(*mnt); - *mnt = mounted; - *dentry = dget(mounted->mnt_root); + dput(path->dentry); + mntput(path->mnt); + path->mnt = mounted; + path->dentry = dget(mounted->mnt_root); } } @@ -779,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd) mntput(nd->path.mnt); nd->path.mnt = parent; } - follow_mount(&nd->path.mnt, &nd->path.dentry); + follow_mount(&nd->path); } /* -- cgit v1.1 From 1c755af4df75996b0dd4b7e6cacaf9d57a6ef2ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2009 14:06:57 -0400 Subject: switch lookup_mnt() Signed-off-by: Al Viro --- fs/namei.c | 6 +++--- fs/namespace.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c006bc6..527119a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -702,7 +702,7 @@ static int __follow_mount(struct path *path) { int res = 0; while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); + struct vfsmount *mounted = lookup_mnt(path); if (!mounted) break; dput(path->dentry); @@ -718,7 +718,7 @@ static int __follow_mount(struct path *path) static void follow_mount(struct path *path) { while (d_mountpoint(path->dentry)) { - struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry); + struct vfsmount *mounted = lookup_mnt(path); if (!mounted) break; dput(path->dentry); @@ -735,7 +735,7 @@ int follow_down(struct path *path) { struct vfsmount *mounted; - mounted = lookup_mnt(path->mnt, path->dentry); + mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); diff --git a/fs/namespace.c b/fs/namespace.c index ba5237b..b94ad3d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -442,11 +442,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, * lookup_mnt increments the ref count before returning * the vfsmount struct. */ -struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) +struct vfsmount *lookup_mnt(struct path *path) { struct vfsmount *child_mnt; spin_lock(&vfsmount_lock); - if ((child_mnt = __lookup_mnt(mnt, dentry, 1))) + if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) mntget(child_mnt); spin_unlock(&vfsmount_lock); return child_mnt; -- cgit v1.1 From 3174c21b74b56c6a53fddd41a30fd6f757a32bd0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 7 Apr 2009 13:19:18 -0400 Subject: Move junk from proc_fs.h to fs/proc/internal.h Signed-off-by: Al Viro --- fs/proc/internal.h | 25 +++++++++++++++++++++++++ fs/proc/proc_devtree.c | 1 + 2 files changed, 26 insertions(+) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index f6db9618..753ca37 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -92,3 +92,28 @@ struct pde_opener { struct list_head lh; }; void pde_users_dec(struct proc_dir_entry *pde); + +extern spinlock_t proc_subdir_lock; + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); +unsigned long task_vsize(struct mm_struct *); +int task_statm(struct mm_struct *, int *, int *, int *, int *); +void task_mem(struct seq_file *, struct mm_struct *); + +struct proc_dir_entry *de_get(struct proc_dir_entry *de); +void de_put(struct proc_dir_entry *de); + +extern struct vfsmount *proc_mnt; +int proc_fill_super(struct super_block *); +struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *); + +/* + * These are generic /proc routines that use the internal + * "struct proc_dir_entry" tree to traverse the filesystem. + * + * The /proc root directory has extended versions to take care + * of the /proc/ subdirectories. + */ +int proc_readdir(struct file *, void *, filldir_t); +struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index de2bba5..fc6c302 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -11,6 +11,7 @@ #include #include #include +#include "internal.h" #ifndef HAVE_ARCH_DEVTREE_FIXUPS static inline void set_node_proc_entry(struct device_node *np, -- cgit v1.1 From d3ef3d7351ccfbef3e5d926efc5ee332136f40d4 Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Sun, 26 Apr 2009 20:25:54 +1000 Subject: fs: mnt_want_write speedup This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it. A microbenchmark yes, but it exercises some important paths in the mm. Before: avg = 501.9 std = 14.7773 After: avg = 462.286 std = 5.46106 (50 runs of each, stddev gives a reasonable confidence, but there is quite a bit of variation there still) It does this by removing the complex per-cpu locking and counter-cache and replaces it with a percpu counter in struct vfsmount. This makes the code much simpler, and avoids spinlocks (although the msync is still pretty costly, unfortunately). It results in about 900 bytes smaller code too. It does increase the size of a vfsmount, however. It should also give a speedup on large systems if CPUs are frequently operating on different mounts (because the existing scheme has to operate on an atomic in the struct vfsmount when switching between mounts). But I'm most interested in the single threaded path performance for the moment. [AV: minor cleanup] Cc: Dave Hansen Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/namespace.c | 268 ++++++++++++++++++++------------------------------------- 1 file changed, 91 insertions(+), 177 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b94ad3d..22ae06a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); - atomic_set(&mnt->__mnt_writers, 0); +#ifdef CONFIG_SMP + mnt->mnt_writers = alloc_percpu(int); + if (!mnt->mnt_writers) + goto out_free_devname; +#else + mnt->mnt_writers = 0; +#endif } return mnt; +#ifdef CONFIG_SMP +out_free_devname: + kfree(mnt->mnt_devname); +#endif out_free_id: mnt_free_id(mnt); out_free_cache: @@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt) } EXPORT_SYMBOL_GPL(__mnt_is_readonly); -struct mnt_writer { - /* - * If holding multiple instances of this lock, they - * must be ordered by cpu number. - */ - spinlock_t lock; - struct lock_class_key lock_class; /* compiles out with !lockdep */ - unsigned long count; - struct vfsmount *mnt; -} ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); +static inline void inc_mnt_writers(struct vfsmount *mnt) +{ +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++; +#else + mnt->mnt_writers++; +#endif +} -static int __init init_mnt_writers(void) +static inline void dec_mnt_writers(struct vfsmount *mnt) { - int cpu; - for_each_possible_cpu(cpu) { - struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); - spin_lock_init(&writer->lock); - lockdep_set_class(&writer->lock, &writer->lock_class); - writer->count = 0; - } - return 0; +#ifdef CONFIG_SMP + (*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--; +#else + mnt->mnt_writers--; +#endif } -fs_initcall(init_mnt_writers); -static void unlock_mnt_writers(void) +static unsigned int count_mnt_writers(struct vfsmount *mnt) { +#ifdef CONFIG_SMP + unsigned int count = 0; int cpu; - struct mnt_writer *cpu_writer; for_each_possible_cpu(cpu) { - cpu_writer = &per_cpu(mnt_writers, cpu); - spin_unlock(&cpu_writer->lock); + count += *per_cpu_ptr(mnt->mnt_writers, cpu); } -} -static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) -{ - if (!cpu_writer->mnt) - return; - /* - * This is in case anyone ever leaves an invalid, - * old ->mnt and a count of 0. - */ - if (!cpu_writer->count) - return; - atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); - cpu_writer->count = 0; -} - /* - * must hold cpu_writer->lock - */ -static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, - struct vfsmount *mnt) -{ - if (cpu_writer->mnt == mnt) - return; - __clear_mnt_count(cpu_writer); - cpu_writer->mnt = mnt; + return count; +#else + return mnt->mnt_writers; +#endif } /* @@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, int mnt_want_write(struct vfsmount *mnt) { int ret = 0; - struct mnt_writer *cpu_writer; - cpu_writer = &get_cpu_var(mnt_writers); - spin_lock(&cpu_writer->lock); + preempt_disable(); + inc_mnt_writers(mnt); + /* + * The store to inc_mnt_writers must be visible before we pass + * MNT_WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); + while (mnt->mnt_flags & MNT_WRITE_HOLD) + cpu_relax(); + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until + * MNT_WRITE_HOLD is cleared. + */ + smp_rmb(); if (__mnt_is_readonly(mnt)) { + dec_mnt_writers(mnt); ret = -EROFS; goto out; } - use_cpu_writer_for_mount(cpu_writer, mnt); - cpu_writer->count++; out: - spin_unlock(&cpu_writer->lock); - put_cpu_var(mnt_writers); + preempt_enable(); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); -static void lock_mnt_writers(void) -{ - int cpu; - struct mnt_writer *cpu_writer; - - for_each_possible_cpu(cpu) { - cpu_writer = &per_cpu(mnt_writers, cpu); - spin_lock(&cpu_writer->lock); - __clear_mnt_count(cpu_writer); - cpu_writer->mnt = NULL; - } -} - -/* - * These per-cpu write counts are not guaranteed to have - * matched increments and decrements on any given cpu. - * A file open()ed for write on one cpu and close()d on - * another cpu will imbalance this count. Make sure it - * does not get too far out of whack. - */ -static void handle_write_count_underflow(struct vfsmount *mnt) -{ - if (atomic_read(&mnt->__mnt_writers) >= - MNT_WRITER_UNDERFLOW_LIMIT) - return; - /* - * It isn't necessary to hold all of the locks - * at the same time, but doing it this way makes - * us share a lot more code. - */ - lock_mnt_writers(); - /* - * vfsmount_lock is for mnt_flags. - */ - spin_lock(&vfsmount_lock); - /* - * If coalescing the per-cpu writer counts did not - * get us back to a positive writer count, we have - * a bug. - */ - if ((atomic_read(&mnt->__mnt_writers) < 0) && - !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { - WARN(1, KERN_DEBUG "leak detected on mount(%p) writers " - "count: %d\n", - mnt, atomic_read(&mnt->__mnt_writers)); - /* use the flag to keep the dmesg spam down */ - mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; - } - spin_unlock(&vfsmount_lock); - unlock_mnt_writers(); -} - /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access @@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - int must_check_underflow = 0; - struct mnt_writer *cpu_writer; - - cpu_writer = &get_cpu_var(mnt_writers); - spin_lock(&cpu_writer->lock); - - use_cpu_writer_for_mount(cpu_writer, mnt); - if (cpu_writer->count > 0) { - cpu_writer->count--; - } else { - must_check_underflow = 1; - atomic_dec(&mnt->__mnt_writers); - } - - spin_unlock(&cpu_writer->lock); - /* - * Logically, we could call this each time, - * but the __mnt_writers cacheline tends to - * be cold, and makes this expensive. - */ - if (must_check_underflow) - handle_write_count_underflow(mnt); - /* - * This could be done right after the spinlock - * is taken because the spinlock keeps us on - * the cpu, and disables preemption. However, - * putting it here bounds the amount that - * __mnt_writers can underflow. Without it, - * we could theoretically wrap __mnt_writers. - */ - put_cpu_var(mnt_writers); + preempt_disable(); + dec_mnt_writers(mnt); + preempt_enable(); } EXPORT_SYMBOL_GPL(mnt_drop_write); @@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt) { int ret = 0; - lock_mnt_writers(); + spin_lock(&vfsmount_lock); + mnt->mnt_flags |= MNT_WRITE_HOLD; /* - * With all the locks held, this value is stable + * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * should be visible before we do. */ - if (atomic_read(&mnt->__mnt_writers) > 0) { - ret = -EBUSY; - goto out; - } + smp_mb(); + /* - * nobody can do a successful mnt_want_write() with all - * of the counts in MNT_DENIED_WRITE and the locks held. + * With writers on hold, if this value is zero, then there are + * definitely no active writers (although held writers may subsequently + * increment the count, they'll have to wait, and decrement it after + * seeing MNT_READONLY). + * + * It is OK to have counter incremented on one CPU and decremented on + * another: the sum will add up correctly. The danger would be when we + * sum up each counter, if we read a counter before it is incremented, + * but then read another CPU's count which it has been subsequently + * decremented from -- we would see more decrements than we should. + * MNT_WRITE_HOLD protects against this scenario, because + * mnt_want_write first increments count, then smp_mb, then spins on + * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * we're counting up here. */ - spin_lock(&vfsmount_lock); - if (!ret) + if (count_mnt_writers(mnt) > 0) + ret = -EBUSY; + else mnt->mnt_flags |= MNT_READONLY; + /* + * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * that become unheld will see MNT_READONLY. + */ + smp_wmb(); + mnt->mnt_flags &= ~MNT_WRITE_HOLD; spin_unlock(&vfsmount_lock); -out: - unlock_mnt_writers(); return ret; } @@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); mnt_free_id(mnt); +#ifdef CONFIG_SMP + free_percpu(mnt->mnt_writers); +#endif kmem_cache_free(mnt_cache, mnt); } @@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, static inline void __mntput(struct vfsmount *mnt) { - int cpu; struct super_block *sb = mnt->mnt_sb; /* - * We don't have to hold all of the locks at the - * same time here because we know that we're the - * last reference to mnt and that no new writers - * can come in. - */ - for_each_possible_cpu(cpu) { - struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - spin_lock(&cpu_writer->lock); - if (cpu_writer->mnt != mnt) { - spin_unlock(&cpu_writer->lock); - continue; - } - atomic_add(cpu_writer->count, &mnt->__mnt_writers); - cpu_writer->count = 0; - /* - * Might as well do this so that no one - * ever sees the pointer and expects - * it to be valid. - */ - cpu_writer->mnt = NULL; - spin_unlock(&cpu_writer->lock); - } - /* * This probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this * happens, the filesystem was probably unable * to make r/w->r/o transitions. */ - WARN_ON(atomic_read(&mnt->__mnt_writers)); + /* + * atomic_dec_and_lock() used to deal with ->mnt_count decrements + * provides barriers, so count_mnt_writers() below is safe. AV + */ + WARN_ON(count_mnt_writers(mnt)); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); -- cgit v1.1 From 96029c4e09ccbd73a6d0ed2b29e80bf2586ad7ef Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Sun, 26 Apr 2009 20:25:55 +1000 Subject: fs: introduce mnt_clone_write This patch speeds up lmbench lat_mmap test by about another 2% after the first patch. Before: avg = 462.286 std = 5.46106 After: avg = 453.12 std = 9.58257 (50 runs of each, stddev gives a reasonable confidence) It does this by introducing mnt_clone_write, which avoids some heavyweight operations of mnt_want_write if called on a vfsmount which we know already has a write count; and mnt_want_write_file, which can call mnt_clone_write if the file is open for write. After these two patches, mnt_want_write and mnt_drop_write go from 7% on the profile down to 1.3% (including mnt_clone_write). [AV: mnt_want_write_file() should take file alone and derive mnt from it; not only all callers have that form, but that's the only mnt about which we know that it's already held for write if file is opened for write] Cc: Dave Hansen Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/file_table.c | 2 +- fs/inode.c | 2 +- fs/namespace.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/open.c | 4 ++-- fs/xattr.c | 4 ++-- 5 files changed, 46 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 54018fe..3d66dbc 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, */ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { file_take_write(file); - error = mnt_want_write(mnt); + error = mnt_clone_write(mnt); WARN_ON(error); } return error; diff --git a/fs/inode.c b/fs/inode.c index ca33701..a88baeb 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1422,7 +1422,7 @@ void file_update_time(struct file *file) if (IS_NOCMTIME(inode)) return; - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) return; diff --git a/fs/namespace.c b/fs/namespace.c index 22ae06a..120b8a6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -265,6 +265,46 @@ out: EXPORT_SYMBOL_GPL(mnt_want_write); /** + * mnt_clone_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This is effectively like mnt_want_write, except + * it must only be used to take an extra write reference + * on a mountpoint that we already know has a write reference + * on it. This allows some optimisation. + * + * After finished, mnt_drop_write must be called as usual to + * drop the reference. + */ +int mnt_clone_write(struct vfsmount *mnt) +{ + /* superblock may be r/o */ + if (__mnt_is_readonly(mnt)) + return -EROFS; + preempt_disable(); + inc_mnt_writers(mnt); + preempt_enable(); + return 0; +} +EXPORT_SYMBOL_GPL(mnt_clone_write); + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + */ +int mnt_want_write_file(struct file *file) +{ + if (!(file->f_mode & FMODE_WRITE)) + return mnt_want_write(file->f_path.mnt); + else + return mnt_clone_write(file->f_path.mnt); +} +EXPORT_SYMBOL_GPL(mnt_want_write_file); + +/** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * diff --git a/fs/open.c b/fs/open.c index bdfbf03..7200e23 100644 --- a/fs/open.c +++ b/fs/open.c @@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode) audit_inode(NULL, dentry); - err = mnt_want_write(file->f_path.mnt); + err = mnt_want_write_file(file); if (err) goto out_putf; mutex_lock(&inode->i_mutex); @@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!file) goto out; - error = mnt_want_write(file->f_path.mnt); + error = mnt_want_write_file(file); if (error) goto out_fput; dentry = file->f_path.dentry; diff --git a/fs/xattr.c b/fs/xattr.c index d51b8f9..1c3d0af 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write(f->f_path.mnt); + error = mnt_want_write_file(f); if (!error) { error = setxattr(dentry, name, value, size, flags); mnt_drop_write(f->f_path.mnt); @@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = mnt_want_write(f->f_path.mnt); + error = mnt_want_write_file(f); if (!error) { error = removexattr(dentry, name); mnt_drop_write(f->f_path.mnt); -- cgit v1.1 From 864d7c4c068f23642efe91b33be3a84afe5f71e0 Mon Sep 17 00:00:00 2001 From: "npiggin@suse.de" Date: Sun, 26 Apr 2009 20:25:56 +1000 Subject: fs: move mark_files_ro into file_table.c This function walks the s_files lock, and operates primarily on the files in a superblock, so it better belongs here (eg. see also fs_may_remount_ro). [AV: ... and it shouldn't be static after that move] Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/file_table.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/internal.h | 5 +++++ fs/super.c | 39 --------------------------------------- 3 files changed, 43 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 3d66dbc..334ce39 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -399,6 +399,44 @@ too_bad: return 0; } +/** + * mark_files_ro - mark all files read-only + * @sb: superblock in question + * + * All files are marked read-only. We don't care about pending + * delete files so this should be used in 'force' mode only. + */ +void mark_files_ro(struct super_block *sb) +{ + struct file *f; + +retry: + file_list_lock(); + list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + file_list_unlock(); + /* + * This can sleep, so we can't hold + * the file_list_lock() spinlock. + */ + mnt_drop_write(mnt); + mntput(mnt); + goto retry; + } + file_list_unlock(); +} + void __init files_init(unsigned long mempages) { int n; diff --git a/fs/internal.h b/fs/internal.h index b4dac4f..6d4ef20 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -66,3 +66,8 @@ extern void __init mnt_init(void); * fs_struct.c */ extern void chroot_fs_refs(struct path *, struct path *); + +/* + * file_table.c + */ +extern void mark_files_ro(struct super_block *); diff --git a/fs/super.c b/fs/super.c index 1943fdf..c170551 100644 --- a/fs/super.c +++ b/fs/super.c @@ -616,45 +616,6 @@ out: } /** - * mark_files_ro - mark all files read-only - * @sb: superblock in question - * - * All files are marked read-only. We don't care about pending - * delete files so this should be used in 'force' mode only. - */ - -static void mark_files_ro(struct super_block *sb) -{ - struct file *f; - -retry: - file_list_lock(); - list_for_each_entry(f, &sb->s_files, f_u.fu_list) { - struct vfsmount *mnt; - if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) - continue; - if (!file_count(f)) - continue; - if (!(f->f_mode & FMODE_WRITE)) - continue; - f->f_mode &= ~FMODE_WRITE; - if (file_check_writeable(f) != 0) - continue; - file_release_write(f); - mnt = mntget(f->f_path.mnt); - file_list_unlock(); - /* - * This can sleep, so we can't hold - * the file_list_lock() spinlock. - */ - mnt_drop_write(mnt); - mntput(mnt); - goto retry; - } - file_list_unlock(); -} - -/** * do_remount_sb - asks filesystem to change mount options. * @sb: superblock in question * @flags: numeric part of options -- cgit v1.1 From 876a9f76abbcb775f8d21cbc99fa161f9e5937f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2009 18:05:55 +0200 Subject: remove s_async_list Remove the unused s_async_list in the superblock, a leftover of the broken async inode deletion code that leaked into mainline. Having this in the middle of the sync/unmount path is not helpful for the following cleanups. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/super.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index c170551..3d9f117 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include "internal.h" @@ -72,7 +71,6 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); - INIT_LIST_HEAD(&s->s_async_list); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -342,11 +340,6 @@ void generic_shutdown_super(struct super_block *sb) lock_super(sb); sb->s_flags &= ~MS_ACTIVE; - /* - * wait for asynchronous fs operations to finish before going further - */ - async_synchronize_full_domain(&sb->s_async_list); - /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); lock_kernel(); @@ -517,7 +510,6 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - async_synchronize_full_domain(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); -- cgit v1.1 From 5a3e5cb8e08bd876e2542c1451c9a93dab1b0e39 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:48 +0200 Subject: vfs: Fix sys_sync() and fsync_super() reliability (version 4) So far, do_sync() called: sync_inodes(0); sync_supers(); sync_filesystems(0); sync_filesystems(1); sync_inodes(1); This ordering makes it kind of hard for filesystems as sync_inodes(0) need not submit all the IO (for example it skips inodes with I_SYNC set) so e.g. forcing transaction to disk in ->sync_fs() is not really enough. Therefore sys_sync has not been completely reliable on some filesystems (ext3, ext4, reiserfs, ocfs2 and others are hit by this) when racing e.g. with background writeback. A similar problem hits also other filesystems (e.g. ext2) because of write_supers() being called before the sync_inodes(1). Change the ordering of calls in do_sync() - this requires a new function sync_blockdevs() to preserve the property that block devices are always synced after write_super() / sync_fs() call. The same issue is fixed in __fsync_super() function used on umount / remount read-only. [AV: build fixes] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/internal.h | 9 +++++++++ fs/super.c | 29 ++++++++++++++++++++++++++++- fs/sync.c | 4 +++- 3 files changed, 40 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index 6d4ef20..343a537 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -71,3 +71,12 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); + +/* + * super.c + */ +#ifdef CONFIG_BLOCK +extern void sync_blockdevs(void); +#else +static inline void sync_blockdevs(void) { } +#endif diff --git a/fs/super.c b/fs/super.c index 3d9f117..18d159d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -293,6 +293,7 @@ void __fsync_super(struct super_block *sb) { sync_inodes_sb(sb, 0); vfs_dq_sync(sb); + sync_inodes_sb(sb, 1); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); @@ -300,7 +301,6 @@ void __fsync_super(struct super_block *sb) if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); sync_blockdev(sb->s_bdev); - sync_inodes_sb(sb, 1); } /* @@ -522,6 +522,33 @@ restart: mutex_unlock(&mutex); } +#ifdef CONFIG_BLOCK +/* + * Sync all block devices underlying some superblock + */ +void sync_blockdevs(void) +{ + struct super_block *sb; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_bdev) + continue; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + sync_blockdev(sb->s_bdev); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); +} +#endif + /** * get_super - get the superblock of a device * @bdev: device to get the superblock for diff --git a/fs/sync.c b/fs/sync.c index 7abc65f..631fd5a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -13,6 +13,7 @@ #include #include #include +#include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) @@ -26,10 +27,11 @@ static void do_sync(unsigned long wait) wakeup_pdflush(0); sync_inodes(0); /* All mappings, inodes and their blockdevs */ vfs_dq_sync(NULL); + sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ sync_supers(); /* Write the superblocks */ sync_filesystems(0); /* Start syncing the filesystems */ sync_filesystems(wait); /* Waitingly sync the filesystems */ - sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ + sync_blockdevs(); if (!wait) printk("Emergency Sync complete\n"); if (unlikely(laptop_mode)) -- cgit v1.1 From bfe881255c74800147523b59c85328a1a826ba21 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:49 +0200 Subject: vfs: Call ->sync_fs() even if s_dirt is 0 (version 4) sync_filesystems() has a condition that if wait == 0 and s_dirt == 0, then ->sync_fs() isn't called. This does not really make much sence since s_dirt is generally used by a filesystem to mean that ->write_super() needs to be called. But ->sync_fs() does different things. I even suspect that some filesystems (btrfs?) sets s_dirt just to fool this logic. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 18d159d..fae91ba 100644 --- a/fs/super.c +++ b/fs/super.c @@ -510,7 +510,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && (wait || sb->s_dirt)) + if (sb->s_root) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); /* restart only when sb is no longer on the list */ -- cgit v1.1 From 429479f031322a0cc5c921ffb2321a51718dc875 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:50 +0200 Subject: vfs: Make __fsync_super() a static function (version 4) __fsync_super() does the same thing as fsync_super(). So change the only caller to use fsync_super() and make __fsync_super() static. This removes unnecessarily duplicated call to sync_blockdev() and prepares ground for the changes to __fsync_super() in the following patches. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/block_dev.c | 2 +- fs/super.c | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 931f6b8..fe47f72 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -241,7 +241,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); - __fsync_super(sb); + fsync_super(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); diff --git a/fs/super.c b/fs/super.c index fae91ba..8dbe1ea 100644 --- a/fs/super.c +++ b/fs/super.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL(unlock_super); * device. Takes the superblock lock. Requires a second blkdev * flush by the caller to complete the operation. */ -void __fsync_super(struct super_block *sb) +static int __fsync_super(struct super_block *sb) { sync_inodes_sb(sb, 0); vfs_dq_sync(sb); @@ -300,7 +300,7 @@ void __fsync_super(struct super_block *sb) unlock_super(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, 1); - sync_blockdev(sb->s_bdev); + return sync_blockdev(sb->s_bdev); } /* @@ -310,8 +310,7 @@ void __fsync_super(struct super_block *sb) */ int fsync_super(struct super_block *sb) { - __fsync_super(sb); - return sync_blockdev(sb->s_bdev); + return __fsync_super(sb); } EXPORT_SYMBOL_GPL(fsync_super); -- cgit v1.1 From 5cee5815d1564bbbd505fea86f4550f1efdb5cd0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:51 +0200 Subject: vfs: Make sys_sync() use fsync_super() (version 4) It is unnecessarily fragile to have two places (fsync_super() and do_sync()) doing data integrity sync of the filesystem. Alter __fsync_super() to accommodate needs of both callers and use it. So after this patch __fsync_super() is the only place where we gather all the calls needed to properly send all data on a filesystem to disk. Nice bonus is that we get a complete livelock avoidance and write_supers() is now only used for periodic writeback of superblocks. sync_blockdevs() introduced a couple of patches ago is gone now. [build fixes folded] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/block_dev.c | 15 ++++++++---- fs/fs-writeback.c | 49 ------------------------------------- fs/internal.h | 16 ++++++------- fs/super.c | 72 ++++++++++++++++++------------------------------------- fs/sync.c | 31 ++++++++---------------- 5 files changed, 50 insertions(+), 133 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index fe47f72..4b6a3b9 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } +int __sync_blockdev(struct block_device *bdev, int wait) +{ + if (!bdev) + return 0; + if (!wait) + return filemap_flush(bdev->bd_inode->i_mapping); + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} + /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; + return __sync_blockdev(bdev, 1); } EXPORT_SYMBOL(sync_blockdev); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 91013ff..e0fb2e7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -679,55 +679,6 @@ void sync_inodes_sb(struct super_block *sb, int wait) } /** - * sync_inodes - writes all inodes to disk - * @wait: wait for completion - * - * sync_inodes() goes through each super block's dirty inode list, writes the - * inodes out, waits on the writeout and puts the inodes back on the normal - * list. - * - * This is for sys_sync(). fsync_dev() uses the same algorithm. The subtle - * part of the sync functions is that the blockdev "superblock" is processed - * last. This is because the write_inode() function of a typical fs will - * perform no I/O, but will mark buffers in the blockdev mapping as dirty. - * What we want to do is to perform all that dirtying first, and then write - * back all those inode blocks via the blockdev mapping in one sweep. So the - * additional (somewhat redundant) sync_blockdev() calls here are to make - * sure that really happens. Because if we call sync_inodes_sb(wait=1) with - * outstanding dirty inodes, the writeback goes block-at-a-time within the - * filesystem's write_inode(). This is extremely slow. - */ -static void __sync_inodes(int wait) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) { - sync_inodes_sb(sb, wait); - sync_blockdev(sb->s_bdev); - } - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} - -void sync_inodes(int wait) -{ - __sync_inodes(0); - - if (wait) - __sync_inodes(1); -} - -/** * write_inode_now - write an inode to disk * @inode: inode to write to disk * @sync: whether the write should be synchronous or not diff --git a/fs/internal.h b/fs/internal.h index 343a537..dbec3cc 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) return sb == blockdev_superblock; } +extern int __sync_blockdev(struct block_device *bdev, int wait); + #else static inline void bdev_cache_init(void) { @@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb) { return 0; } + +static inline int __sync_blockdev(struct block_device *bdev, int wait) +{ + return 0; +} #endif /* @@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); - -/* - * super.c - */ -#ifdef CONFIG_BLOCK -extern void sync_blockdevs(void); -#else -static inline void sync_blockdevs(void) { } -#endif diff --git a/fs/super.c b/fs/super.c index 8dbe1ea..c8ce5ed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super); EXPORT_SYMBOL(unlock_super); /* - * Write out and wait upon all dirty data associated with this - * superblock. Filesystem data as well as the underlying block - * device. Takes the superblock lock. Requires a second blkdev - * flush by the caller to complete the operation. + * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) + * just dirties buffers with inodes so we have to submit IO for these buffers + * via __sync_blockdev(). This also speeds up the wait == 1 case since in that + * case write_inode() functions do sync_dirty_buffer() and thus effectively + * write one block at a time. */ -static int __fsync_super(struct super_block *sb) +static int __fsync_super(struct super_block *sb, int wait) { - sync_inodes_sb(sb, 0); vfs_dq_sync(sb); - sync_inodes_sb(sb, 1); + sync_inodes_sb(sb, wait); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); unlock_super(sb); if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, 1); - return sync_blockdev(sb->s_bdev); + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); } /* @@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb) */ int fsync_super(struct super_block *sb) { - return __fsync_super(sb); + int ret; + + ret = __fsync_super(sb, 0); + if (ret < 0) + return ret; + return __fsync_super(sb, 1); } EXPORT_SYMBOL_GPL(fsync_super); @@ -469,20 +474,18 @@ restart: } /* - * Call the ->sync_fs super_op against all filesystems which are r/w and - * which implement it. + * Sync all the data for all the filesystems (called by sys_sync() and + * emergency sync) * * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync_fs + * if two or more filesystems are being continuously dirtied. s_need_sync * is used only here. We set it against all filesystems and then clear it as * we sync them. So redirtied filesystems are skipped. * * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync_fs + * calls sync_filesystems as well, process B will set all the s_need_sync * flags again, which will cause process A to resync everything. Fix that with * a local mutex. - * - * (Fabian) Avoid sync_fs with clean fs & wait mode 0 */ void sync_filesystems(int wait) { @@ -492,25 +495,23 @@ void sync_filesystems(int wait) mutex_lock(&mutex); /* Could be down_interruptible */ spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_op->sync_fs) - continue; if (sb->s_flags & MS_RDONLY) continue; - sb->s_need_sync_fs = 1; + sb->s_need_sync = 1; } restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync_fs) + if (!sb->s_need_sync) continue; - sb->s_need_sync_fs = 0; + sb->s_need_sync = 0; if (sb->s_flags & MS_RDONLY) continue; /* hm. Was remounted r/o meanwhile */ sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root) - sb->s_op->sync_fs(sb, wait); + __fsync_super(sb, wait); up_read(&sb->s_umount); /* restart only when sb is no longer on the list */ spin_lock(&sb_lock); @@ -521,33 +522,6 @@ restart: mutex_unlock(&mutex); } -#ifdef CONFIG_BLOCK -/* - * Sync all block devices underlying some superblock - */ -void sync_blockdevs(void) -{ - struct super_block *sb; - - spin_lock(&sb_lock); -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_bdev) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - sync_blockdev(sb->s_bdev); - up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); -} -#endif - /** * get_super - get the superblock of a device * @bdev: device to get the superblock for diff --git a/fs/sync.c b/fs/sync.c index 631fd5a..be0798c 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -18,35 +18,24 @@ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) -/* - * sync everything. Start out by waking pdflush, because that writes back - * all queues in parallel. - */ -static void do_sync(unsigned long wait) +SYSCALL_DEFINE0(sync) { - wakeup_pdflush(0); - sync_inodes(0); /* All mappings, inodes and their blockdevs */ - vfs_dq_sync(NULL); - sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ - sync_supers(); /* Write the superblocks */ - sync_filesystems(0); /* Start syncing the filesystems */ - sync_filesystems(wait); /* Waitingly sync the filesystems */ - sync_blockdevs(); - if (!wait) - printk("Emergency Sync complete\n"); + sync_filesystems(0); + sync_filesystems(1); if (unlikely(laptop_mode)) laptop_sync_completion(); -} - -SYSCALL_DEFINE0(sync) -{ - do_sync(1); return 0; } static void do_sync_work(struct work_struct *work) { - do_sync(0); + /* + * Sync twice to reduce the possibility we skipped some inodes / pages + * because they were temporarily locked + */ + sync_filesystems(0); + sync_filesystems(0); + printk("Emergency Sync complete\n"); kfree(work); } -- cgit v1.1 From c15c54f5f056ee4819da9fde59a5f2cd45445f23 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:52 +0200 Subject: vfs: Move syncing code from super.c to sync.c (version 4) Move sync_filesystems(), __fsync_super(), fsync_super() from super.c to sync.c where it fits better. [build fixes folded] Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/super.c | 85 -------------------------------------------------------------- fs/sync.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 85 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index c8ce5ed..f822c74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -283,42 +283,6 @@ void unlock_super(struct super_block * sb) EXPORT_SYMBOL(lock_super); EXPORT_SYMBOL(unlock_super); -/* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. - */ -static int __fsync_super(struct super_block *sb, int wait) -{ - vfs_dq_sync(sb); - sync_inodes_sb(sb, wait); - lock_super(sb); - if (sb->s_dirt && sb->s_op->write_super) - sb->s_op->write_super(sb); - unlock_super(sb); - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* - * Write out and wait upon all dirty data associated with this - * superblock. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_super(struct super_block *sb) -{ - int ret; - - ret = __fsync_super(sb, 0); - if (ret < 0) - return ret; - return __fsync_super(sb, 1); -} -EXPORT_SYMBOL_GPL(fsync_super); - /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill @@ -473,55 +437,6 @@ restart: spin_unlock(&sb_lock); } -/* - * Sync all the data for all the filesystems (called by sys_sync() and - * emergency sync) - * - * This operation is careful to avoid the livelock which could easily happen - * if two or more filesystems are being continuously dirtied. s_need_sync - * is used only here. We set it against all filesystems and then clear it as - * we sync them. So redirtied filesystems are skipped. - * - * But if process A is currently running sync_filesystems and then process B - * calls sync_filesystems as well, process B will set all the s_need_sync - * flags again, which will cause process A to resync everything. Fix that with - * a local mutex. - */ -void sync_filesystems(int wait) -{ - struct super_block *sb; - static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); /* Could be down_interruptible */ - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_flags & MS_RDONLY) - continue; - sb->s_need_sync = 1; - } - -restart: - list_for_each_entry(sb, &super_blocks, s_list) { - if (!sb->s_need_sync) - continue; - sb->s_need_sync = 0; - if (sb->s_flags & MS_RDONLY) - continue; /* hm. Was remounted r/o meanwhile */ - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root) - __fsync_super(sb, wait); - up_read(&sb->s_umount); - /* restart only when sb is no longer on the list */ - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; - } - spin_unlock(&sb_lock); - mutex_unlock(&mutex); -} - /** * get_super - get the superblock of a device * @bdev: device to get the superblock for diff --git a/fs/sync.c b/fs/sync.c index be0798c..d5fa7b7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -18,6 +18,91 @@ #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ SYNC_FILE_RANGE_WAIT_AFTER) +/* + * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) + * just dirties buffers with inodes so we have to submit IO for these buffers + * via __sync_blockdev(). This also speeds up the wait == 1 case since in that + * case write_inode() functions do sync_dirty_buffer() and thus effectively + * write one block at a time. + */ +static int __fsync_super(struct super_block *sb, int wait) +{ + vfs_dq_sync(sb); + sync_inodes_sb(sb, wait); + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, wait); + return __sync_blockdev(sb->s_bdev, wait); +} + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_super(struct super_block *sb) +{ + int ret; + + ret = __fsync_super(sb, 0); + if (ret < 0) + return ret; + return __fsync_super(sb, 1); +} +EXPORT_SYMBOL_GPL(fsync_super); + +/* + * Sync all the data for all the filesystems (called by sys_sync() and + * emergency sync) + * + * This operation is careful to avoid the livelock which could easily happen + * if two or more filesystems are being continuously dirtied. s_need_sync + * is used only here. We set it against all filesystems and then clear it as + * we sync them. So redirtied filesystems are skipped. + * + * But if process A is currently running sync_filesystems and then process B + * calls sync_filesystems as well, process B will set all the s_need_sync + * flags again, which will cause process A to resync everything. Fix that with + * a local mutex. + */ +static void sync_filesystems(int wait) +{ + struct super_block *sb; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); /* Could be down_interruptible */ + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_flags & MS_RDONLY) + continue; + sb->s_need_sync = 1; + } + +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + if (!sb->s_need_sync) + continue; + sb->s_need_sync = 0; + if (sb->s_flags & MS_RDONLY) + continue; /* hm. Was remounted r/o meanwhile */ + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + if (sb->s_root) + __fsync_super(sb, wait); + up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + mutex_unlock(&mutex); +} + SYSCALL_DEFINE0(sync) { sync_filesystems(0); -- cgit v1.1 From 60b0680fa236ac4e17ce31a50048c9d75f9ec831 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:53 +0200 Subject: vfs: Rename fsync_super() to sync_filesystem() (version 4) Rename the function so that it better describe what it really does. Also remove the unnecessary include of buffer_head.h. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/block_dev.c | 4 ++-- fs/cachefiles/interface.c | 2 +- fs/super.c | 5 ++--- fs/sync.c | 12 ++++++------ 4 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 4b6a3b9..3a6d4fb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,7 +204,7 @@ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { - int res = fsync_super(sb); + int res = sync_filesystem(sb); drop_super(sb); return res; } @@ -246,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev) sb->s_frozen = SB_FREEZE_WRITE; smp_wmb(); - fsync_super(sb); + sync_filesystem(sb); sb->s_frozen = SB_FREEZE_TRANS; smp_wmb(); diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 1e96234..dafd484 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -354,7 +354,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disc */ cachefiles_begin_secure(cache, &saved_cred); - ret = fsync_super(cache->mnt->mnt_sb); + ret = sync_filesystem(cache->mnt->mnt_sb); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) diff --git a/fs/super.c b/fs/super.c index f822c74..721236e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -28,7 +28,6 @@ #include #include #include -#include /* for fsync_super() */ #include #include #include @@ -304,7 +303,7 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); - fsync_super(sb); + sync_filesystem(sb); lock_super(sb); sb->s_flags &= ~MS_ACTIVE; @@ -543,7 +542,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - fsync_super(sb); + sync_filesystem(sb); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ diff --git a/fs/sync.c b/fs/sync.c index d5fa7b7..8aa870a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -25,7 +25,7 @@ * case write_inode() functions do sync_dirty_buffer() and thus effectively * write one block at a time. */ -static int __fsync_super(struct super_block *sb, int wait) +static int __sync_filesystem(struct super_block *sb, int wait) { vfs_dq_sync(sb); sync_inodes_sb(sb, wait); @@ -43,16 +43,16 @@ static int __fsync_super(struct super_block *sb, int wait) * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ -int fsync_super(struct super_block *sb) +int sync_filesystem(struct super_block *sb) { int ret; - ret = __fsync_super(sb, 0); + ret = __sync_filesystem(sb, 0); if (ret < 0) return ret; - return __fsync_super(sb, 1); + return __sync_filesystem(sb, 1); } -EXPORT_SYMBOL_GPL(fsync_super); +EXPORT_SYMBOL_GPL(sync_filesystem); /* * Sync all the data for all the filesystems (called by sys_sync() and @@ -92,7 +92,7 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root) - __fsync_super(sb, wait); + __sync_filesystem(sb, wait); up_read(&sb->s_umount); /* restart only when sb is no longer on the list */ spin_lock(&sb_lock); -- cgit v1.1 From 850b201b087f5525a0a7278551c2bcd0423c3b26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 16:43:54 +0200 Subject: quota: cleanup dquota sync functions (version 4) Currently the VFS calls vfs_dq_sync to sync out disk quotas for a given superblock. This is a small wrapper around sync_dquots which for the case of a non-NULL superblock is a small wrapper around quota_sync_sb. Just make quota_sync_sb global (rename it to sync_quota_sb) and call it directly. Also call it directly for those cases in quota.c that have a superblock and leave sync_dquots purely an iterator over sync_quota_sb and remove it's superblock argument. To make this nicer move the check for the lack of a quota_sync method from the callers into sync_quota_sb. [folded build fix from Alexander Beregalov ] Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/quota/quota.c | 25 ++++++++++++++----------- fs/sync.c | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index b7f5a46..95c5b42 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd, return error; } -static void quota_sync_sb(struct super_block *sb, int type) +#ifdef CONFIG_QUOTA +void sync_quota_sb(struct super_block *sb, int type) { int cnt; + if (!sb->s_qcop->quota_sync) + return; + sb->s_qcop->quota_sync(sb, type); if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) @@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type) } mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); } +#endif -void sync_dquots(struct super_block *sb, int type) +static void sync_dquots(int type) { + struct super_block *sb; int cnt; - if (sb) { - if (sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); - return; - } - spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { @@ -222,8 +222,8 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root && sb->s_qcop->quota_sync) - quota_sync_sb(sb, type); + if (sb->s_root) + sync_quota_sb(sb, type); up_read(&sb->s_umount); spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) @@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return sb->s_qcop->set_dqblk(sb, type, id, &idq); } case Q_SYNC: - sync_dquots(sb, type); + if (sb) + sync_quota_sb(sb, type); + else + sync_dquots(type); return 0; case Q_XQUOTAON: diff --git a/fs/sync.c b/fs/sync.c index 8aa870a..d90ab77 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,7 +27,7 @@ */ static int __sync_filesystem(struct super_block *sb, int wait) { - vfs_dq_sync(sb); + sync_quota_sb(sb, -1); sync_inodes_sb(sb, wait); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) -- cgit v1.1 From c3f8a40c1cd5591b882497d1d00d43d0e5bb4698 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 27 Apr 2009 16:43:55 +0200 Subject: quota: Introduce writeout_quota_sb() (version 4) Introduce this function which just writes all the quota structures but avoids all the syncing and cache pruning work to expose quota structures to userspace. Use this function from __sync_filesystem when wait == 0. Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/sync.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index d90ab77..4487b55 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -27,7 +27,11 @@ */ static int __sync_filesystem(struct super_block *sb, int wait) { - sync_quota_sb(sb, -1); + /* Avoid doing twice syncing and cache pruning for quota sync */ + if (!wait) + writeout_quota_sb(sb, -1); + else + sync_quota_sb(sb, -1); sync_inodes_sb(sb, wait); lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) -- cgit v1.1 From 59d697b70285c348c01cfc2695c3469ba71d7539 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 09:46:41 -0400 Subject: btrfs: remove ->write_super and stop maintaining ->s_dirt Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/inode.c | 7 ------- fs/btrfs/super.c | 8 -------- 2 files changed, 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b68330..8612b3a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2322,7 +2322,6 @@ err: btrfs_update_inode(trans, root, dir); btrfs_drop_nlink(inode); ret = btrfs_update_inode(trans, root, inode); - dir->i_sb->s_dirt = 1; out: return ret; } @@ -2806,7 +2805,6 @@ error: pending_del_nr); } btrfs_free_path(path); - inode->i_sb->s_dirt = 1; return ret; } @@ -3768,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3833,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); out_unlock: @@ -3880,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) drop_inode = 1; - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); @@ -3962,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) d_instantiate(dentry, inode); drop_on_err = 0; - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); @@ -4991,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_file_inode_operations; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; } - dir->i_sb->s_dirt = 1; btrfs_update_inode_block_group(trans, inode); btrfs_update_inode_block_group(trans, dir); if (drop_inode) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 708ac06..52d8452 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -397,7 +397,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) if (sb->s_flags & MS_RDONLY) return 0; - sb->s_dirt = 0; if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; @@ -408,7 +407,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); - sb->s_dirt = 0; return ret; } @@ -454,11 +452,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) return 0; } -static void btrfs_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - static int btrfs_test_super(struct super_block *s, void *data) { struct btrfs_fs_devices *test_fs_devices = data; @@ -689,7 +682,6 @@ static int btrfs_unfreeze(struct super_block *sb) static struct super_operations btrfs_super_ops = { .delete_inode = btrfs_delete_inode, .put_super = btrfs_put_super, - .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, -- cgit v1.1 From ca41f7b918294c2a17780e057568413dcbfc6d49 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 09:46:42 -0400 Subject: ext3: remove ->write_super and stop maintaining ->s_dirt Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ext3/balloc.c | 3 +-- fs/ext3/ialloc.c | 3 +-- fs/ext3/inode.c | 1 - fs/ext3/resize.c | 2 -- fs/ext3/super.c | 22 ---------------------- fs/ext3/xattr.c | 1 - 6 files changed, 2 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 225202d..27967f9 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -649,7 +649,7 @@ do_more: count = overflow; goto do_more; } - sb->s_dirt = 1; + error_return: brelse(bitmap_bh); ext3_std_error(sb, err); @@ -1708,7 +1708,6 @@ allocated: if (!fatal) fatal = err; - sb->s_dirt = 1; if (fatal) goto out; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index dd13d60..b399912 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode) err = ext3_journal_dirty_metadata(handle, bitmap_bh); if (!fatal) fatal = err; - sb->s_dirt = 1; + error_return: brelse(bitmap_bh); ext3_std_error(sb, fatal); @@ -537,7 +537,6 @@ got: percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - sb->s_dirt = 1; inode->i_uid = current_fsuid(); if (test_opt (sb, GRPID)) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index fcfa243..b0248c6 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2960,7 +2960,6 @@ static int ext3_do_update_inode(handle_t *handle, ext3_update_dynamic_rev(sb); EXT3_SET_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_LARGE_FILE); - sb->s_dirt = 1; handle->h_sync = 1; err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 78fdf38..8a0b263 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) EXT3_INODES_PER_GROUP(sb)); ext3_journal_dirty_metadata(handle, sbi->s_sbh); - sb->s_dirt = 1; exit_journal: unlock_super(sb); @@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - sb->s_dirt = 1; unlock_super(sb); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 3c70d52..1efd958 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno, static int ext3_remount (struct super_block * sb, int * flags, char * data); static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext3_unfreeze(struct super_block *sb); -static void ext3_write_super (struct super_block * sb); static int ext3_freeze(struct super_block *sb); /* @@ -761,7 +760,6 @@ static const struct super_operations ext3_sops = { .dirty_inode = ext3_dirty_inode, .delete_inode = ext3_delete_inode, .put_super = ext3_put_super, - .write_super = ext3_write_super, .sync_fs = ext3_sync_fs, .freeze_fs = ext3_freeze, .unfreeze_fs = ext3_unfreeze, @@ -1785,7 +1783,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) #else es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); #endif - sb->s_dirt = 1; } if (sbi->s_blocks_per_group > blocksize * 8) { @@ -2265,7 +2262,6 @@ static int ext3_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2308,7 +2304,6 @@ static int ext3_create_journal(struct super_block * sb, EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext3_commit_super(sb, es, 1); @@ -2354,7 +2349,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; ext3_commit_super(sb, es, 1); } unlock_super(sb); @@ -2413,29 +2407,14 @@ int ext3_force_commit(struct super_block *sb) return 0; journal = EXT3_SB(sb)->s_journal; - sb->s_dirt = 0; ret = ext3_journal_force_commit(journal); return ret; } -/* - * Ext3 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point. (We can probably nuke this function altogether, and remove - * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...) - */ -static void ext3_write_super (struct super_block * sb) -{ - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; -} - static int ext3_sync_fs(struct super_block *sb, int wait) { tid_t target; - sb->s_dirt = 0; if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { if (wait) log_wait_commit(EXT3_SB(sb)->s_journal, target); @@ -2451,7 +2430,6 @@ static int ext3_freeze(struct super_block *sb) { int error = 0; journal_t *journal; - sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { journal = EXT3_SB(sb)->s_journal; diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 83b7be8..545e37c 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle, if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); } } -- cgit v1.1 From b7d245de25d1f0bb75a0d04194b647762b30d3db Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 09:46:43 -0400 Subject: gfs2: remove ->write_super and stop maintaining ->s_dirt Signed-off-by: Christoph Hellwig Acked-by: Steven Whitehouse Signed-off-by: Al Viro --- fs/gfs2/log.c | 2 -- fs/gfs2/super.c | 13 ------------- 2 files changed, 15 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index aa62cf5..f2e449c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -764,7 +764,6 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) } gfs2_log_unlock(sdp); - sdp->sd_vfs->s_dirt = 0; up_write(&sdp->sd_log_flush_lock); kfree(ai); @@ -823,7 +822,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) log_refund(sdp, tr); buf_lo_incore_commit(sdp, tr); - sdp->sd_vfs->s_dirt = 1; up_read(&sdp->sd_log_flush_lock); gfs2_log_lock(sdp); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 40bcc37..0a68013 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -788,17 +788,6 @@ restart: } /** - * gfs2_write_super - * @sb: the superblock - * - */ - -static void gfs2_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - -/** * gfs2_sync_fs - sync the filesystem * @sb: the superblock * @@ -807,7 +796,6 @@ static void gfs2_write_super(struct super_block *sb) static int gfs2_sync_fs(struct super_block *sb, int wait) { - sb->s_dirt = 0; if (wait && sb->s_fs_info) gfs2_log_flush(sb->s_fs_info, NULL); return 0; @@ -1324,7 +1312,6 @@ const struct super_operations gfs2_super_ops = { .write_inode = gfs2_write_inode, .delete_inode = gfs2_delete_inode, .put_super = gfs2_put_super, - .write_super = gfs2_write_super, .sync_fs = gfs2_sync_fs, .freeze_fs = gfs2_freeze, .unfreeze_fs = gfs2_unfreeze, -- cgit v1.1 From 94cb993f2ee99f3a9318e7b4dbb383497c4bedea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 09:46:44 -0400 Subject: ocfs2: remove ->write_super and stop maintaining ->s_dirt Signed-off-by: Christoph Hellwig Acked-by: Joel Becker Signed-off-by: Al Viro --- fs/ocfs2/super.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 5c6163f..3eb076c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -126,7 +126,6 @@ static int ocfs2_get_sector(struct super_block *sb, struct buffer_head **bh, int block, int sect_size); -static void ocfs2_write_super(struct super_block *sb); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_destroy_inode(struct inode *inode); static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); @@ -141,7 +140,6 @@ static const struct super_operations ocfs2_sops = { .clear_inode = ocfs2_clear_inode, .delete_inode = ocfs2_delete_inode, .sync_fs = ocfs2_sync_fs, - .write_super = ocfs2_write_super, .put_super = ocfs2_put_super, .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, @@ -365,24 +363,12 @@ static struct file_operations ocfs2_osb_debug_fops = { .llseek = generic_file_llseek, }; -/* - * write_super and sync_fs ripped right out of ext3. - */ -static void ocfs2_write_super(struct super_block *sb) -{ - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; -} - static int ocfs2_sync_fs(struct super_block *sb, int wait) { int status; tid_t target; struct ocfs2_super *osb = OCFS2_SB(sb); - sb->s_dirt = 0; - if (ocfs2_is_hard_readonly(osb)) return -EROFS; -- cgit v1.1 From 517bfae28353e996160518add4d00033d3886e61 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 27 Apr 2009 09:46:45 -0400 Subject: qnx4: remove ->write_super Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/qnx4/inode.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'fs') diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index fe1f0f3..95c12fc 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -70,14 +70,6 @@ static void qnx4_delete_inode(struct inode *inode) unlock_kernel(); } -static void qnx4_write_super(struct super_block *sb) -{ - lock_kernel(); - QNX4DEBUG(("qnx4: write_super\n")); - sb->s_dirt = 0; - unlock_kernel(); -} - static int qnx4_write_inode(struct inode *inode, int unused) { struct qnx4_inode_entry *raw_inode; @@ -138,7 +130,6 @@ static const struct super_operations qnx4_sops = #ifdef CONFIG_QNX4FS_RW .write_inode = qnx4_write_inode, .delete_inode = qnx4_delete_inode, - .write_super = qnx4_write_super, #endif }; -- cgit v1.1 From 8c85e125124a473d6f3e9bb187b0b84207f81d91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 28 Apr 2009 18:00:26 +0200 Subject: remove ->write_super call in generic_shutdown_super We just did a full fs writeout using sync_filesystem before, and if that's not enough for the filesystem it can perform it's own writeout in ->put_super, which many filesystems already do. Move a call to foofs_write_super into every foofs_put_super for now to guarantee identical behaviour until it's cleaned up by the individual filesystem maintainers. Exceptions: - affs already has identical copy & pasted code at the beginning of affs_put_super so no need to do it twice. - xfs does the right thing without it and I have changes pending for the xfs tree touching this are so I don't really need conflicts here.. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/bfs/inode.c | 4 ++++ fs/exofs/super.c | 3 +++ fs/ext2/super.c | 3 +++ fs/ext4/super.c | 3 +++ fs/fat/inode.c | 3 +++ fs/hfs/super.c | 2 ++ fs/hfsplus/super.c | 2 ++ fs/jffs2/super.c | 3 +++ fs/nilfs2/super.c | 4 ++++ fs/reiserfs/super.c | 3 +++ fs/super.c | 2 -- fs/sysv/inode.c | 3 +++ fs/ufs/super.c | 3 +++ 13 files changed, 36 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index cc4062d..4cf3d26 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -30,6 +30,7 @@ MODULE_LICENSE("GPL"); #define dprintf(x...) #endif +static void bfs_write_super(struct super_block *s); void dump_imap(const char *prefix, struct super_block *s); struct inode *bfs_iget(struct super_block *sb, unsigned long ino) @@ -216,6 +217,9 @@ static void bfs_put_super(struct super_block *s) if (!info) return; + if (s->s_dirt) + bfs_write_super(s); + brelse(info->si_sbh); mutex_destroy(&info->bfs_lock); kfree(info->si_imap); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 9f1985e..3cdb761 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -258,6 +258,9 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; + if (sb->s_dirt) + exofs_write_super(sb); + /* make sure there are no pending commands */ for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; num_pend = atomic_read(&sbi->s_curr_pending)) { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e3c748f..932a2bc 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -114,6 +114,9 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); + if (sb->s_dirt) + ext2_write_super(sb); + ext2_xattr_put_super(sb); if (!(sb->s_flags & MS_RDONLY)) { struct ext2_super_block *es = sbi->s_es; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f016707..c7b8f8d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -576,6 +576,9 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + if (sb->s_dirt) + ext4_write_super(sb); + ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 296785a..4978621 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -451,6 +451,9 @@ static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); + if (sb->s_dirt) + fat_write_super(sb); + if (sbi->nls_disk) { unload_nls(sbi->nls_disk); sbi->nls_disk = NULL; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index a36bb74..e071e6d 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -65,6 +65,8 @@ static void hfs_write_super(struct super_block *sb) */ static void hfs_put_super(struct super_block *sb) { + if (sb->s_dirt) + hfs_write_super(sb); hfs_mdb_close(sb); /* release the MDB's resources */ hfs_mdb_put(sb); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index f2a6402..40bdab7 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -199,6 +199,8 @@ static void hfsplus_put_super(struct super_block *sb) dprint(DBG_SUPER, "hfsplus_put_super\n"); if (!sb->s_fs_info) return; + if (sb->s_dirt) + hfsplus_write_super(sb); if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 4c4e18c..5059e96 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -174,6 +174,9 @@ static void jffs2_put_super (struct super_block *sb) D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); + if (sb->s_dirt) + jffs2_write_super(sb); + mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); mutex_unlock(&c->alloc_sem); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6989b03..7901d8c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -65,6 +65,7 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); +static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); static int test_exclusive_mount(struct file_system_type *fs_type, struct block_device *bdev, int flags); @@ -315,6 +316,9 @@ static void nilfs_put_super(struct super_block *sb) struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; + if (sb->s_dirt) + nilfs_write_super(sb); + nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 9dbdcfb..1b52daa 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -468,6 +468,9 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; + if (s->s_dirt) + reiserfs_write_super(s); + /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { diff --git a/fs/super.c b/fs/super.c index 721236e..d9a29d5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -311,8 +311,6 @@ void generic_shutdown_super(struct super_block *sb) invalidate_inodes(sb); lock_kernel(); - if (sop->write_super && sb->s_dirt) - sop->write_super(sb); if (sop->put_super) sop->put_super(sb); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index da20b48..cd80316 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -72,6 +72,9 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); + if (sb->s_dirt) + sysv_write_super(sb); + if (!(sb->s_flags & MS_RDONLY)) { /* XXX ext2 also updates the state here */ mark_buffer_dirty(sbi->s_bh1); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6035929..74afb9f 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1152,6 +1152,9 @@ static void ufs_put_super(struct super_block *sb) UFSD("ENTER\n"); + if (sb->s_dirt) + ufs_write_super(sb); + if (!(sb->s_flags & MS_RDONLY)) ufs_put_super_internal(sb); -- cgit v1.1 From f3da392e9ff14b9f388e74319e6d195848991c07 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:32:03 +0400 Subject: dcache: extrace and use d_unlinked() d_unlinked() will be used in middle-term to ban checkpointing when opened but unlinked file is detected, and in long term, to detect such situation and special case on it. Signed-off-by: Alexey Dobriyan Signed-off-by: Al Viro --- fs/dcache.c | 7 +++---- fs/namespace.c | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 75659a6..9e5cd3c 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root, spin_lock(&vfsmount_lock); prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry) && + if (d_unlinked(dentry) && (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; @@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen) spin_lock(&dcache_lock); prepend(&end, &buflen, "\0", 1); - if (!IS_ROOT(dentry) && d_unhashed(dentry) && + if (d_unlinked(dentry) && (prepend(&end, &buflen, "//deleted", 9) != 0)) goto Elong; if (buflen < 1) @@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) read_unlock(¤t->fs->lock); error = -ENOENT; - /* Has the current directory has been unlinked? */ spin_lock(&dcache_lock); - if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) { + if (!d_unlinked(pwd.dentry)) { unsigned long len; struct path tmp = root; char * cwd; diff --git a/fs/namespace.c b/fs/namespace.c index 120b8a6..7e537f0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1384,7 +1384,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path) goto out_unlock; err = -ENOENT; - if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry)) + if (!d_unlinked(path->dentry)) err = attach_recursive_mnt(mnt, path, NULL); out_unlock: mutex_unlock(&path->dentry->d_inode->i_mutex); @@ -1566,7 +1566,7 @@ static int do_move_mount(struct path *path, char *old_name) if (IS_DEADDIR(path->dentry->d_inode)) goto out1; - if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry)) + if (d_unlinked(path->dentry)) goto out1; err = -EINVAL; @@ -2129,9 +2129,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = -ENOENT; if (IS_DEADDIR(new.dentry->d_inode)) goto out2; - if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry)) + if (d_unlinked(new.dentry)) goto out2; - if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry)) + if (d_unlinked(old.dentry)) goto out2; error = -EBUSY; if (new.mnt == root.mnt || -- cgit v1.1 From e5004753388dcf5e1b8a52ac0ab807d232340fbb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2009 16:08:56 +0200 Subject: cleanup sync_supers Merge the write_super helper into sync_super and move the check for ->write_super earlier so that we can avoid grabbing a reference to a superblock that doesn't have it. While we're at it also add a little comment documenting sync_supers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/super.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index d9a29d5..cb19fff 100644 --- a/fs/super.c +++ b/fs/super.c @@ -399,16 +399,14 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); -static inline void write_super(struct super_block *sb) -{ - lock_super(sb); - if (sb->s_root && sb->s_dirt) - if (sb->s_op->write_super) - sb->s_op->write_super(sb); - unlock_super(sb); -} - -/* +/** + * sync_supers - helper for periodic superblock writeback + * + * Call the write_super method if present on all dirty superblocks in + * the system. This is for the periodic writeback used by most older + * filesystems. For data integrity superblock writeback use + * sync_filesystems() instead. + * * Note: check the dirty flag before waiting, so we don't * hold up the sync while mounting a device. (The newly * mounted device won't need syncing.) @@ -420,12 +418,17 @@ void sync_supers(void) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_dirt) { + if (sb->s_op->write_super && sb->s_dirt) { sb->s_count++; spin_unlock(&sb_lock); + down_read(&sb->s_umount); - write_super(sb); + lock_super(sb); + if (sb->s_root && sb->s_dirt) + sb->s_op->write_super(sb); + unlock_super(sb); up_read(&sb->s_umount); + spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) goto restart; -- cgit v1.1 From 5af7926ff33b68b3ba46531471c6e0564b285efc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2009 15:41:25 +0200 Subject: enforce ->sync_fs is only called for rw superblock Make sure a superblock really is writeable by checking MS_RDONLY under s_umount. sync_filesystems needed some re-arragement for that, but all but one sync_filesystem caller had the correct locking already so that we could add that check there. cachefiles grew s_umount locking. I've also added a WARN_ON to sync_filesystem to assert this for future callers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/super.c | 3 --- fs/cachefiles/interface.c | 2 ++ fs/reiserfs/super.c | 21 +++++++++------------ fs/sync.c | 23 ++++++++++++++++------- fs/ubifs/super.c | 3 --- 5 files changed, 27 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 52d8452..9f179d4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -394,9 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) struct btrfs_root *root = btrfs_sb(sb); int ret; - if (sb->s_flags & MS_RDONLY) - return 0; - if (!wait) { filemap_flush(root->fs_info->btree_inode->i_mapping); return 0; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index dafd484..431accd 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) /* make sure all pages pinned by operations on behalf of the netfs are * written to disc */ cachefiles_begin_secure(cache, &saved_cred); + down_read(&cache->mnt->mnt_sb->s_umount); ret = sync_filesystem(cache->mnt->mnt_sb); + up_read(&cache->mnt->mnt_sb->s_umount); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1b52daa..3da0401 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -64,18 +64,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); static int reiserfs_sync_fs(struct super_block *s, int wait) { - if (!(s->s_flags & MS_RDONLY)) { - struct reiserfs_transaction_handle th; - reiserfs_write_lock(s); - if (!journal_begin(&th, s, 1)) - if (!journal_end_sync(&th, s, 1)) - reiserfs_flush_old_commits(s); - s->s_dirt = 0; /* Even if it's not true. - * We'll loop forever in sync_supers otherwise */ - reiserfs_write_unlock(s); - } else { - s->s_dirt = 0; - } + struct reiserfs_transaction_handle th; + + reiserfs_write_lock(s); + if (!journal_begin(&th, s, 1)) + if (!journal_end_sync(&th, s, 1)) + reiserfs_flush_old_commits(s); + s->s_dirt = 0; /* Even if it's not true. + * We'll loop forever in sync_supers otherwise */ + reiserfs_write_unlock(s); return 0; } diff --git a/fs/sync.c b/fs/sync.c index 4487b55..89c37f7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -51,6 +51,18 @@ int sync_filesystem(struct super_block *sb) { int ret; + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + /* + * No point in syncing out anything if the filesystem is read-only. + */ + if (sb->s_flags & MS_RDONLY) + return 0; + ret = __sync_filesystem(sb, 0); if (ret < 0) return ret; @@ -79,25 +91,22 @@ static void sync_filesystems(int wait) mutex_lock(&mutex); /* Could be down_interruptible */ spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_flags & MS_RDONLY) - continue; + list_for_each_entry(sb, &super_blocks, s_list) sb->s_need_sync = 1; - } restart: list_for_each_entry(sb, &super_blocks, s_list) { if (!sb->s_need_sync) continue; sb->s_need_sync = 0; - if (sb->s_flags & MS_RDONLY) - continue; /* hm. Was remounted r/o meanwhile */ sb->s_count++; spin_unlock(&sb_lock); + down_read(&sb->s_umount); - if (sb->s_root) + if (!(sb->s_flags & MS_RDONLY) && sb->s_root) __sync_filesystem(sb, wait); up_read(&sb->s_umount); + /* restart only when sb is no longer on the list */ spin_lock(&sb_lock); if (__put_super_and_need_restart(sb)) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index e9f7a75..84f3c7f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -447,9 +447,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) if (!wait) return 0; - if (sb->s_flags & MS_RDONLY) - return 0; - /* * VFS calls '->sync_fs()' before synchronizing all dirty inodes and * pages, so synchronize them first, then commit the journal. Strictly -- cgit v1.1 From 443b94baaa16771e98b29ca7c24f1e305738ffca Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 May 2009 23:48:50 -0400 Subject: Make sure that all callers of remount hold s_umount exclusive Signed-off-by: Al Viro --- fs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index cb19fff..49f670c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -579,7 +579,7 @@ static void do_emergency_remount(struct work_struct *work) list_for_each_entry(sb, &super_blocks, s_list) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { /* * ->remount_fs needs lock_kernel(). @@ -590,7 +590,8 @@ static void do_emergency_remount(struct work_struct *work) do_remount_sb(sb, MS_RDONLY, NULL, 1); unlock_kernel(); } - drop_super(sb); + up_write(&sb->s_umount); + put_super(sb); spin_lock(&sb_lock); } spin_unlock(&sb_lock); -- cgit v1.1 From 62c6943b4b1e818aea60c11c5a68a50785b83119 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 7 May 2009 03:12:29 -0400 Subject: Trim a bit of crap from fs.h do_remount_sb() is fs/internal.h fodder, fsync_no_super() is long gone. Signed-off-by: Al Viro --- fs/internal.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/internal.h b/fs/internal.h index dbec3cc..d55ef56 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -78,3 +78,8 @@ extern void chroot_fs_refs(struct path *, struct path *); * file_table.c */ extern void mark_files_ro(struct super_block *); + +/* + * super.c + */ +extern int do_remount_sb(struct super_block *, int, void *, int); -- cgit v1.1 From a9e220f8322e2b0e0b8903fe00265461cffad3f0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 May 2009 22:10:44 -0400 Subject: No need to do lock_super() for exclusion in generic_shutdown_super() We can't run into contention on it. All other callers of lock_super() either hold s_umount (and we have it exclusive) or hold an active reference to superblock in question, which prevents the call of generic_shutdown_super() while the reference is held. So we can replace lock_super(s) with get_fs_excl() in generic_shutdown_super() (and corresponding change for unlock_super(), of course). Since ext4 expects s_lock held for its put_super, take lock_super() into it. The rest of filesystems do not care at all. Signed-off-by: Al Viro --- fs/ext4/super.c | 2 +- fs/super.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c7b8f8d..0d3034c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -576,6 +576,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + lock_super(sb); if (sb->s_dirt) ext4_write_super(sb); @@ -645,7 +646,6 @@ static void ext4_put_super(struct super_block *sb) unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); - lock_super(sb); lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); diff --git a/fs/super.c b/fs/super.c index 49f670c..54fd331 100644 --- a/fs/super.c +++ b/fs/super.c @@ -304,7 +304,7 @@ void generic_shutdown_super(struct super_block *sb) if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - lock_super(sb); + get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; /* bad name - it should be evict_inodes() */ @@ -322,7 +322,7 @@ void generic_shutdown_super(struct super_block *sb) } unlock_kernel(); - unlock_super(sb); + put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ -- cgit v1.1 From 6cfd0148425e528b859b26e436b01f23f6926224 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2009 15:40:36 +0200 Subject: push BKL down into ->put_super Move BKL into ->put_super from the only caller. A couple of filesystems had trivial enough ->put_super (only kfree and NULLing of s_fs_info + stuff in there) to not get any locking: coda, cramfs, efs, hugetlbfs, omfs, qnx4, shmem, all others got the full treatment. Most of them probably don't need it, but I'd rather sort that out individually. Preferably after all the other BKL pushdowns in that area. [AV: original used to move lock_super() down as well; these changes are removed since we don't do lock_super() at all in generic_shutdown_super() now] [AV: fuse, btrfs and xfs are known to need no damn BKL, exempt] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/adfs/super.c | 4 ++++ fs/affs/super.c | 5 ++++- fs/afs/super.c | 4 ++++ fs/befs/linuxvfs.c | 5 ++++- fs/bfs/inode.c | 4 ++++ fs/cifs/cifsfs.c | 6 +++++- fs/ecryptfs/super.c | 5 +++++ fs/exofs/super.c | 4 ++++ fs/ext2/super.c | 4 +++- fs/ext3/super.c | 5 ++++- fs/ext4/super.c | 2 +- fs/fat/inode.c | 4 ++++ fs/freevxfs/vxfs_super.c | 4 ++++ fs/gfs2/super.c | 4 ++++ fs/hfs/super.c | 4 ++++ fs/hfsplus/super.c | 5 +++++ fs/hpfs/super.c | 5 +++++ fs/isofs/inode.c | 5 +++++ fs/jffs2/super.c | 4 ++++ fs/jfs/super.c | 5 +++++ fs/minix/inode.c | 4 +++- fs/ncpfs/inode.c | 4 ++++ fs/nilfs2/super.c | 4 ++++ fs/ntfs/super.c | 6 +++++- fs/ocfs2/super.c | 4 ++++ fs/reiserfs/super.c | 4 +++- fs/smbfs/inode.c | 4 ++++ fs/squashfs/super.c | 4 ++++ fs/super.c | 3 --- fs/sysv/inode.c | 4 ++++ fs/ubifs/super.c | 5 +++++ fs/udf/super.c | 5 +++++ fs/ufs/super.c | 6 ++++++ 33 files changed, 133 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/adfs/super.c b/fs/adfs/super.c index dd9becc..0ec5aaf 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -132,11 +132,15 @@ static void adfs_put_super(struct super_block *sb) int i; struct adfs_sb_info *asb = ADFS_SB(sb); + lock_kernel(); + for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); kfree(asb); sb->s_fs_info = NULL; + + unlock_kernel(); } static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) diff --git a/fs/affs/super.c b/fs/affs/super.c index 63f5183..d738646 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -29,6 +29,8 @@ affs_put_super(struct super_block *sb) struct affs_sb_info *sbi = AFFS_SB(sb); pr_debug("AFFS: put_super()\n"); + lock_kernel(); + if (!(sb->s_flags & MS_RDONLY)) { AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); secs_to_datestamp(get_seconds(), @@ -42,7 +44,8 @@ affs_put_super(struct super_block *sb) affs_brelse(sbi->s_root_bh); kfree(sbi); sb->s_fs_info = NULL; - return; + + unlock_kernel(); } static void diff --git a/fs/afs/super.c b/fs/afs/super.c index 76828e5..ad0514d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -440,8 +440,12 @@ static void afs_put_super(struct super_block *sb) _enter(""); + lock_kernel(); + afs_put_volume(as->volume); + unlock_kernel(); + _leave(""); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 76afd0d..9367b62 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -737,6 +737,8 @@ parse_options(char *options, befs_mount_options * opts) static void befs_put_super(struct super_block *sb) { + lock_kernel(); + kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; @@ -747,7 +749,8 @@ befs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; - return; + + unlock_kernel(); } /* Allocate private field of the superblock, fill it. diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 4cf3d26..3a9a136 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -217,6 +217,8 @@ static void bfs_put_super(struct super_block *s) if (!info) return; + lock_kernel(); + if (s->s_dirt) bfs_write_super(s); @@ -225,6 +227,8 @@ static void bfs_put_super(struct super_block *s) kfree(info->si_imap); kfree(info); s->s_fs_info = NULL; + + unlock_kernel(); } static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 0a10a59..0d92114 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb) cFYI(1, ("Empty cifs superblock info passed to unmount")); return; } + + lock_kernel(); + rc = cifs_umount(sb, cifs_sb); if (rc) cERROR(1, ("cifs_umount failed with return code %d", rc)); @@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb) unload_nls(cifs_sb->local_nls); kfree(cifs_sb); - return; + + unlock_kernel(); } static int diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index fa4c7e7..12d6496 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "ecryptfs_kernel.h" @@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); + lock_kernel(); + ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); ecryptfs_set_superblock_private(sb, NULL); + + unlock_kernel(); } /** diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 3cdb761..cd1f8b1 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -258,6 +258,8 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; + lock_kernel(); + if (sb->s_dirt) exofs_write_super(sb); @@ -274,6 +276,8 @@ static void exofs_put_super(struct super_block *sb) osduld_put_device(sbi->s_dev); kfree(sb->s_fs_info); sb->s_fs_info = NULL; + + unlock_kernel(); } /* diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 932a2bc..a44963d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -114,6 +114,8 @@ static void ext2_put_super (struct super_block * sb) int i; struct ext2_sb_info *sbi = EXT2_SB(sb); + lock_kernel(); + if (sb->s_dirt) ext2_write_super(sb); @@ -138,7 +140,7 @@ static void ext2_put_super (struct super_block * sb) kfree(sbi->s_blockgroup_lock); kfree(sbi); - return; + unlock_kernel(); } static struct kmem_cache * ext2_inode_cachep; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 1efd958..546b8d7 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -398,6 +398,8 @@ static void ext3_put_super (struct super_block * sb) struct ext3_super_block *es = sbi->s_es; int i, err; + lock_kernel(); + ext3_xattr_put_super(sb); err = journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -446,7 +448,8 @@ static void ext3_put_super (struct super_block * sb) sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); kfree(sbi); - return; + + unlock_kernel(); } static struct kmem_cache *ext3_inode_cachep; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0d3034c..1d4180b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -577,6 +577,7 @@ static void ext4_put_super(struct super_block *sb) int i, err; lock_super(sb); + lock_kernel(); if (sb->s_dirt) ext4_write_super(sb); @@ -646,7 +647,6 @@ static void ext4_put_super(struct super_block *sb) unlock_super(sb); kobject_put(&sbi->s_kobj); wait_for_completion(&sbi->s_kobj_unregister); - lock_kernel(); kfree(sbi->s_blockgroup_lock); kfree(sbi); } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 4978621..2b88c93 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -451,6 +451,8 @@ static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); + lock_kernel(); + if (sb->s_dirt) fat_write_super(sb); @@ -470,6 +472,8 @@ static void fat_put_super(struct super_block *sb) sb->s_fs_info = NULL; kfree(sbi); + + unlock_kernel(); } static struct kmem_cache *fat_inode_cachep; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 1dacda8..cdbd165 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -80,12 +80,16 @@ vxfs_put_super(struct super_block *sbp) { struct vxfs_sb_info *infp = VXFS_SBI(sbp); + lock_kernel(); + vxfs_put_fake_inode(infp->vsi_fship); vxfs_put_fake_inode(infp->vsi_ilist); vxfs_put_fake_inode(infp->vsi_stilist); brelse(infp->vsi_bp); kfree(infp); + + unlock_kernel(); } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0a68013..c8930b3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -719,6 +719,8 @@ static void gfs2_put_super(struct super_block *sb) int error; struct gfs2_jdesc *jd; + lock_kernel(); + /* Unfreeze the filesystem, if we need to */ mutex_lock(&sdp->sd_freeze_lock); @@ -785,6 +787,8 @@ restart: /* At this point, we're through participating in the lockspace */ gfs2_sys_fs_del(sdp); + + unlock_kernel(); } /** diff --git a/fs/hfs/super.c b/fs/hfs/super.c index e071e6d..9f5eaa0 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -65,11 +65,15 @@ static void hfs_write_super(struct super_block *sb) */ static void hfs_put_super(struct super_block *sb) { + lock_kernel(); + if (sb->s_dirt) hfs_write_super(sb); hfs_mdb_close(sb); /* release the MDB's resources */ hfs_mdb_put(sb); + + unlock_kernel(); } /* diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 40bdab7..9b292dc 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -199,6 +199,9 @@ static void hfsplus_put_super(struct super_block *sb) dprint(DBG_SUPER, "hfsplus_put_super\n"); if (!sb->s_fs_info) return; + + lock_kernel(); + if (sb->s_dirt) hfsplus_write_super(sb); if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { @@ -220,6 +223,8 @@ static void hfsplus_put_super(struct super_block *sb) unload_nls(HFSPLUS_SB(sb).nls); kfree(sb->s_fs_info); sb->s_fs_info = NULL; + + unlock_kernel(); } static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index fc77965..437a32e 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -99,11 +99,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, static void hpfs_put_super(struct super_block *s) { struct hpfs_sb_info *sbi = hpfs_sb(s); + + lock_kernel(); + kfree(sbi->sb_cp_table); kfree(sbi->sb_bmp_dir); unmark_dirty(s); s->s_fs_info = NULL; kfree(sbi); + + unlock_kernel(); } unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index b4cbe96..068b34b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst static void isofs_put_super(struct super_block *sb) { struct isofs_sb_info *sbi = ISOFS_SB(sb); + #ifdef CONFIG_JOLIET + lock_kernel(); + if (sbi->s_nls_iocharset) { unload_nls(sbi->s_nls_iocharset); sbi->s_nls_iocharset = NULL; } + + unlock_kernel(); #endif kfree(sbi); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 5059e96..37b1212 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -174,6 +174,8 @@ static void jffs2_put_super (struct super_block *sb) D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); + lock_kernel(); + if (sb->s_dirt) jffs2_write_super(sb); @@ -195,6 +197,8 @@ static void jffs2_put_super (struct super_block *sb) if (c->mtd->sync) c->mtd->sync(c->mtd); + unlock_kernel(); + D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d9b0e92..3eb13ad 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -183,6 +183,9 @@ static void jfs_put_super(struct super_block *sb) int rc; jfs_info("In jfs_put_super"); + + lock_kernel(); + rc = jfs_umount(sb); if (rc) jfs_err("jfs_umount failed with return code %d", rc); @@ -195,6 +198,8 @@ static void jfs_put_super(struct super_block *sb) sbi->direct_inode = NULL; kfree(sbi); + + unlock_kernel(); } enum { diff --git a/fs/minix/inode.c b/fs/minix/inode.c index daad3c2..7eb5397 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -35,6 +35,8 @@ static void minix_put_super(struct super_block *sb) int i; struct minix_sb_info *sbi = minix_sb(sb); + lock_kernel(); + if (!(sb->s_flags & MS_RDONLY)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ sbi->s_ms->s_state = sbi->s_mount_state; @@ -49,7 +51,7 @@ static void minix_put_super(struct super_block *sb) sb->s_fs_info = NULL; kfree(sbi); - return; + unlock_kernel(); } static struct kmem_cache * minix_inode_cachep; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d642f0e..b99ce20 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb) { struct ncp_server *server = NCP_SBP(sb); + lock_kernel(); + ncp_lock_server(server); ncp_disconnect(server); ncp_unlock_server(server); @@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb) vfree(server->packet); sb->s_fs_info = NULL; kfree(server); + + unlock_kernel(); } static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7901d8c..7262e84 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -316,6 +316,8 @@ static void nilfs_put_super(struct super_block *sb) struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; + lock_kernel(); + if (sb->s_dirt) nilfs_write_super(sb); @@ -333,6 +335,8 @@ static void nilfs_put_super(struct super_block *sb) sbi->s_super = NULL; sb->s_fs_info = NULL; kfree(sbi); + + unlock_kernel(); } /** diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6aa7c47..a9ec4e1 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2246,6 +2246,9 @@ static void ntfs_put_super(struct super_block *sb) ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering."); + + lock_kernel(); + #ifdef NTFS_RW /* * Commit all inodes while they are still open in case some of them @@ -2444,7 +2447,8 @@ static void ntfs_put_super(struct super_block *sb) } sb->s_fs_info = NULL; kfree(vol); - return; + + unlock_kernel(); } /** diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3eb076c..0273759 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1536,9 +1536,13 @@ static void ocfs2_put_super(struct super_block *sb) { mlog_entry("(0x%p)\n", sb); + lock_kernel(); + ocfs2_sync_blockdev(sb); ocfs2_dismount_volume(sb, 0); + unlock_kernel(); + mlog_exit_void(); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3da0401..90dcb7b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -465,6 +465,8 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; + lock_kernel(); + if (s->s_dirt) reiserfs_write_super(s); @@ -500,7 +502,7 @@ static void reiserfs_put_super(struct super_block *s) kfree(s->s_fs_info); s->s_fs_info = NULL; - return; + unlock_kernel(); } static struct kmem_cache *reiserfs_inode_cachep; diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index fc27fbf..1402d2d 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb) { struct smb_sb_info *server = SMB_SB(sb); + lock_kernel(); + smb_lock_server(server); server->state = CONN_INVALID; smbiod_unregister_server(server); @@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb) smb_unlock_server(server); put_pid(server->conn_pid); kfree(server); + + unlock_kernel(); } static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 0adc624..3b52770 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -338,6 +338,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data) static void squashfs_put_super(struct super_block *sb) { + lock_kernel(); + if (sb->s_fs_info) { struct squashfs_sb_info *sbi = sb->s_fs_info; squashfs_cache_delete(sbi->block_cache); @@ -350,6 +352,8 @@ static void squashfs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; } + + unlock_kernel(); } diff --git a/fs/super.c b/fs/super.c index 54fd331..bdd7158 100644 --- a/fs/super.c +++ b/fs/super.c @@ -309,7 +309,6 @@ void generic_shutdown_super(struct super_block *sb) /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); - lock_kernel(); if (sop->put_super) sop->put_super(sb); @@ -320,8 +319,6 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } - - unlock_kernel(); put_fs_excl(); } spin_lock(&sb_lock); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index cd80316..a818986 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -72,6 +72,8 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); + lock_kernel(); + if (sb->s_dirt) sysv_write_super(sb); @@ -87,6 +89,8 @@ static void sysv_put_super(struct super_block *sb) brelse(sbi->s_bh2); kfree(sbi); + + unlock_kernel(); } static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 84f3c7f..522c3fd 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1684,6 +1684,9 @@ static void ubifs_put_super(struct super_block *sb) ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, c->vi.vol_id); + + lock_kernel(); + /* * The following asserts are only valid if there has not been a failure * of the media. For example, there will be dirty inodes if we failed @@ -1750,6 +1753,8 @@ static void ubifs_put_super(struct super_block *sb) ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); kfree(c); + + unlock_kernel(); } static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) diff --git a/fs/udf/super.c b/fs/udf/super.c index 0ba4410..04802cc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2062,6 +2062,9 @@ static void udf_put_super(struct super_block *sb) struct udf_sb_info *sbi; sbi = UDF_SB(sb); + + lock_kernel(); + if (sbi->s_vat_inode) iput(sbi->s_vat_inode); if (sbi->s_partitions) @@ -2077,6 +2080,8 @@ static void udf_put_super(struct super_block *sb) kfree(sbi->s_partmaps); kfree(sb->s_fs_info); sb->s_fs_info = NULL; + + unlock_kernel(); } static int udf_sync_fs(struct super_block *sb, int wait) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 74afb9f..2b4d2b6 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -594,6 +594,9 @@ static void ufs_put_super_internal(struct super_block *sb) UFSD("ENTER\n"); + + lock_kernel(); + ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -621,6 +624,9 @@ static void ufs_put_super_internal(struct super_block *sb) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); kfree (base); + + unlock_kernel(); + UFSD("EXIT\n"); } -- cgit v1.1 From bbd6851a3213a525128473e978b692ab6ac11aba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 6 May 2009 10:43:07 -0400 Subject: Push lock_super() into the ->remount_fs() of filesystems that care about it Note that since we can't run into contention between remount_fs and write_super (due to exclusion on s_umount), we have to care only about filesystems that touch lock_super() on their own. Out of those ext3, ext4, hpfs, sysv and ufs do need it; fat doesn't since its ->remount_fs() only accesses assign-once data (basically, it's "we have no atime on directories and only have atime on files for vfat; force nodiratime and possibly noatime into *flags"). [folded a build fix from hch] Signed-off-by: Al Viro --- fs/ext3/super.c | 3 +++ fs/ext4/super.c | 3 +++ fs/hpfs/super.c | 3 +++ fs/super.c | 2 -- fs/sysv/inode.c | 2 ++ fs/ufs/super.c | 11 ++++++++++- 6 files changed, 21 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 546b8d7..e213a26 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2491,6 +2491,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) #endif /* Store the original options */ + lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -2598,6 +2599,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) old_opts.s_qf_names[i] != sbi->s_qf_names[i]) kfree(old_opts.s_qf_names[i]); #endif + unlock_super(sb); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2614,6 +2616,7 @@ restore_opts: sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif + unlock_super(sb); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1d4180b..a9c6834 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3421,6 +3421,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #endif /* Store the original options */ + lock_super(sb); old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; old_opts.s_resuid = sbi->s_resuid; @@ -3554,6 +3555,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_qf_names[i] != sbi->s_qf_names[i]) kfree(old_opts.s_qf_names[i]); #endif + unlock_super(sb); return 0; restore_opts: @@ -3573,6 +3575,7 @@ restore_opts: sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } #endif + unlock_super(sb); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 437a32e..f68193c 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -398,6 +398,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) *flags |= MS_NOATIME; + lock_super(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; lowercase = sbi->sb_lowercase; conv = sbi->sb_conv; @@ -430,9 +431,11 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) replace_mount_options(s, new_opts); + unlock_super(s); return 0; out_err: + unlock_super(s); kfree(new_opts); return -EINVAL; } diff --git a/fs/super.c b/fs/super.c index bdd7158..2a49fed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -556,9 +556,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { - lock_super(sb); retval = sb->s_op->remount_fs(sb, &flags, data); - unlock_super(sb); if (retval) return retval; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index a818986..e0a39f1 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -61,10 +61,12 @@ clean: static int sysv_remount(struct super_block *sb, int *flags, char *data) { struct sysv_sb_info *sbi = SYSV_SB(sb); + lock_super(sb); if (sbi->s_forced_ro) *flags |= MS_RDONLY; if (!(*flags & MS_RDONLY)) sb->s_dirt = 1; + unlock_super(sb); return 0; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 2b4d2b6..a5ecabf 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1181,6 +1181,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned new_mount_opt, ufstype; unsigned flags; + lock_super(sb); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1193,17 +1194,21 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options (data, &new_mount_opt)) + if (!ufs_parse_options (data, &new_mount_opt)) { + unlock_super(sb); return -EINVAL; + } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); + unlock_super(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; + unlock_super(sb); return 0; } @@ -1228,6 +1233,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); + unlock_super(sb); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1236,16 +1242,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); + unlock_super(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); + unlock_super(sb); return -EPERM; } sb->s_flags &= ~MS_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; + unlock_super(sb); return 0; } -- cgit v1.1 From 6fac98dd218653c6aff8a0f56305c424930cea2a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2009 13:31:17 -0400 Subject: Push BKL into do_mount() Signed-off-by: Al Viro --- fs/compat.c | 2 -- fs/namespace.c | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index bb2a9b2..6aefb77 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -812,10 +812,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, } } - lock_kernel(); retval = do_mount((char*)dev_page, dir_page, (char*)type_page, flags, (void*)data_page); - unlock_kernel(); out4: free_page(data_page); diff --git a/fs/namespace.c b/fs/namespace.c index 7e537f0..4740f7b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1921,6 +1921,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (retval) goto dput_out; + lock_kernel(); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); @@ -1933,6 +1934,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, else retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); + unlock_kernel(); dput_out: path_put(&path); return retval; @@ -2046,10 +2048,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, if (retval < 0) goto out3; - lock_kernel(); retval = do_mount((char *)dev_page, dir_page, (char *)type_page, flags, (void *)data_page); - unlock_kernel(); free_page(data_page); out3: -- cgit v1.1 From 7f78d4cd4c5d01864943c22b79df1b6bde923129 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2009 13:34:06 -0400 Subject: Push BKL down beyond VFS-only parts of do_mount() Signed-off-by: Al Viro --- fs/namespace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 4740f7b..b94325f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1515,8 +1515,11 @@ static int do_remount(struct path *path, int flags, int mnt_flags, down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); - else + else { + lock_kernel(); err = do_remount_sb(sb, flags, data, 0); + unlock_kernel(); + } if (!err) path->mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); @@ -1630,7 +1633,9 @@ static int do_new_mount(struct path *path, char *type, int flags, if (!capable(CAP_SYS_ADMIN)) return -EPERM; + lock_kernel(); mnt = do_kern_mount(type, flags, name, data); + unlock_kernel(); if (IS_ERR(mnt)) return PTR_ERR(mnt); @@ -1921,7 +1926,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, if (retval) goto dput_out; - lock_kernel(); if (flags & MS_REMOUNT) retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags, data_page); @@ -1934,7 +1938,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, else retval = do_new_mount(&path, type_page, flags, mnt_flags, dev_name, data_page); - unlock_kernel(); dput_out: path_put(&path); return retval; -- cgit v1.1 From 4aa98cf768b6f2ea4b204620d949a665959214f6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 8 May 2009 13:36:58 -0400 Subject: Push BKL down into do_remount_sb() [folded fix from Jiri Slaby] Signed-off-by: Al Viro --- fs/namespace.c | 10 ++-------- fs/super.c | 16 +++++++++++----- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index b94325f..2dd333b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1060,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags) * we just try to remount it readonly. */ down_write(&sb->s_umount); - if (!(sb->s_flags & MS_RDONLY)) { - lock_kernel(); + if (!(sb->s_flags & MS_RDONLY)) retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); - unlock_kernel(); - } up_write(&sb->s_umount); return retval; } @@ -1515,11 +1512,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags, down_write(&sb->s_umount); if (flags & MS_BIND) err = change_mount_flags(path->mnt, flags); - else { - lock_kernel(); + else err = do_remount_sb(sb, flags, data, 0); - unlock_kernel(); - } if (!err) path->mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); diff --git a/fs/super.c b/fs/super.c index 2a49fed..a64f362 100644 --- a/fs/super.c +++ b/fs/super.c @@ -542,25 +542,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) shrink_dcache_sb(sb); sync_filesystem(sb); + lock_kernel(); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { if (force) mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) + else if (!fs_may_remount_ro(sb)) { + unlock_kernel(); return -EBUSY; + } retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) + if (retval < 0 && retval != -ENOSYS) { + unlock_kernel(); return -EBUSY; + } } remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) + if (retval) { + unlock_kernel(); return retval; + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + unlock_kernel(); if (remount_rw) vfs_dq_quota_on_remount(sb); return 0; @@ -581,9 +589,7 @@ static void do_emergency_remount(struct work_struct *work) * * What lock protects sb->s_flags?? */ - lock_kernel(); do_remount_sb(sb, MS_RDONLY, NULL, 1); - unlock_kernel(); } up_write(&sb->s_umount); put_super(sb); -- cgit v1.1 From 01ba687577647beef6c5f2ea59bfb56fac9fcde2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2009 23:34:27 +0200 Subject: jffs2: move jffs2_write_super to super.c jffs2_write_super is only called from super.c and doesn't use any functionality from fs.c. So move it over to super.c and make it static there. [should go in through the vfs tree as it is a requirement for the next patch] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/jffs2/fs.c | 15 --------------- fs/jffs2/os-linux.h | 1 - fs/jffs2/super.c | 14 ++++++++++++++ 3 files changed, 14 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 249305d..237b27a 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -402,21 +402,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) return 0; } -void jffs2_write_super (struct super_block *sb) -{ - struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - sb->s_dirt = 0; - - if (sb->s_flags & MS_RDONLY) - return; - - D1(printk(KERN_DEBUG "jffs2_write_super()\n")); - jffs2_garbage_collect_trigger(c); - jffs2_erase_pending_blocks(c, 0); - jffs2_flush_wbuf_gc(c, 0); -} - - /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, fill in the raw_inode while you're at it. */ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 5e194a5..2228380 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -181,7 +181,6 @@ void jffs2_dirty_inode(struct inode *inode); struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri); int jffs2_statfs (struct dentry *, struct kstatfs *); -void jffs2_write_super (struct super_block *); int jffs2_remount_fs (struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 37b1212..a80a50e 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -53,6 +53,20 @@ static void jffs2_i_init_once(void *foo) inode_init_once(&f->vfs_inode); } +static void jffs2_write_super(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + sb->s_dirt = 0; + + if (sb->s_flags & MS_RDONLY) + return; + + D1(printk(KERN_DEBUG "jffs2_write_super()\n")); + jffs2_garbage_collect_trigger(c); + jffs2_erase_pending_blocks(c, 0); + jffs2_flush_wbuf_gc(c, 0); +} + static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); -- cgit v1.1 From ebc1ac164560a241d9bf1b7519062910c3f90a01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2009 23:35:03 +0200 Subject: ->write_super lock_super pushdown Push down lock_super into ->write_super instances and remove it from the caller. Following filesystem don't need ->s_lock in ->write_super and are skipped: * bfs, nilfs2 - no other uses of s_lock and have internal locks in ->write_super * ext2 - uses BKL in ext2_write_super and has internal calls without s_lock * reiserfs - no other uses of s_lock as has reiserfs_write_lock (BKL) in ->write_super * xfs - no other uses of s_lock and uses internal lock (buffer lock on superblock buffer) to serialize ->write_super. Also xfs_fs_write_super is superflous and will go away in the next merge window Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/affs/super.c | 2 ++ fs/exofs/super.c | 2 ++ fs/ext4/super.c | 4 +++- fs/fat/inode.c | 2 ++ fs/hfs/super.c | 8 +++++--- fs/hfsplus/super.c | 6 +++++- fs/jffs2/super.c | 15 +++++++++------ fs/super.c | 2 -- fs/sync.c | 4 ---- fs/sysv/inode.c | 2 ++ fs/ufs/super.c | 2 ++ 11 files changed, 32 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index d738646..280d361 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -54,6 +54,7 @@ affs_write_super(struct super_block *sb) int clean = 2; struct affs_sb_info *sbi = AFFS_SB(sb); + lock_super(sb); if (!(sb->s_flags & MS_RDONLY)) { // if (sbi->s_bitmap[i].bm_bh) { // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { @@ -66,6 +67,7 @@ affs_write_super(struct super_block *sb) sb->s_dirt = !clean; /* redo until bitmap synced */ } else sb->s_dirt = 0; + unlock_super(sb); pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index cd1f8b1..49e16af 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -214,6 +214,7 @@ static void exofs_write_super(struct super_block *sb) return; } + lock_super(sb); lock_kernel(); sbi = sb->s_fs_info; fscb->s_nextid = cpu_to_le64(sbi->s_nextid); @@ -246,6 +247,7 @@ out: if (or) osd_end_request(or); unlock_kernel(); + unlock_super(sb); kfree(fscb); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9c6834..c17200a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -579,7 +579,7 @@ static void ext4_put_super(struct super_block *sb) lock_super(sb); lock_kernel(); if (sb->s_dirt) - ext4_write_super(sb); + ext4_commit_super(sb, 1); ext4_release_system_zone(sb); ext4_mb_release(sb); @@ -3336,7 +3336,9 @@ int ext4_force_commit(struct super_block *sb) static void ext4_write_super(struct super_block *sb) { + lock_super(sb); ext4_commit_super(sb, 1); + unlock_super(sb); } static int ext4_sync_fs(struct super_block *sb, int wait) diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 2b88c93..2292cbf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -441,10 +441,12 @@ static void fat_clear_inode(struct inode *inode) static void fat_write_super(struct super_block *sb) { + lock_super(sb); sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) fat_clusters_flush(sb); + unlock_super(sb); } static void fat_put_super(struct super_block *sb) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 9f5eaa0..3aac417 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -49,11 +49,13 @@ MODULE_LICENSE("GPL"); */ static void hfs_write_super(struct super_block *sb) { + lock_super(sb); sb->s_dirt = 0; - if (sb->s_flags & MS_RDONLY) - return; + /* sync everything to the buffers */ - hfs_mdb_commit(sb); + if (!(sb->s_flags & MS_RDONLY)) + hfs_mdb_commit(sb); + unlock_super(sb); } /* diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9b292dc..1aab8aa 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -157,10 +157,12 @@ static void hfsplus_write_super(struct super_block *sb) struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; dprint(DBG_SUPER, "hfsplus_write_super\n"); + + lock_super(sb); sb->s_dirt = 0; if (sb->s_flags & MS_RDONLY) /* warn? */ - return; + goto out; vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); @@ -192,6 +194,8 @@ static void hfsplus_write_super(struct super_block *sb) } HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; } + out: + unlock_super(sb); } static void hfsplus_put_super(struct super_block *sb) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index a80a50e..f7bfd3a 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -56,15 +56,18 @@ static void jffs2_i_init_once(void *foo) static void jffs2_write_super(struct super_block *sb) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + + lock_super(sb); sb->s_dirt = 0; - if (sb->s_flags & MS_RDONLY) - return; + if (!(sb->s_flags & MS_RDONLY)) { + D1(printk(KERN_DEBUG "jffs2_write_super()\n")); + jffs2_garbage_collect_trigger(c); + jffs2_erase_pending_blocks(c, 0); + jffs2_flush_wbuf_gc(c, 0); + } - D1(printk(KERN_DEBUG "jffs2_write_super()\n")); - jffs2_garbage_collect_trigger(c); - jffs2_erase_pending_blocks(c, 0); - jffs2_flush_wbuf_gc(c, 0); + unlock_super(sb); } static int jffs2_sync_fs(struct super_block *sb, int wait) diff --git a/fs/super.c b/fs/super.c index a64f362..1905f4a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -420,10 +420,8 @@ restart: spin_unlock(&sb_lock); down_read(&sb->s_umount); - lock_super(sb); if (sb->s_root && sb->s_dirt) sb->s_op->write_super(sb); - unlock_super(sb); up_read(&sb->s_umount); spin_lock(&sb_lock); diff --git a/fs/sync.c b/fs/sync.c index 89c37f7..e9d56f6 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -33,10 +33,8 @@ static int __sync_filesystem(struct super_block *sb, int wait) else sync_quota_sb(sb, -1); sync_inodes_sb(sb, wait); - lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); - unlock_super(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); @@ -164,10 +162,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) /* sync the superblock to buffers */ sb = inode->i_sb; - lock_super(sb); if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); - unlock_super(sb); /* .. finally sync the buffers to disk */ err = sync_blockdev(sb->s_bdev); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index e0a39f1..a3f45fc 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -37,6 +37,7 @@ static void sysv_write_super(struct super_block *sb) struct sysv_sb_info *sbi = SYSV_SB(sb); unsigned long time = get_seconds(), old_time; + lock_super(sb); lock_kernel(); if (sb->s_flags & MS_RDONLY) goto clean; @@ -56,6 +57,7 @@ static void sysv_write_super(struct super_block *sb) clean: sb->s_dirt = 0; unlock_kernel(); + unlock_super(sb); } static int sysv_remount(struct super_block *sb, int *flags, char *data) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index a5ecabf..c97210e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1131,6 +1131,7 @@ static void ufs_write_super(struct super_block *sb) struct ufs_super_block_third * usb3; unsigned flags; + lock_super(sb); lock_kernel(); UFSD("ENTER\n"); flags = UFS_SB(sb)->s_flags; @@ -1150,6 +1151,7 @@ static void ufs_write_super(struct super_block *sb) sb->s_dirt = 0; UFSD("EXIT\n"); unlock_kernel(); + unlock_super(sb); } static void ufs_put_super(struct super_block *sb) -- cgit v1.1 From 9fd5746fd3d7838bf6ff991d50f1257057d1156f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 21 May 2009 16:01:00 -0400 Subject: fs: Remove i_cindex from struct inode The only user of the i_cindex element in the inode structure is used is by the firewire drivers. As part of an attempt to slim down the inode structure to save memory --- since a typical Linux system will have hundreds of thousands if not millions of inodes cached, a reduction in the size inode has high leverage. The firewire driver does not need i_cindex in any fast path, so it's simple enough to calculate when it is needed, instead of wasting space in the inode structure. Signed-off-by: "Theodore Ts'o" Cc: krh@redhat.com Cc: stefanr@s5r6.in-berlin.de Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Al Viro --- fs/char_dev.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/char_dev.c b/fs/char_dev.c index 38f7122..b7c9d51 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp) p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; - inode->i_cindex = idx; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) @@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp) return ret; } +int cdev_index(struct inode *inode) +{ + int idx; + struct kobject *kobj; + + kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); + if (!kobj) + return -1; + kobject_put(kobj); + return idx; +} + void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); @@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(cdev_index); EXPORT_SYMBOL(register_chrdev); EXPORT_SYMBOL(unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); -- cgit v1.1 From 13205fb9260c2377438599ef0773c6a3eaeb0b07 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 May 2009 09:30:45 +0200 Subject: ntfs: remove old debug check for dirty data in ntfs_put_super() This should not trigger anymore, so kill it. Acked-by: Anton Altaparmakov Signed-off-by: Jens Axboe Signed-off-by: Al Viro --- fs/ntfs/super.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index a9ec4e1..7a7b0d3 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2376,39 +2376,12 @@ static void ntfs_put_super(struct super_block *sb) vol->mftmirr_ino = NULL; } /* - * If any dirty inodes are left, throw away all mft data page cache - * pages to allow a clean umount. This should never happen any more - * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as - * the underlying mft records are written out and cleaned. If it does, - * happen anyway, we want to know... + * We should have no dirty inodes left, due to + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. */ ntfs_commit_inode(vol->mft_ino); write_inode_now(vol->mft_ino, 1); - if (sb_has_dirty_inodes(sb)) { - const char *s1, *s2; - - mutex_lock(&vol->mft_ino->i_mutex); - truncate_inode_pages(vol->mft_ino->i_mapping, 0); - mutex_unlock(&vol->mft_ino->i_mutex); - write_inode_now(vol->mft_ino, 1); - if (sb_has_dirty_inodes(sb)) { - static const char *_s1 = "inodes"; - static const char *_s2 = ""; - s1 = _s1; - s2 = _s2; - } else { - static const char *_s1 = "mft pages"; - static const char *_s2 = "They have been thrown " - "away. "; - s1 = _s1; - s2 = _s2; - } - ntfs_error(sb, "Dirty %s found at umount time. %sYou should " - "run chkdsk. Please email " - "linux-ntfs-dev@lists.sourceforge.net and say " - "that you saw this message. Thank you.", s1, - s2); - } #endif /* NTFS_RW */ iput(vol->mft_ino); -- cgit v1.1 From f95022161d23ee661a48af8f280472209f513a67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 5 Jun 2009 12:26:23 +0200 Subject: xfs: remove ->write_super and stop maintaining ->s_dirt the write_super method is used for (1) writing back the superblock periodically from pdflush (2) called just before ->sync_fs for data integerity syncs We don't need (1) because we have our own peridoc writeout through xfssyncd, and we don't need (2) because xfs_fs_sync_fs performs a proper synchronous superblock writeout after all other data and metadata has been written out. Also remove ->s_dirt tracking as it's only used to decide when too call ->write_super. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Al Viro --- fs/xfs/linux-2.6/xfs_super.c | 12 ------------ fs/xfs/xfs_trans.c | 2 -- 2 files changed, 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index bb68526..08d6bd9 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1104,15 +1104,6 @@ xfs_fs_put_super( kfree(mp); } -STATIC void -xfs_fs_write_super( - struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - xfs_sync_fsdata(XFS_M(sb), 0); - sb->s_dirt = 0; -} - STATIC int xfs_fs_sync_super( struct super_block *sb, @@ -1137,7 +1128,6 @@ xfs_fs_sync_super( error = xfs_quiesce_data(mp); else error = xfs_sync_fsdata(mp, 0); - sb->s_dirt = 0; if (unlikely(laptop_mode)) { int prev_sync_seq = mp->m_sync_seq; @@ -1443,7 +1433,6 @@ xfs_fs_fill_super( XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname); - sb->s_dirt = 1; sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; @@ -1533,7 +1522,6 @@ static struct super_operations xfs_super_operations = { .write_inode = xfs_fs_write_inode, .clear_inode = xfs_fs_clear_inode, .put_super = xfs_fs_put_super, - .write_super = xfs_fs_write_super, .sync_fs = xfs_fs_sync_super, .freeze_fs = xfs_fs_freeze, .statfs = xfs_fs_statfs, diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 8570b82..bcc39d3 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas( xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), offsetof(xfs_dsb_t, sb_frextents) + sizeof(sbp->sb_frextents) - 1); - - tp->t_mountp->m_super->s_dirt = 1; } /* -- cgit v1.1 From 545b9fd3d737afc0bb5203b1e79194a471605acd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 2 Jun 2009 12:07:47 +0200 Subject: fs: remove incorrect I_NEW warnings Some filesystems can call in to sync an inode that is still in the I_NEW state (eg. ext family, when mounted with -osync). This is OK because the filesystem has sole access to the new inode, so it can modify i_state without races (because no other thread should be modifying it, by definition of I_NEW). Ie. a false positive, so remove the warnings. The races are described here 7ef0d7377cb287e08f3ae94cebc919448e1f5dff, which is also where the warnings were introduced. Reported-by: Stephen Hemminger Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/fs-writeback.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e0fb2e7..efcedb6 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -289,7 +289,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) int ret; BUG_ON(inode->i_state & I_SYNC); - WARN_ON(inode->i_state & I_NEW); /* Set I_SYNC, reset I_DIRTY */ dirty = inode->i_state & I_DIRTY; @@ -314,7 +313,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); - WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (!(inode->i_state & I_DIRTY) && -- cgit v1.1 From 4195f73d1329e49727bcceb028e58cb38376c2b0 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 28 May 2009 09:01:15 +0200 Subject: fs: block_dump missing dentry locking I think the block_dump output in __mark_inode_dirty is missing dentry locking. Surely the i_dentry list can change any time, so we may not even *get* a dentry there. If we do get one by chance, then it would appear to be able to go away or get renamed at any time... Signed-off-by: Al Viro --- fs/fs-writeback.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index efcedb6..40308e9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi) clear_bit(BDI_pdflush, &bdi->state); } +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + /** * __mark_inode_dirty - internal function * @inode: inode to mark @@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) == flags) return; - if (unlikely(block_dump)) { - struct dentry *dentry = NULL; - const char *name = "?"; - - if (!list_empty(&inode->i_dentry)) { - dentry = list_entry(inode->i_dentry.next, - struct dentry, d_alias); - if (dentry && dentry->d_name.name) - name = (const char *) dentry->d_name.name; - } - - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - } + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { -- cgit v1.1 From 337eb00a2c3a421999c39c94ce7e33545ee8baa7 Mon Sep 17 00:00:00 2001 From: Alessio Igor Bogani Date: Tue, 12 May 2009 15:10:54 +0200 Subject: Push BKL down into ->remount_fs() [xfs, btrfs, capifs, shmem don't need BKL, exempt] Signed-off-by: Alessio Igor Bogani Signed-off-by: Al Viro --- fs/affs/super.c | 7 ++++++- fs/ext2/super.c | 12 ++++++++++-- fs/ext3/super.c | 4 ++++ fs/ext4/super.c | 4 ++++ fs/hpfs/super.c | 4 ++++ fs/jffs2/fs.c | 3 +++ fs/jfs/super.c | 22 ++++++++++++++++++---- fs/nfs/super.c | 2 ++ fs/nilfs2/super.c | 4 ++++ fs/ntfs/super.c | 15 ++++++++++++++- fs/ocfs2/super.c | 4 ++++ fs/reiserfs/super.c | 4 ++++ fs/super.c | 2 -- fs/ubifs/super.c | 9 ++++++++- fs/udf/super.c | 6 +++++- fs/ufs/super.c | 11 ++++++++++- 16 files changed, 100 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index 280d361..c481493 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "affs.h" extern struct timezone sys_tz; @@ -512,6 +513,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) kfree(new_opts); return -EINVAL; } + lock_kernel(); replace_mount_options(sb, new_opts); sbi->s_flags = mount_flags; @@ -519,8 +521,10 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_uid = uid; sbi->s_gid = gid; - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + unlock_kernel(); return 0; + } if (*flags & MS_RDONLY) { sb->s_dirt = 1; while (sb->s_dirt) @@ -529,6 +533,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) } else res = affs_init_bitmap(sb, flags); + unlock_kernel(); return res; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a44963d..f8cbdf5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1162,6 +1162,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) unsigned long old_sb_flags; int err; + lock_kernel(); + /* Store the old options */ old_sb_flags = sb->s_flags; old_opts.s_mount_opt = sbi->s_mount_opt; @@ -1197,12 +1199,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + unlock_kernel(); return 0; + } if (*flags & MS_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || - !(sbi->s_mount_state & EXT2_VALID_FS)) + !(sbi->s_mount_state & EXT2_VALID_FS)) { + unlock_kernel(); return 0; + } /* * OK, we are remounting a valid rw partition rdonly, so set * the rdonly flag and then mark the partition as valid again. @@ -1229,12 +1235,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) sb->s_flags &= ~MS_RDONLY; } ext2_sync_super(sb, es); + unlock_kernel(); return 0; restore_opts: sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sb->s_flags = old_sb_flags; + unlock_kernel(); return err; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index e213a26..26aa64d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2490,6 +2490,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) int i; #endif + lock_kernel(); + /* Store the original options */ lock_super(sb); old_sb_flags = sb->s_flags; @@ -2600,6 +2602,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) kfree(old_opts.s_qf_names[i]); #endif unlock_super(sb); + unlock_kernel(); return 0; restore_opts: sb->s_flags = old_sb_flags; @@ -2617,6 +2620,7 @@ restore_opts: } #endif unlock_super(sb); + unlock_kernel(); return err; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c17200a..012c425 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3422,6 +3422,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) int i; #endif + lock_kernel(); + /* Store the original options */ lock_super(sb); old_sb_flags = sb->s_flags; @@ -3558,6 +3560,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) kfree(old_opts.s_qf_names[i]); #endif unlock_super(sb); + unlock_kernel(); return 0; restore_opts: @@ -3578,6 +3581,7 @@ restore_opts: } #endif unlock_super(sb); + unlock_kernel(); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f68193c..f2feaa0 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -13,6 +13,7 @@ #include #include #include +#include /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */ @@ -398,6 +399,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) *flags |= MS_NOATIME; + lock_kernel(); lock_super(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; @@ -432,10 +434,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) replace_mount_options(s, new_opts); unlock_super(s); + unlock_kernel(); return 0; out_err: unlock_super(s); + unlock_kernel(); kfree(new_opts); return -EINVAL; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 237b27a..3451a81 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "nodelist.h" static int jffs2_flash_setup(struct jffs2_sb_info *c); @@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) This also catches the case where it was stopped and this is just a remount to restart it. Flush the writebuffer, if neccecary, else we loose it */ + lock_kernel(); if (!(sb->s_flags & MS_RDONLY)) { jffs2_stop_garbage_collect_thread(c); mutex_lock(&c->alloc_sem); @@ -399,6 +401,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) *flags |= MS_NOATIME; + unlock_kernel(); return 0; } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 3eb13ad..09b1b6e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "jfs_incore.h" #include "jfs_filsys.h" @@ -375,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) s64 newLVSize = 0; int rc = 0; int flag = JFS_SBI(sb)->flag; + int ret; if (!parse_options(data, sb, &newLVSize, &flag)) { return -EINVAL; } + lock_kernel(); if (newLVSize) { if (sb->s_flags & MS_RDONLY) { printk(KERN_ERR "JFS: resize requires volume to be mounted read-write\n"); + unlock_kernel(); return -EROFS; } rc = jfs_extendfs(sb, newLVSize, 0); - if (rc) + if (rc) { + unlock_kernel(); return rc; + } } if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { @@ -398,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); JFS_SBI(sb)->flag = flag; - return jfs_mount_rw(sb, 1); + ret = jfs_mount_rw(sb, 1); + unlock_kernel(); + return ret; } if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { rc = jfs_umount_rw(sb); JFS_SBI(sb)->flag = flag; + unlock_kernel(); return rc; } if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) if (!(sb->s_flags & MS_RDONLY)) { rc = jfs_umount_rw(sb); - if (rc) + if (rc) { + unlock_kernel(); return rc; + } JFS_SBI(sb)->flag = flag; - return jfs_mount_rw(sb, 1); + ret = jfs_mount_rw(sb, 1); + unlock_kernel(); + return ret; } JFS_SBI(sb)->flag = flag; + unlock_kernel(); return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d2d6778..26127b6 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (data == NULL) return -ENOMEM; + lock_kernel(); /* fill out struct with values from existing mount */ data->flags = nfss->flags; data->rsize = nfss->rsize; @@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) error = nfs_compare_remount_data(nfss, data); out: kfree(data); + unlock_kernel(); return error; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 7262e84..11151ea 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -906,6 +906,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct nilfs_mount_options old_opts; int err; + lock_kernel(); + old_sb_flags = sb->s_flags; old_opts.mount_opt = sbi->s_mount_opt; old_opts.snapshot_cno = sbi->s_snapshot_cno; @@ -985,6 +987,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) up(&sb->s_bdev->bd_mount_sem); } out: + unlock_kernel(); return 0; rw_remount_failed: @@ -993,6 +996,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.mount_opt; sbi->s_snapshot_cno = old_opts.snapshot_cno; + unlock_kernel(); return err; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 7a7b0d3..abaaa1c 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_volume *vol = NTFS_SB(sb); ntfs_debug("Entering with remount options string: %s", opt); + + lock_kernel(); #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ *flags |= MS_RDONLY; @@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) if (NVolErrors(vol)) { ntfs_error(sb, "Volume has errors and is read-only%s", es); + unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_IS_DIRTY) { ntfs_error(sb, "Volume is dirty and read-only%s", es); + unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { ntfs_error(sb, "Volume has been modified by chkdsk " "and is read-only%s", es); + unlock_kernel(); return -EROFS; } if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { @@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) "(0x%x) and is read-only%s", (unsigned)le16_to_cpu(vol->vol_flags), es); + unlock_kernel(); return -EROFS; } if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { ntfs_error(sb, "Failed to set dirty bit in volume " "information flags%s", es); + unlock_kernel(); return -EROFS; } #if 0 @@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_error(sb, "Failed to empty journal $LogFile%s", es); NVolSetErrors(vol); + unlock_kernel(); return -EROFS; } if (!ntfs_mark_quotas_out_of_date(vol)) { ntfs_error(sb, "Failed to mark quotas out of date%s", es); NVolSetErrors(vol); + unlock_kernel(); return -EROFS; } if (!ntfs_stamp_usnjrnl(vol)) { ntfs_error(sb, "Failed to stamp transation log " "($UsnJrnl)%s", es); NVolSetErrors(vol); + unlock_kernel(); return -EROFS; } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { @@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) // TODO: Deal with *flags. - if (!parse_options(vol, opt)) + if (!parse_options(vol, opt)) { + unlock_kernel(); return -EINVAL; + } + unlock_kernel(); ntfs_debug("Done."); return 0; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0273759..201b40a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -42,6 +42,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_SUPER #include @@ -581,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) struct mount_options parsed_options; struct ocfs2_super *osb = OCFS2_SB(sb); + lock_kernel(); + if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { ret = -EINVAL; goto out; @@ -684,6 +687,7 @@ unlock_osb: ocfs2_set_journal_params(osb); } out: + unlock_kernel(); return ret; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 90dcb7b..2969773 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,6 +28,7 @@ #include #include #include +#include struct file_system_type reiserfs_fs_type; @@ -1196,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif + lock_kernel(); rs = SB_DISK_SUPER_BLOCK(s); if (!reiserfs_parse_options @@ -1318,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) out_ok: replace_mount_options(s, new_opts); + unlock_kernel(); return 0; out_err: kfree(new_opts); + unlock_kernel(); return err; } diff --git a/fs/super.c b/fs/super.c index 1905f4a..83b47416 100644 --- a/fs/super.c +++ b/fs/super.c @@ -540,7 +540,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) shrink_dcache_sb(sb); sync_filesystem(sb); - lock_kernel(); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { @@ -566,7 +565,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); - unlock_kernel(); if (remount_rw) vfs_dq_quota_on_remount(sb); return 0; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 522c3fd..3589eab 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "ubifs.h" /* @@ -1770,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } + lock_kernel(); if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); + unlock_kernel(); return -EROFS; } err = ubifs_remount_rw(c); - if (err) + if (err) { + unlock_kernel(); return err; + } } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { if (c->ro_media) { ubifs_msg("cannot re-mount due to prior errors"); + unlock_kernel(); return -EROFS; } ubifs_remount_ro(c); @@ -1795,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) } ubifs_assert(c->lst.taken_empty_lebs > 0); + unlock_kernel(); return 0; } diff --git a/fs/udf/super.c b/fs/udf/super.c index 04802cc..6832135 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) if (!udf_parse_options(options, &uopt, true)) return -EINVAL; + lock_kernel(); sbi->s_flags = uopt.flags; sbi->s_uid = uopt.uid; sbi->s_gid = uopt.gid; @@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) *flags |= MS_RDONLY; } - if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { + unlock_kernel(); return 0; + } if (*flags & MS_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); + unlock_kernel(); return 0; } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index c97210e..6560dda 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function, struct ufs_super_block_first * usb1; va_list args; + lock_kernel(); uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); @@ -1182,7 +1183,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) struct ufs_super_block_third * usb3; unsigned new_mount_opt, ufstype; unsigned flags; - + + lock_kernel(); lock_super(sb); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; @@ -1198,6 +1200,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { unlock_super(sb); + unlock_kernel(); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { @@ -1205,12 +1208,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { printk("ufstype can't be changed during remount\n"); unlock_super(sb); + unlock_kernel(); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; unlock_super(sb); + unlock_kernel(); return 0; } @@ -1236,6 +1241,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) printk("ufs was compiled with read-only support, " "can't be mounted as read-write\n"); unlock_super(sb); + unlock_kernel(); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1245,11 +1251,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_UFS2) { printk("this ufstype is read-only supported\n"); unlock_super(sb); + unlock_kernel(); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { printk("failed during remounting\n"); unlock_super(sb); + unlock_kernel(); return -EPERM; } sb->s_flags &= ~MS_RDONLY; @@ -1257,6 +1265,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } UFS_SB(sb)->s_mount_opt = new_mount_opt; unlock_super(sb); + unlock_kernel(); return 0; } -- cgit v1.1 From d5aacad548db1ff547adf35d0a77eb2a8ed4fe14 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 14:56:44 -0400 Subject: New helper - simple_fsync() writes associated buffers, then does sync_inode() to write the inode itself (and to make it clean). Depends on ->write_inode() honouring the second argument. Signed-off-by: Al Viro --- fs/libfs.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 80046dd..ddfa899 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include @@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, } EXPORT_SYMBOL_GPL(generic_fh_to_parent); +int simple_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* metadata-only; caller takes care of data */ + }; + struct inode *inode = dentry->d_inode; + int err; + int ret; + + ret = sync_mapping_buffers(inode->i_mapping); + if (!(inode->i_state & I_DIRTY)) + return ret; + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + return ret; + + err = sync_inode(inode, &wbc); + if (ret == 0) + ret = err; + return ret; +} +EXPORT_SYMBOL(simple_fsync); + EXPORT_SYMBOL(dcache_dir_close); EXPORT_SYMBOL(dcache_dir_lseek); EXPORT_SYMBOL(dcache_dir_open); -- cgit v1.1 From 79d25767583e4e086f8309bfd1f502660a64fe7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 09:30:08 -0400 Subject: Sanitize qnx4 fsync handling * have directory operations use mark_buffer_dirty_inode(), so that sync_mapping_buffers() would get those. * make qnx4_write_inode() honour its last argument. * get rid of insane copies of very ancient "walk the indirect blocks" in qnx4/fsync - they never matched the actual fs layout and, fortunately, never'd been called. Again, all this junk is not needed; ->fsync() should just do sync_mapping_buffers + sync_inode (and if we implement block allocation for qnx4, we'll need to use mark_buffer_dirty_inode() for extent blocks) Signed-off-by: Al Viro --- fs/qnx4/Makefile | 2 +- fs/qnx4/dir.c | 2 +- fs/qnx4/file.c | 2 +- fs/qnx4/fsync.c | 169 ------------------------------------------------------- fs/qnx4/inode.c | 38 ++++--------- fs/qnx4/namei.c | 4 +- 6 files changed, 17 insertions(+), 200 deletions(-) delete mode 100644 fs/qnx4/fsync.c (limited to 'fs') diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile index 502d7fe..e4d408c 100644 --- a/fs/qnx4/Makefile +++ b/fs/qnx4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_QNX4FS_FS) += qnx4.o -qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o +qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index ea9ffef..ff6c1ba 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -84,7 +84,7 @@ const struct file_operations qnx4_dir_operations = { .read = generic_read_dir, .readdir = qnx4_readdir, - .fsync = file_fsync, + .fsync = simple_fsync, }; const struct inode_operations qnx4_dir_inode_operations = diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index 867f42b..e7033ea 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c @@ -29,7 +29,7 @@ const struct file_operations qnx4_file_operations = #ifdef CONFIG_QNX4FS_RW .write = do_sync_write, .aio_write = generic_file_aio_write, - .fsync = qnx4_sync_file, + .fsync = simple_fsync, #endif }; diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c deleted file mode 100644 index aa3b195..0000000 --- a/fs/qnx4/fsync.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * QNX4 file system, Linux implementation. - * - * Version : 0.1 - * - * Using parts of the xiafs filesystem. - * - * History : - * - * 24-03-1998 by Richard Frowijn : first release. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* - * The functions for qnx4 fs file synchronization. - */ - -#ifdef CONFIG_QNX4FS_RW - -static int sync_block(struct inode *inode, unsigned short *block, int wait) -{ - struct buffer_head *bh; - unsigned short tmp; - - if (!*block) - return 0; - tmp = *block; - bh = sb_find_get_block(inode->i_sb, *block); - if (!bh) - return 0; - if (*block != tmp) { - brelse(bh); - return 1; - } - if (wait && buffer_req(bh) && !buffer_uptodate(bh)) { - brelse(bh); - return -1; - } - if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) { - brelse(bh); - return 0; - } - ll_rw_block(WRITE, 1, &bh); - atomic_dec(&bh->b_count); - return 0; -} - -#ifdef WTF -static int sync_iblock(struct inode *inode, unsigned short *iblock, - struct buffer_head **bh, int wait) -{ - int rc; - unsigned short tmp; - - *bh = NULL; - tmp = *iblock; - if (!tmp) - return 0; - rc = sync_block(inode, iblock, wait); - if (rc) - return rc; - *bh = sb_bread(inode->i_sb, tmp); - if (tmp != *iblock) { - brelse(*bh); - *bh = NULL; - return 1; - } - if (!*bh) - return -1; - return 0; -} -#endif - -static int sync_direct(struct inode *inode, int wait) -{ - int i; - int rc, err = 0; - - for (i = 0; i < 7; i++) { - rc = sync_block(inode, - (unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait); - if (rc > 0) - break; - if (rc) - err = rc; - } - return err; -} - -#ifdef WTF -static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait) -{ - int i; - struct buffer_head *ind_bh; - int rc, err = 0; - - rc = sync_iblock(inode, iblock, &ind_bh, wait); - if (rc || !ind_bh) - return rc; - - for (i = 0; i < 512; i++) { - rc = sync_block(inode, - ((unsigned short *) ind_bh->b_data) + i, - wait); - if (rc > 0) - break; - if (rc) - err = rc; - } - brelse(ind_bh); - return err; -} - -static int sync_dindirect(struct inode *inode, unsigned short *diblock, - int wait) -{ - int i; - struct buffer_head *dind_bh; - int rc, err = 0; - - rc = sync_iblock(inode, diblock, &dind_bh, wait); - if (rc || !dind_bh) - return rc; - - for (i = 0; i < 512; i++) { - rc = sync_indirect(inode, - ((unsigned short *) dind_bh->b_data) + i, - wait); - if (rc > 0) - break; - if (rc) - err = rc; - } - brelse(dind_bh); - return err; -} -#endif - -int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused) -{ - struct inode *inode = dentry->d_inode; - int wait, err = 0; - - (void) file; - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return -EINVAL; - - lock_kernel(); - for (wait = 0; wait <= 1; wait++) { - err |= sync_direct(inode, wait); - } - err |= qnx4_sync_inode(inode); - unlock_kernel(); - return (err < 0) ? -EIO : 0; -} - -#endif diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 95c12fc..4071286 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -34,31 +35,6 @@ static const struct super_operations qnx4_sops; #ifdef CONFIG_QNX4FS_RW -int qnx4_sync_inode(struct inode *inode) -{ - int err = 0; -# if 0 - struct buffer_head *bh; - - bh = qnx4_update_inode(inode); - if (bh && buffer_dirty(bh)) - { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - { - printk ("IO error syncing qnx4 inode [%s:%08lx]\n", - inode->i_sb->s_id, inode->i_ino); - err = -1; - } - brelse (bh); - } else if (!bh) { - err = -1; - } -# endif - - return err; -} - static void qnx4_delete_inode(struct inode *inode) { QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino)); @@ -70,7 +46,7 @@ static void qnx4_delete_inode(struct inode *inode) unlock_kernel(); } -static int qnx4_write_inode(struct inode *inode, int unused) +static int qnx4_write_inode(struct inode *inode, int do_sync) { struct qnx4_inode_entry *raw_inode; int block, ino; @@ -107,6 +83,16 @@ static int qnx4_write_inode(struct inode *inode, int unused) raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec); raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks); mark_buffer_dirty(bh); + if (do_sync) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk("qnx4: IO error syncing inode [%s:%08x]\n", + inode->i_sb->s_id, ino); + brelse(bh); + unlock_kernel(); + return -EIO; + } + } brelse(bh); unlock_kernel(); return 0; diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 775eed3..123270c 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -187,7 +187,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); clear_nlink(inode); mark_inode_dirty(inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; @@ -232,7 +232,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry) de->di_status = 0; memset(de->di_fname, 0, sizeof de->di_fname); de->di_mode = 0; - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; -- cgit v1.1 From 964f5369667b342994fe3f384e9ba41d404ee796 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 09:47:13 -0400 Subject: fs/qnx4: sanitize includes fs-internal parts of qnx4_fs.h taken to fs/qnx4/qnx4.h, includes adjusted, qnx4_fs.h doesn't need unifdef anymore. Signed-off-by: Al Viro --- fs/qnx4/bitmap.c | 7 +------ fs/qnx4/dir.c | 7 +------ fs/qnx4/file.c | 3 +-- fs/qnx4/inode.c | 11 +++-------- fs/qnx4/namei.c | 9 +-------- fs/qnx4/qnx4.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/qnx4/truncate.c | 6 +----- 7 files changed, 65 insertions(+), 35 deletions(-) create mode 100644 fs/qnx4/qnx4.h (limited to 'fs') diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c index 8425cf6..e1cd061 100644 --- a/fs/qnx4/bitmap.c +++ b/fs/qnx4/bitmap.c @@ -13,14 +13,9 @@ * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) . */ -#include -#include -#include -#include -#include -#include #include #include +#include "qnx4.h" #if 0 int qnx4_new_block(struct super_block *sb) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index ff6c1ba..003c68f 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -11,14 +11,9 @@ * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support. */ -#include -#include -#include -#include -#include #include #include - +#include "qnx4.h" static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir) { diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c index e7033ea..09b170a 100644 --- a/fs/qnx4/file.c +++ b/fs/qnx4/file.c @@ -12,8 +12,7 @@ * 27-06-1998 by Frank Denis : file overwriting. */ -#include -#include +#include "qnx4.h" /* * We have mostly NULL's here: the current defaults are ok for diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 4071286..681df5f 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -13,20 +13,15 @@ */ #include -#include -#include -#include -#include -#include -#include #include +#include #include #include #include #include #include -#include -#include +#include +#include "qnx4.h" #define QNX4_VERSION 4 #define QNX4_BMNAME ".bitmap" diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 123270c..5972ed2 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -12,16 +12,9 @@ * 04-07-1998 by Frank Denis : first step for rmdir/unlink. */ -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include "qnx4.h" /* diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h new file mode 100644 index 0000000..9efc089 --- /dev/null +++ b/fs/qnx4/qnx4.h @@ -0,0 +1,57 @@ +#include +#include + +#define QNX4_DEBUG 0 + +#if QNX4_DEBUG +#define QNX4DEBUG(X) printk X +#else +#define QNX4DEBUG(X) (void) 0 +#endif + +struct qnx4_sb_info { + struct buffer_head *sb_buf; /* superblock buffer */ + struct qnx4_super_block *sb; /* our superblock */ + unsigned int Version; /* may be useful */ + struct qnx4_inode_entry *BitMap; /* useful */ +}; + +struct qnx4_inode_info { + struct qnx4_inode_entry raw; + loff_t mmu_private; + struct inode vfs_inode; +}; + +extern struct inode *qnx4_iget(struct super_block *, unsigned long); +extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd); +extern unsigned long qnx4_count_free_blocks(struct super_block *sb); +extern unsigned long qnx4_block_map(struct inode *inode, long iblock); + +extern struct buffer_head *qnx4_bread(struct inode *, int, int); + +extern const struct inode_operations qnx4_file_inode_operations; +extern const struct inode_operations qnx4_dir_inode_operations; +extern const struct file_operations qnx4_file_operations; +extern const struct file_operations qnx4_dir_operations; +extern int qnx4_is_free(struct super_block *sb, long block); +extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); +extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); +extern void qnx4_truncate(struct inode *inode); +extern void qnx4_free_inode(struct inode *inode); +extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); +extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry); + +static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct qnx4_inode_info *qnx4_i(struct inode *inode) +{ + return container_of(inode, struct qnx4_inode_info, vfs_inode); +} + +static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode) +{ + return &qnx4_i(inode)->raw; +} diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c index 6437c1c..d94d9ee 100644 --- a/fs/qnx4/truncate.c +++ b/fs/qnx4/truncate.c @@ -10,12 +10,8 @@ * 30-06-1998 by Frank DENIS : ugly filler. */ -#include -#include -#include -#include #include -#include +#include "qnx4.h" #ifdef CONFIG_QNX4FS_RW -- cgit v1.1 From b522412aeabadbb302fd4338eaabf09d10e2d29c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 13:44:36 -0400 Subject: Sanitize ->fsync() for FAT * mark directory data blocks as assoc. metadata * add new inode to deal with FAT, mark FAT blocks as assoc. metadata of that * now ->fsync() is trivial both for files and directories Signed-off-by: Al Viro --- fs/fat/dir.c | 16 ++++++++-------- fs/fat/fat.h | 6 ++++++ fs/fat/fatent.c | 13 ++++++++----- fs/fat/file.c | 14 +++++++++++++- fs/fat/inode.c | 11 ++++++++++- fs/fat/namei_msdos.c | 4 ++-- fs/fat/namei_vfat.c | 4 ++-- 7 files changed, 49 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 3a7f603..f350029 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif - .fsync = file_fsync, + .fsync = fat_file_fsync, }; static int fat_get_short_entry(struct inode *dir, loff_t *pos, @@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) de++; nr_slots--; } - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); @@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) de--; nr_slots--; } - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); @@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, } memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); - mark_buffer_dirty(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); n++; blknr++; @@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); - mark_buffer_dirty(bhs[0]); + mark_buffer_dirty_inode(bhs[0], dir); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); if (err) @@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, slots += copy; size -= copy; set_buffer_uptodate(bhs[n]); - mark_buffer_dirty(bhs[n]); + mark_buffer_dirty_inode(bhs[n], dir); if (!size) break; n++; @@ -1293,7 +1293,7 @@ found: for (i = 0; i < long_bhs; i++) { int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); - mark_buffer_dirty(bhs[i]); + mark_buffer_dirty_inode(bhs[i], dir); offset = 0; slots += copy; size -= copy; @@ -1304,7 +1304,7 @@ found: /* Fill the short name slot. */ int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); - mark_buffer_dirty(bhs[i]); + mark_buffer_dirty_inode(bhs[i], dir); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bhs[i]); } diff --git a/fs/fat/fat.h b/fs/fat/fat.h index ea440d6..e4d8852 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -74,6 +74,7 @@ struct msdos_sb_info { int fatent_shift; struct fatent_operations *fatent_ops; + struct inode *fat_inode; spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[FAT_HASH_SIZE]; @@ -251,6 +252,7 @@ struct fat_entry { } u; int nr_bhs; struct buffer_head *bhs[2]; + struct inode *fat_inode; }; static inline void fatent_init(struct fat_entry *fatent) @@ -259,6 +261,7 @@ static inline void fatent_init(struct fat_entry *fatent) fatent->entry = 0; fatent->u.ent32_p = NULL; fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; } static inline void fatent_set_entry(struct fat_entry *fatent, int entry) @@ -275,6 +278,7 @@ static inline void fatent_brelse(struct fat_entry *fatent) brelse(fatent->bhs[i]); fatent->nr_bhs = 0; fatent->bhs[0] = fatent->bhs[1] = NULL; + fatent->fat_inode = NULL; } extern void fat_ent_access_init(struct super_block *sb); @@ -296,6 +300,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr); extern void fat_truncate(struct inode *inode); extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +extern int fat_file_fsync(struct file *file, struct dentry *dentry, + int datasync); /* fat/inode.c */ extern void fat_attach(struct inode *inode, loff_t i_pos); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index da6eea4..618f530 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, struct buffer_head **bhs = fatent->bhs; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; + bhs[0] = sb_bread(sb, blocknr); if (!bhs[0]) goto err; @@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); + fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", @@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new) } spin_unlock(&fat12_entry_lock); - mark_buffer_dirty(fatent->bhs[0]); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); if (fatent->nr_bhs == 2) - mark_buffer_dirty(fatent->bhs[1]); + mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode); } static void fat16_ent_put(struct fat_entry *fatent, int new) @@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new) new = EOF_FAT16; *fatent->u.ent16_p = cpu_to_le16(new); - mark_buffer_dirty(fatent->bhs[0]); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static void fat32_ent_put(struct fat_entry *fatent, int new) @@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new) WARN_ON(new & 0xf0000000); new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff; *fatent->u.ent32_p = cpu_to_le32(new); - mark_buffer_dirty(fatent->bhs[0]); + mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode); } static int fat12_ent_next(struct fat_entry *fatent) @@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, } memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); - mark_buffer_dirty(c_bh); + mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & MS_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); diff --git a/fs/fat/file.c b/fs/fat/file.c index 0a7f4a9..e955a56 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -133,6 +133,18 @@ static int fat_file_release(struct inode *inode, struct file *filp) return 0; } +int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int res, err; + + res = simple_fsync(filp, dentry, datasync); + err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); + + return res ? res : err; +} + + const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -142,7 +154,7 @@ const struct file_operations fat_file_operations = { .mmap = generic_file_mmap, .release = fat_file_release, .ioctl = fat_generic_ioctl, - .fsync = file_fsync, + .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 2292cbf..476f80b 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -458,6 +458,8 @@ static void fat_put_super(struct super_block *sb) if (sb->s_dirt) fat_write_super(sb); + iput(sbi->fat_inode); + if (sbi->nls_disk) { unload_nls(sbi->nls_disk); sbi->nls_disk = NULL; @@ -1183,7 +1185,7 @@ static int fat_read_root(struct inode *inode) int fat_fill_super(struct super_block *sb, void *data, int silent, const struct inode_operations *fs_dir_inode_ops, int isvfat) { - struct inode *root_inode = NULL; + struct inode *root_inode = NULL, *fat_inode = NULL; struct buffer_head *bh; struct fat_boot_sector *b; struct msdos_sb_info *sbi; @@ -1423,6 +1425,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, } error = -ENOMEM; + fat_inode = new_inode(sb); + if (!fat_inode) + goto out_fail; + MSDOS_I(fat_inode)->i_pos = 0; + sbi->fat_inode = fat_inode; root_inode = new_inode(sb); if (!root_inode) goto out_fail; @@ -1448,6 +1455,8 @@ out_invalid: " on dev %s.\n", sb->s_id); out_fail: + if (fat_inode) + iput(fat_inode); if (root_inode) iput(root_inode); if (sbi->nls_io) diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index da3f361..20f5228 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, int start = MSDOS_I(new_dir)->i_logstart; dotdot_de->start = cpu_to_le16(start); dotdot_de->starthi = cpu_to_le16(start >> 16); - mark_buffer_dirty(dotdot_bh); + mark_buffer_dirty_inode(dotdot_bh, old_inode); if (IS_DIRSYNC(new_dir)) { err = sync_dirty_buffer(dotdot_bh); if (err) @@ -586,7 +586,7 @@ error_dotdot: int start = MSDOS_I(old_dir)->i_logstart; dotdot_de->start = cpu_to_le16(start); dotdot_de->starthi = cpu_to_le16(start >> 16); - mark_buffer_dirty(dotdot_bh); + mark_buffer_dirty_inode(dotdot_bh, old_inode); corrupt |= sync_dirty_buffer(dotdot_bh); } error_inode: diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index a0e00e3..b50ecbe 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, int start = MSDOS_I(new_dir)->i_logstart; dotdot_de->start = cpu_to_le16(start); dotdot_de->starthi = cpu_to_le16(start >> 16); - mark_buffer_dirty(dotdot_bh); + mark_buffer_dirty_inode(dotdot_bh, old_inode); if (IS_DIRSYNC(new_dir)) { err = sync_dirty_buffer(dotdot_bh); if (err) @@ -1009,7 +1009,7 @@ error_dotdot: int start = MSDOS_I(old_dir)->i_logstart; dotdot_de->start = cpu_to_le16(start); dotdot_de->starthi = cpu_to_le16(start >> 16); - mark_buffer_dirty(dotdot_bh); + mark_buffer_dirty_inode(dotdot_bh, old_inode); corrupt |= sync_dirty_buffer(dotdot_bh); } error_inode: -- cgit v1.1 From e1740a462ecb2eae213be15857b577cc6f6bb8b4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 15:14:02 -0400 Subject: switch ext2 to simple_fsync() kill ext2_sync_file() (along with ext2/fsync.c), get rid of ext2_update_inode() - it's an alias of ext2_write_inode(). Signed-off-by: Al Viro --- fs/ext2/Makefile | 2 +- fs/ext2/dir.c | 2 +- fs/ext2/ext2.h | 3 --- fs/ext2/file.c | 4 ++-- fs/ext2/fsync.c | 50 -------------------------------------------------- fs/ext2/inode.c | 11 ++--------- 6 files changed, 6 insertions(+), 66 deletions(-) delete mode 100644 fs/ext2/fsync.c (limited to 'fs') diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile index e0b2b43..f42af45 100644 --- a/fs/ext2/Makefile +++ b/fs/ext2/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_EXT2_FS) += ext2.o -ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \ +ext2-y := balloc.o dir.o file.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 2999d72..0035004 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = ext2_compat_ioctl, #endif - .fsync = ext2_sync_file, + .fsync = simple_fsync, }; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 3203042..b2bbf45 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -113,9 +113,6 @@ extern int ext2_empty_dir (struct inode *); extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); -/* fsync.c */ -extern int ext2_sync_file (struct file *, struct dentry *, int); - /* ialloc.c */ extern struct inode * ext2_new_inode (struct inode *, int); extern void ext2_free_inode (struct inode *); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 45ed071..2b9e47d 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = { .mmap = generic_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = ext2_sync_file, + .fsync = simple_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; @@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = { .mmap = xip_file_mmap, .open = generic_file_open, .release = ext2_release_file, - .fsync = ext2_sync_file, + .fsync = simple_fsync, }; #endif diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c deleted file mode 100644 index fc66c93..0000000 --- a/fs/ext2/fsync.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * linux/fs/ext2/fsync.c - * - * Copyright (C) 1993 Stephen Tweedie (sct@dcs.ed.ac.uk) - * from - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds - * - * ext2fs fsync primitive - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. Miller (davem@caip.rutgers.edu), 1995 - * - * Removed unnecessary code duplication for little endian machines - * and excessive __inline__s. - * Andi Kleen, 1997 - * - * Major simplications and cleanup - we only need to do the metadata, because - * we can depend on generic_block_fdatasync() to sync the data blocks. - */ - -#include "ext2.h" -#include /* for sync_mapping_buffers() */ - - -/* - * File may be NULL when we are called. Perhaps we shouldn't - * even pass file to fsync ? - */ - -int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int err; - int ret; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = ext2_sync_inode(inode); - if (ret == 0) - ret = err; - return ret; -} diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index acf6788..29ed682 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others"); MODULE_DESCRIPTION("Second Extended Filesystem"); MODULE_LICENSE("GPL"); -static int ext2_update_inode(struct inode * inode, int do_sync); - /* * Test whether an inode is a fast symlink. */ @@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode) goto no_delete; EXT2_I(inode)->i_dtime = get_seconds(); mark_inode_dirty(inode); - ext2_update_inode(inode, inode_needs_sync(inode)); + ext2_write_inode(inode, inode_needs_sync(inode)); inode->i_size = 0; if (inode->i_blocks) @@ -1337,7 +1335,7 @@ bad_inode: return ERR_PTR(ret); } -static int ext2_update_inode(struct inode * inode, int do_sync) +int ext2_write_inode(struct inode *inode, int do_sync) { struct ext2_inode_info *ei = EXT2_I(inode); struct super_block *sb = inode->i_sb; @@ -1442,11 +1440,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync) return err; } -int ext2_write_inode(struct inode *inode, int wait) -{ - return ext2_update_inode(inode, wait); -} - int ext2_sync_inode(struct inode *inode) { struct writeback_control wbc = { -- cgit v1.1 From 0d7916d7e985da52cdd2989c900485e17b035972 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 15:21:06 -0400 Subject: switch minix to simple_fsync() * get minix_write_inode() to honour the second argument * now we can use simple_fsync() for minixfs Signed-off-by: Al Viro --- fs/minix/dir.c | 2 +- fs/minix/file.c | 20 +------------------- fs/minix/inode.c | 33 ++++++++++----------------------- fs/minix/minix.h | 2 -- 4 files changed, 12 insertions(+), 45 deletions(-) (limited to 'fs') diff --git a/fs/minix/dir.c b/fs/minix/dir.c index d4946c4..e5f2064 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t); const struct file_operations minix_dir_operations = { .read = generic_read_dir, .readdir = minix_readdir, - .fsync = minix_sync_file, + .fsync = simple_fsync, }; static inline void dir_put_page(struct page *page) diff --git a/fs/minix/file.c b/fs/minix/file.c index 17765f6..3eec3e6 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -6,15 +6,12 @@ * minix regular file handling primitives */ -#include /* for fsync_inode_buffers() */ #include "minix.h" /* * We have mostly NULLs here: the current defaults are OK for * the minix filesystem. */ -int minix_sync_file(struct file *, struct dentry *, int); - const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = minix_sync_file, + .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; @@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = { .truncate = minix_truncate, .getattr = minix_getattr, }; - -int minix_sync_file(struct file * file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int err; - - err = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return err; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return err; - - err |= minix_sync_inode(inode); - return err ? -EIO : 0; -} diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 7eb5397..f91a236 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -556,38 +556,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) return bh; } -static struct buffer_head *minix_update_inode(struct inode *inode) -{ - if (INODE_VERSION(inode) == MINIX_V1) - return V1_minix_update_inode(inode); - else - return V2_minix_update_inode(inode); -} - -static int minix_write_inode(struct inode * inode, int wait) -{ - brelse(minix_update_inode(inode)); - return 0; -} - -int minix_sync_inode(struct inode * inode) +static int minix_write_inode(struct inode *inode, int wait) { int err = 0; struct buffer_head *bh; - bh = minix_update_inode(inode); - if (bh && buffer_dirty(bh)) - { + if (INODE_VERSION(inode) == MINIX_V1) + bh = V1_minix_update_inode(inode); + else + bh = V2_minix_update_inode(inode); + if (!bh) + return -EIO; + if (wait && buffer_dirty(bh)) { sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - { + if (buffer_req(bh) && !buffer_uptodate(bh)) { printk("IO error syncing minix inode [%s:%08lx]\n", inode->i_sb->s_id, inode->i_ino); - err = -1; + err = -EIO; } } - else if (!bh) - err = -1; brelse (bh); return err; } diff --git a/fs/minix/minix.h b/fs/minix/minix.h index e6a0b19..cb7fdd1 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -57,7 +57,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping, extern void V1_minix_truncate(struct inode *); extern void V2_minix_truncate(struct inode *); extern void minix_truncate(struct inode *); -extern int minix_sync_inode(struct inode *); extern void minix_set_inode(struct inode *, dev_t); extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int); extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int); @@ -72,7 +71,6 @@ extern int minix_empty_dir(struct inode*); extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*); extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**); extern ino_t minix_inode_by_name(struct dentry*); -extern int minix_sync_file(struct file *, struct dentry *, int); extern const struct inode_operations minix_file_inode_operations; extern const struct inode_operations minix_dir_inode_operations; -- cgit v1.1 From 05459ca81ac3064cb040d983342bc453cccec458 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 15:29:45 -0400 Subject: repair sysv_write_inode(), switch sysv to simple_fsync() Signed-off-by: Al Viro --- fs/sysv/dir.c | 2 +- fs/sysv/file.c | 17 +---------------- fs/sysv/inode.c | 45 ++++++++++++++++----------------------------- fs/sysv/sysv.h | 1 - 4 files changed, 18 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 56f6552..c779807 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t); const struct file_operations sysv_dir_operations = { .read = generic_read_dir, .readdir = sysv_readdir, - .fsync = sysv_sync_file, + .fsync = simple_fsync, }; static inline void dir_put_page(struct page *page) diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 589be21..96340c0 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = { .write = do_sync_write, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = sysv_sync_file, + .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; @@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = { .truncate = sysv_truncate, .getattr = sysv_getattr, }; - -int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int err; - - err = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return err; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return err; - - err |= sysv_sync_inode(inode); - return err ? -EIO : 0; -} diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index a3f45fc..425c976 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -247,7 +247,7 @@ bad_inode: return ERR_PTR(-EIO); } -static struct buffer_head * sysv_update_inode(struct inode * inode) +int sysv_write_inode(struct inode *inode, int wait) { struct super_block * sb = inode->i_sb; struct sysv_sb_info * sbi = SYSV_SB(sb); @@ -255,19 +255,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode) struct sysv_inode * raw_inode; struct sysv_inode_info * si; unsigned int ino, block; + int err = 0; ino = inode->i_ino; if (!ino || ino > sbi->s_ninodes) { printk("Bad inode number on dev %s: %d is out of range\n", inode->i_sb->s_id, ino); - return NULL; + return -EIO; } raw_inode = sysv_raw_inode(sb, ino, &bh); if (!raw_inode) { printk("unable to read i-node block\n"); - return NULL; + return -EIO; } + lock_kernel(); raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); @@ -283,38 +285,23 @@ static struct buffer_head * sysv_update_inode(struct inode * inode) for (block = 0; block < 10+1+1+1; block++) write3byte(sbi, (u8 *)&si->i_data[block], &raw_inode->i_data[3*block]); + unlock_kernel(); mark_buffer_dirty(bh); - return bh; -} - -int sysv_write_inode(struct inode * inode, int wait) -{ - struct buffer_head *bh; - lock_kernel(); - bh = sysv_update_inode(inode); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + printk ("IO error syncing sysv inode [%s:%08x]\n", + sb->s_id, ino); + err = -EIO; + } + } brelse(bh); - unlock_kernel(); return 0; } -int sysv_sync_inode(struct inode * inode) +int sysv_sync_inode(struct inode *inode) { - int err = 0; - struct buffer_head *bh; - - bh = sysv_update_inode(inode); - if (bh && buffer_dirty(bh)) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk ("IO error syncing sysv inode [%s:%08lx]\n", - inode->i_sb->s_id, inode->i_ino); - err = -1; - } - } - else if (!bh) - err = -1; - brelse (bh); - return err; + return sysv_write_inode(inode, 1); } static void sysv_delete_inode(struct inode *inode) diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 5784a31..53786eb 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping, extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, int); extern int sysv_sync_inode(struct inode *); -extern int sysv_sync_file(struct file *, struct dentry *, int); extern void sysv_set_inode(struct inode *, dev_t); extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int sysv_init_icache(void); -- cgit v1.1 From a932801543fe74050ebee07fde082234c46b624f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 15:35:18 -0400 Subject: switch ufs to simple_fsync() Signed-off-by: Al Viro --- fs/ufs/dir.c | 2 +- fs/ufs/file.c | 23 +---------------------- fs/ufs/ufs.h | 1 - 3 files changed, 2 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 6321b79..6f671f1 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -666,6 +666,6 @@ not_empty: const struct file_operations ufs_dir_operations = { .read = generic_read_dir, .readdir = ufs_readdir, - .fsync = ufs_sync_file, + .fsync = simple_fsync, .llseek = generic_file_llseek, }; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index 2bd3a16..73655c6 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -24,31 +24,10 @@ */ #include -#include /* for sync_mapping_buffers() */ #include "ufs_fs.h" #include "ufs.h" - -int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - int err; - int ret; - - ret = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return ret; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return ret; - - err = ufs_sync_inode(inode); - if (ret == 0) - ret = err; - return ret; -} - - /* * We have mostly NULL's here: the current defaults are ok for * the ufs filesystem. @@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = { .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, .open = generic_file_open, - .fsync = ufs_sync_file, + .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index d0c4acd..644e77e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, extern const struct inode_operations ufs_file_inode_operations; extern const struct file_operations ufs_file_operations; extern const struct address_space_operations ufs_aops; -extern int ufs_sync_file(struct file *, struct dentry *, int); /* ialloc.c */ extern void ufs_free_inode (struct inode *inode); -- cgit v1.1 From 90de066443a8632bb42fed0a8216313d7da07aba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 15:40:27 -0400 Subject: switch udf to simple_fsync() Signed-off-by: Al Viro --- fs/udf/Makefile | 2 +- fs/udf/dir.c | 2 +- fs/udf/file.c | 2 +- fs/udf/fsync.c | 52 ---------------------------------------------------- fs/udf/udfdecl.h | 3 --- 5 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 fs/udf/fsync.c (limited to 'fs') diff --git a/fs/udf/Makefile b/fs/udf/Makefile index 0d4503f..eb880f6 100644 --- a/fs/udf/Makefile +++ b/fs/udf/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UDF_FS) += udf.o udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ - partition.o super.o truncate.o symlink.o fsync.o \ + partition.o super.o truncate.o symlink.o \ directory.o misc.o udftime.o unicode.o diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 2efd4d5..61d9a76 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = { .read = generic_read_dir, .readdir = udf_readdir, .ioctl = udf_ioctl, - .fsync = udf_fsync_file, + .fsync = simple_fsync, }; diff --git a/fs/udf/file.c b/fs/udf/file.c index eb91f3b..7464305 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = { .write = do_sync_write, .aio_write = udf_file_aio_write, .release = udf_release_file, - .fsync = udf_fsync_file, + .fsync = simple_fsync, .splice_read = generic_file_splice_read, .llseek = generic_file_llseek, }; diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c deleted file mode 100644 index b2c472b..0000000 --- a/fs/udf/fsync.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * fsync.c - * - * PURPOSE - * Fsync handling routines for the OSTA-UDF(tm) filesystem. - * - * COPYRIGHT - * This file is distributed under the terms of the GNU General Public - * License (GPL). Copies of the GPL can be obtained from: - * ftp://prep.ai.mit.edu/pub/gnu/GPL - * Each contributing author retains all rights to their own work. - * - * (C) 1999-2001 Ben Fennema - * (C) 1999-2000 Stelias Computing Inc - * - * HISTORY - * - * 05/22/99 blf Created. - */ - -#include "udfdecl.h" - -#include - -static int udf_fsync_inode(struct inode *, int); - -/* - * File may be NULL when we are called. Perhaps we shouldn't - * even pass file to fsync ? - */ - -int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync) -{ - struct inode *inode = dentry->d_inode; - - return udf_fsync_inode(inode, datasync); -} - -static int udf_fsync_inode(struct inode *inode, int datasync) -{ - int err; - - err = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return err; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return err; - - err |= udf_sync_inode(inode); - - return err ? -EIO : 0; -} diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index cac51b7..8d46f42 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t, extern int udf_new_block(struct super_block *, struct inode *, uint16_t, uint32_t, int *); -/* fsync.c */ -extern int udf_fsync_file(struct file *, struct dentry *, int); - /* directory.c */ extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *, struct udf_fileident_bh *, -- cgit v1.1 From bea6b64c277f0824cdaea6190209b26a164419d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 7 Jun 2009 15:44:50 -0400 Subject: switch omfs to simple_fsync() Signed-off-by: Al Viro --- fs/omfs/file.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 834b233..d17e774 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -11,21 +11,6 @@ #include #include "omfs.h" -static int omfs_sync_file(struct file *file, struct dentry *dentry, - int datasync) -{ - struct inode *inode = dentry->d_inode; - int err; - - err = sync_mapping_buffers(inode->i_mapping); - if (!(inode->i_state & I_DIRTY)) - return err; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return err; - err |= omfs_sync_inode(inode); - return err ? -EIO : 0; -} - static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset) { return (sbi->s_sys_blocksize - offset - @@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = { .aio_read = generic_file_aio_read, .aio_write = generic_file_aio_write, .mmap = generic_file_mmap, - .fsync = omfs_sync_file, + .fsync = simple_fsync, .splice_read = generic_file_splice_read, }; -- cgit v1.1 From ffdc9064f8b4fa9db37a7d5180f41cce2ea2b7ad Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 00:44:42 -0400 Subject: repair adfs ->write_inode(), switch to simple_fsync() Signed-off-by: Al Viro --- fs/adfs/adfs.h | 4 +++- fs/adfs/dir.c | 10 ++++++++-- fs/adfs/dir_f.c | 17 +++++++++++++++++ fs/adfs/dir_fplus.c | 17 +++++++++++++++++ fs/adfs/file.c | 2 +- fs/adfs/inode.c | 4 ++-- 6 files changed, 48 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index e0a85db..a6665f3 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -53,6 +53,7 @@ struct adfs_dir_ops { int (*update)(struct adfs_dir *dir, struct object_info *obj); int (*create)(struct adfs_dir *dir, struct object_info *obj); int (*remove)(struct adfs_dir *dir, struct object_info *obj); + int (*sync)(struct adfs_dir *dir); void (*free)(struct adfs_dir *dir); }; @@ -90,7 +91,8 @@ extern const struct dentry_operations adfs_dentry_operations; extern struct adfs_dir_ops adfs_f_dir_ops; extern struct adfs_dir_ops adfs_fplus_dir_ops; -extern int adfs_dir_update(struct super_block *sb, struct object_info *obj); +extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, + int wait); /* file.c */ extern const struct inode_operations adfs_file_inode_operations; diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index e867ccf..4d40734 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -83,7 +83,7 @@ out: } int -adfs_dir_update(struct super_block *sb, struct object_info *obj) +adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) { int ret = -EINVAL; #ifdef CONFIG_ADFS_FS_RW @@ -106,6 +106,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj) ret = ops->update(&dir, obj); write_unlock(&adfs_dir_lock); + if (wait) { + int err = ops->sync(&dir); + if (!ret) + ret = err; + } + ops->free(&dir); out: #endif @@ -199,7 +205,7 @@ const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = adfs_readdir, - .fsync = file_fsync, + .fsync = simple_fsync, }; static int diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index ea7df21..31df6ad 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -437,6 +437,22 @@ bad_dir: #endif } +static int +adfs_f_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + static void adfs_f_free(struct adfs_dir *dir) { @@ -456,5 +472,6 @@ struct adfs_dir_ops adfs_f_dir_ops = { .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, .update = adfs_f_update, + .sync = adfs_f_sync, .free = adfs_f_free }; diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 1ec644e..139e0f3 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -161,6 +161,22 @@ out: return ret; } +static int +adfs_fplus_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bh[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} + static void adfs_fplus_free(struct adfs_dir *dir) { @@ -175,5 +191,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, + .sync = adfs_fplus_sync, .free = adfs_fplus_free }; diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 36e381c..8224d54 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .mmap = generic_file_mmap, - .fsync = file_fsync, + .fsync = simple_fsync, .write = do_sync_write, .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index e647200..05b3a67 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -376,7 +376,7 @@ out: * The adfs-specific inode data has already been updated by * adfs_notify_change() */ -int adfs_write_inode(struct inode *inode, int unused) +int adfs_write_inode(struct inode *inode, int wait) { struct super_block *sb = inode->i_sb; struct object_info obj; @@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int unused) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj); + ret = adfs_dir_update(sb, &obj, wait); unlock_kernel(); return ret; } -- cgit v1.1 From 224c886643e52e6b4c1143489cd0b289b6c03976 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 00:46:40 -0400 Subject: Fix adfs GET_FRAG_ID() on big-endian Missing conversion to host-endian before doing shifts Signed-off-by: Al Viro --- fs/adfs/map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/adfs/map.c b/fs/adfs/map.c index 92ab4fb..568081b 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock); #define GET_FRAG_ID(_map,_start,_idmask) \ ({ \ unsigned char *_m = _map + (_start >> 3); \ - u32 _frag = get_unaligned((u32 *)_m); \ + u32 _frag = get_unaligned_le32(_m); \ _frag >>= (_start & 7); \ _frag & _idmask; \ }) -- cgit v1.1 From 4427f0c36e22e2cd6696b2fe7643e9756a14b3d3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 01:15:58 -0400 Subject: repair bfs_write_inode(), switch bfs to simple_fsync() Signed-off-by: Al Viro --- fs/bfs/dir.c | 8 ++++---- fs/bfs/inode.c | 12 +++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 4dd1b62..54bd07d 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir) const struct file_operations bfs_dir_operations = { .read = generic_read_dir, .readdir = bfs_readdir, - .fsync = file_fsync, + .fsync = simple_fsync, .llseek = generic_file_llseek, }; @@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) inode->i_nlink = 1; } de->ino = 0; - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(dir); inode->i_ctime = dir->i_ctime; @@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME_SEC; inode_dec_link_count(new_inode); } - mark_buffer_dirty(old_bh); + mark_buffer_dirty_inode(old_bh, old_dir); error = 0; end_rename: @@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name, for (i = 0; i < BFS_NAMELEN; i++) de->name[i] = (i < namelen) ? name[i] : 0; - mark_buffer_dirty(bh); + mark_buffer_dirty_inode(bh, dir); brelse(bh); return 0; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 3a9a136..d1d9d90 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -98,14 +98,15 @@ error: return ERR_PTR(-EIO); } -static int bfs_write_inode(struct inode *inode, int unused) +static int bfs_write_inode(struct inode *inode, int wait) { + struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int block, off; - struct bfs_sb_info *info = BFS_SB(inode->i_sb); + int err = 0; dprintf("ino=%08x\n", ino); @@ -146,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused) di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); + if (wait) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } brelse(bh); mutex_unlock(&info->bfs_lock); - return 0; + return err; } static void bfs_delete_inode(struct inode *inode) -- cgit v1.1 From c475879556a8602bbe2faa9a06f6e5fcc8c05bb2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 01:22:00 -0400 Subject: sanitize ->fsync() for affs unfortunately, for affs (especially for affs directories) we have no real way to keep track of metadata ownership. So we have to do more or less what file_fsync() does, but we do *not* need to call write_super() there. Signed-off-by: Al Viro --- fs/affs/affs.h | 1 + fs/affs/dir.c | 2 +- fs/affs/file.c | 14 +++++++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 1a2d5e3..e511dc6 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -182,6 +182,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); +int affs_file_fsync(struct file *, struct dentry *, int); /* dir.c */ diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 7b36904..8ca8f3a 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .readdir = affs_readdir, - .fsync = file_fsync, + .fsync = affs_file_fsync, }; /* diff --git a/fs/affs/file.c b/fs/affs/file.c index 9246cb4..184e55c 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = { .mmap = generic_file_mmap, .open = affs_file_open, .release = affs_file_release, - .fsync = file_fsync, + .fsync = affs_file_fsync, .splice_read = generic_file_splice_read, }; @@ -915,3 +915,15 @@ affs_truncate(struct inode *inode) } affs_free_prealloc(inode); } + +int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + int ret, err; + + ret = write_inode_now(inode, 0); + err = sync_blockdev(inode->i_sb->s_bdev); + if (!ret) + ret = err; + return ret; +} -- cgit v1.1 From e28964365faf3b9961695eb62b48cbc9f2a2a245 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:03:15 +0200 Subject: affs: add ->sync_fs Add a ->sync_fs method for data integrity syncs. Factor out common code between affs_put_super, affs_write_super and the new affs_sync_fs into a helper. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/affs/super.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index c481493..104fdcb 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -25,6 +25,19 @@ static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); static void +affs_commit_super(struct super_block *sb, int clean) +{ + struct affs_sb_info *sbi = AFFS_SB(sb); + struct buffer_head *bh = sbi->s_root_bh; + struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); + + tail->bm_flag = cpu_to_be32(clean); + secs_to_datestamp(get_seconds(), &tail->disk_change); + affs_fix_checksum(sb, bh); + mark_buffer_dirty(bh); +} + +static void affs_put_super(struct super_block *sb) { struct affs_sb_info *sbi = AFFS_SB(sb); @@ -32,13 +45,8 @@ affs_put_super(struct super_block *sb) lock_kernel(); - if (!(sb->s_flags & MS_RDONLY)) { - AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1); - secs_to_datestamp(get_seconds(), - &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); - affs_fix_checksum(sb, sbi->s_root_bh); - mark_buffer_dirty(sbi->s_root_bh); - } + if (!(sb->s_flags & MS_RDONLY)) + affs_commit_super(sb, 1); kfree(sbi->s_prefix); affs_free_bitmap(sb); @@ -53,18 +61,13 @@ static void affs_write_super(struct super_block *sb) { int clean = 2; - struct affs_sb_info *sbi = AFFS_SB(sb); lock_super(sb); if (!(sb->s_flags & MS_RDONLY)) { // if (sbi->s_bitmap[i].bm_bh) { // if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) { // clean = 0; - AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean); - secs_to_datestamp(get_seconds(), - &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change); - affs_fix_checksum(sb, sbi->s_root_bh); - mark_buffer_dirty(sbi->s_root_bh); + affs_commit_super(sb, clean); sb->s_dirt = !clean; /* redo until bitmap synced */ } else sb->s_dirt = 0; @@ -73,6 +76,16 @@ affs_write_super(struct super_block *sb) pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean); } +static int +affs_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + affs_commit_super(sb, 2); + sb->s_dirt = 0; + unlock_super(sb); + return 0; +} + static struct kmem_cache * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) @@ -130,6 +143,7 @@ static const struct super_operations affs_sops = { .clear_inode = affs_clear_inode, .put_super = affs_put_super, .write_super = affs_write_super, + .sync_fs = affs_sync_fs, .statfs = affs_statfs, .remount_fs = affs_remount, .show_options = generic_show_options, -- cgit v1.1 From 561e47ce7244168788d4ecef9a2271df204b3c89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:03:38 +0200 Subject: bfs: add ->sync_fs Add a ->sync_fs method for data integrity syncs, and reimplement ->write_super ontop of it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/bfs/inode.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index d1d9d90..6f60336 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -216,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode) clear_inode(inode); } +static int bfs_sync_fs(struct super_block *sb, int wait) +{ + struct bfs_sb_info *info = BFS_SB(sb); + + mutex_lock(&info->bfs_lock); + mark_buffer_dirty(info->si_sbh); + sb->s_dirt = 0; + mutex_unlock(&info->bfs_lock); + + return 0; +} + +static void bfs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + bfs_sync_fs(sb, 1); + else + sb->s_dirt = 0; +} + static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); @@ -254,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static void bfs_write_super(struct super_block *s) -{ - struct bfs_sb_info *info = BFS_SB(s); - - mutex_lock(&info->bfs_lock); - if (!(s->s_flags & MS_RDONLY)) - mark_buffer_dirty(info->si_sbh); - s->s_dirt = 0; - mutex_unlock(&info->bfs_lock); -} - static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) @@ -312,6 +321,7 @@ static const struct super_operations bfs_sops = { .delete_inode = bfs_delete_inode, .put_super = bfs_put_super, .write_super = bfs_write_super, + .sync_fs = bfs_sync_fs, .statfs = bfs_statfs, }; -- cgit v1.1 From 80e09fb942d38beb19dcffbeb14d496beeb0a989 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:03:58 +0200 Subject: exofs: add ->sync_fs Add a ->sync_fs method for data integrity syncs, and reimplement ->write_super ontop of it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/exofs/super.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 49e16af..8216c5b 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -200,18 +200,18 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -static void exofs_write_super(struct super_block *sb) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; struct osd_request *or; struct osd_obj_id obj; - int ret; + int ret = -ENOMEM; fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL); if (!fscb) { EXOFS_ERR("exofs_write_super: memory allocation failed.\n"); - return; + return -ENOMEM; } lock_super(sb); @@ -249,6 +249,15 @@ out: unlock_kernel(); unlock_super(sb); kfree(fscb); + return ret; +} + +static void exofs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + exofs_sync_fs(sb, 1); + else + sb->s_dirt = 0; } /* @@ -493,6 +502,7 @@ static const struct super_operations exofs_sops = { .delete_inode = exofs_delete_inode, .put_super = exofs_put_super, .write_super = exofs_write_super, + .sync_fs = exofs_sync_fs, .statfs = exofs_statfs, }; -- cgit v1.1 From 40f31dd47e7c3d15af1f9845eda0fa0c4c33f32f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:04:17 +0200 Subject: ext2: add ->sync_fs Add a ->sync_fs method for data integrity syncs, and reimplement ->write_super ontop of it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ext2/super.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f8cbdf5..4589996 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); +static int ext2_sync_fs(struct super_block *sb, int wait); void ext2_error (struct super_block * sb, const char * function, const char * fmt, ...) @@ -309,6 +310,7 @@ static const struct super_operations ext2_sops = { .delete_inode = ext2_delete_inode, .put_super = ext2_put_super, .write_super = ext2_write_super, + .sync_fs = ext2_sync_fs, .statfs = ext2_statfs, .remount_fs = ext2_remount, .clear_inode = ext2_clear_inode, @@ -1132,25 +1134,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es) * set s_state to EXT2_VALID_FS after some corrections. */ -void ext2_write_super (struct super_block * sb) +static int ext2_sync_fs(struct super_block *sb, int wait) { - struct ext2_super_block * es; + struct ext2_super_block *es = EXT2_SB(sb)->s_es; + lock_kernel(); - if (!(sb->s_flags & MS_RDONLY)) { - es = EXT2_SB(sb)->s_es; - - if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { - ext2_debug ("setting valid to 0\n"); - es->s_state &= cpu_to_le16(~EXT2_VALID_FS); - es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); - es->s_mtime = cpu_to_le32(get_seconds()); - ext2_sync_super(sb, es); - } else - ext2_commit_super (sb, es); + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { + ext2_debug("setting valid to 0\n"); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); + es->s_free_blocks_count = + cpu_to_le32(ext2_count_free_blocks(sb)); + es->s_free_inodes_count = + cpu_to_le32(ext2_count_free_inodes(sb)); + es->s_mtime = cpu_to_le32(get_seconds()); + ext2_sync_super(sb, es); + } else { + ext2_commit_super(sb, es); } sb->s_dirt = 0; unlock_kernel(); + + return 0; +} + + +void ext2_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ext2_sync_fs(sb, 1); + else + sb->s_dirt = 0; } static int ext2_remount (struct super_block * sb, int * flags, char * data) -- cgit v1.1 From f83d6d46e7adf241a064a4a425e5cd8a8fd8925f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:04:35 +0200 Subject: fat: add ->sync_fs Add a ->sync_fs method for data integrity syncs. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/fat/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 476f80b..51a5ecf 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -449,6 +449,16 @@ static void fat_write_super(struct super_block *sb) unlock_super(sb); } +static int fat_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + fat_clusters_flush(sb); + sb->s_dirt = 0; + unlock_super(sb); + + return 0; +} + static void fat_put_super(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); @@ -643,6 +653,7 @@ static const struct super_operations fat_sops = { .delete_inode = fat_delete_inode, .put_super = fat_put_super, .write_super = fat_write_super, + .sync_fs = fat_sync_fs, .statfs = fat_statfs, .clear_inode = fat_clear_inode, .remount_fs = fat_remount, -- cgit v1.1 From 58bc5bbb873eb5d86126a3fd3ff02aaa69ec15d0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:04:54 +0200 Subject: hfs: add ->sync_fs Add a ->sync_fs method for data integrity syncs. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/hfs/super.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 3aac417..6f833dc 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -58,6 +58,16 @@ static void hfs_write_super(struct super_block *sb) unlock_super(sb); } +static int hfs_sync_fs(struct super_block *sb, int wait) +{ + lock_super(sb); + hfs_mdb_commit(sb); + sb->s_dirt = 0; + unlock_super(sb); + + return 0; +} + /* * hfs_put_super() * @@ -172,6 +182,7 @@ static const struct super_operations hfs_super_operations = { .clear_inode = hfs_clear_inode, .put_super = hfs_put_super, .write_super = hfs_write_super, + .sync_fs = hfs_sync_fs, .statfs = hfs_statfs, .remount_fs = hfs_remount, .show_options = hfs_show_options, -- cgit v1.1 From 7fbc6df0e7a561a313f49faa77829d5de45a97f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:05:12 +0200 Subject: hfsplus: add ->sync_fs Add a ->sync_fs method for data integrity syncs, and reimplement ->write_super ontop of it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/hfsplus/super.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 1aab8aa..9fc3af0 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -152,7 +152,7 @@ static void hfsplus_clear_inode(struct inode *inode) } } -static void hfsplus_write_super(struct super_block *sb) +static int hfsplus_sync_fs(struct super_block *sb, int wait) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; @@ -160,9 +160,6 @@ static void hfsplus_write_super(struct super_block *sb) lock_super(sb); sb->s_dirt = 0; - if (sb->s_flags & MS_RDONLY) - /* warn? */ - goto out; vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); @@ -194,8 +191,16 @@ static void hfsplus_write_super(struct super_block *sb) } HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP; } - out: unlock_super(sb); + return 0; +} + +static void hfsplus_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + hfsplus_sync_fs(sb, 1); + else + sb->s_dirt = 0; } static void hfsplus_put_super(struct super_block *sb) @@ -290,6 +295,7 @@ static const struct super_operations hfsplus_sops = { .clear_inode = hfsplus_clear_inode, .put_super = hfsplus_put_super, .write_super = hfsplus_write_super, + .sync_fs = hfsplus_sync_fs, .statfs = hfsplus_statfs, .remount_fs = hfsplus_remount, .show_options = hfsplus_show_options, -- cgit v1.1 From ad43ffdeea0a7bd3e6036c4aeec2e6699aef8ac7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:07:45 +0200 Subject: sysv: add ->sync_fs Add a ->sync_fs method for data integrity syncs, and reimplement ->write_super ontop of it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/sysv/inode.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 425c976..4799234 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -31,16 +31,13 @@ #include #include "sysv.h" -/* This is only called on sync() and umount(), when s_dirt=1. */ -static void sysv_write_super(struct super_block *sb) +static int sysv_sync_fs(struct super_block *sb, int wait) { struct sysv_sb_info *sbi = SYSV_SB(sb); unsigned long time = get_seconds(), old_time; lock_super(sb); lock_kernel(); - if (sb->s_flags & MS_RDONLY) - goto clean; /* * If we are going to write out the super block, @@ -54,10 +51,19 @@ static void sysv_write_super(struct super_block *sb) *sbi->s_sb_time = cpu_to_fs32(sbi, time); mark_buffer_dirty(sbi->s_bh2); } -clean: - sb->s_dirt = 0; + unlock_kernel(); unlock_super(sb); + + return 0; +} + +static void sysv_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + sysv_sync_fs(sb, 1); + else + sb->s_dirt = 0; } static int sysv_remount(struct super_block *sb, int *flags, char *data) @@ -345,6 +351,7 @@ const struct super_operations sysv_sops = { .delete_inode = sysv_delete_inode, .put_super = sysv_put_super, .write_super = sysv_write_super, + .sync_fs = sysv_sync_fs, .remount_fs = sysv_remount, .statfs = sysv_statfs, }; -- cgit v1.1 From 8c8006564a58d0ea912bf7f2d0a758d97e4b464f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:08:05 +0200 Subject: ufs: add ->sync_fs Add a ->sync_fs method for data integrity syncs, and reimplement ->write_super ontop of it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ufs/super.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6560dda..5faed79 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1125,7 +1125,7 @@ failed_nomem: return -ENOMEM; } -static void ufs_write_super(struct super_block *sb) +static int ufs_sync_fs(struct super_block *sb, int wait) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; @@ -1134,25 +1134,36 @@ static void ufs_write_super(struct super_block *sb) lock_super(sb); lock_kernel(); + UFSD("ENTER\n"); + flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - if (!(sb->s_flags & MS_RDONLY)) { - usb1->fs_time = cpu_to_fs32(sb, get_seconds()); - if ((flags & UFS_ST_MASK) == UFS_ST_SUN - || (flags & UFS_ST_MASK) == UFS_ST_SUNOS - || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) - ufs_set_fs_state(sb, usb1, usb3, - UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); - ufs_put_cstotal(sb); - } + usb1->fs_time = cpu_to_fs32(sb, get_seconds()); + if ((flags & UFS_ST_MASK) == UFS_ST_SUN || + (flags & UFS_ST_MASK) == UFS_ST_SUNOS || + (flags & UFS_ST_MASK) == UFS_ST_SUNx86) + ufs_set_fs_state(sb, usb1, usb3, + UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); + ufs_put_cstotal(sb); sb->s_dirt = 0; + UFSD("EXIT\n"); unlock_kernel(); unlock_super(sb); + + return 0; +} + +static void ufs_write_super(struct super_block *sb) +{ + if (!(sb->s_flags & MS_RDONLY)) + ufs_sync_fs(sb, 1); + else + sb->s_dirt = 0; } static void ufs_put_super(struct super_block *sb) @@ -1381,6 +1392,7 @@ static const struct super_operations ufs_super_ops = { .delete_inode = ufs_delete_inode, .put_super = ufs_put_super, .write_super = ufs_write_super, + .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, .show_options = ufs_show_options, -- cgit v1.1 From d579ed00aa96a7f7486978540a0d7cecaff742ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:08:21 +0200 Subject: jffs2: call jffs2_write_super from jffs2_sync_fs The call to ->write_super from __sync_filesystem will go away, so make sure jffs2 performs the same actions from inside ->sync_fs. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/jffs2/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index f7bfd3a..07a22ca 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -74,6 +74,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + jffs2_write_super(sb); + mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); mutex_unlock(&c->alloc_sem); -- cgit v1.1 From d731e06323cb705003e4172ec209e469be4c18e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:08:36 +0200 Subject: nilfs2: call nilfs2_write_super from nilfs2_sync_fs The call to ->write_super from __sync_filesystem will go away, so make sure nilfs2 performs the same actions from inside ->sync_fs. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/nilfs2/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 11151ea..122dc1e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -391,6 +391,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) { int err = 0; + nilfs_write_super(sb); + /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); -- cgit v1.1 From 0c95ee190e1dea60c55c834d14695341085c9b7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 8 Jun 2009 10:08:54 +0200 Subject: remove the call to ->write_super in __sync_filesystem Now that all filesystems provide ->sync_fs methods we can change __sync_filesystem to only call ->sync_fs. This gives us a clear separation between periodic writeouts which are driven by ->write_super and data integrity syncs that go through ->sync_fs. (modulo file_fsync which is also going away) Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/sync.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index e9d56f6..dd20002 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -33,8 +33,6 @@ static int __sync_filesystem(struct super_block *sb, int wait) else sync_quota_sb(sb, -1); sync_inodes_sb(sb, wait); - if (sb->s_dirt && sb->s_op->write_super) - sb->s_op->write_super(sb); if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); -- cgit v1.1 From 81fc20bd0e75ba6357bce2403767d7c2585d8f28 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 8 Jun 2009 01:39:28 +0900 Subject: nilfs2: remove meaningless EBUSY case from nilfs_get_sb function The following EBUSY case in nilfs_get_sb() is meaningless. Indeed, this error code is never returned to the caller. if (!s->s_root) { ... } else if (!(s->s_flags & MS_RDONLY)) { err = -EBUSY; } This simply removes the else case. Signed-off-by: Ryusuke Konishi Signed-off-by: Al Viro --- fs/nilfs2/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 122dc1e..1c505d0 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1186,8 +1186,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; need_to_close = 0; - } else if (!(s->s_flags & MS_RDONLY)) { - err = -EBUSY; } up(&sd.bdev->bd_mount_sem); -- cgit v1.1 From 33c8e57c86d1bd1548c12a4f7c4bceb94b862cca Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 8 Jun 2009 01:39:29 +0900 Subject: nilfs2: get rid of sget use for acquiring nilfs object This will change the way to obtain nilfs object in nilfs_get_sb() function. Previously, a preliminary sget() call was performed, and the nilfs object was acquired from a super block instance found by the sget() call. This patch, instead, instroduces a new dedicated function find_or_create_nilfs(); as the name implies, the function finds an existent nilfs object from a global list or creates a new one if no object is found on the device. Signed-off-by: Ryusuke Konishi Cc: Al Viro Signed-off-by: Al Viro --- fs/nilfs2/super.c | 80 ++++++++++++--------------------------------------- fs/nilfs2/the_nilfs.c | 57 ++++++++++++++++++++++++++++++++++-- fs/nilfs2/the_nilfs.h | 4 ++- 3 files changed, 76 insertions(+), 65 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1c505d0..3c9833e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1061,13 +1061,6 @@ static int nilfs_set_bdev_super(struct super_block *s, void *data) static int nilfs_test_bdev_super(struct super_block *s, void *data) { struct nilfs_super_data *sd = data; - - return s->s_bdev == sd->bdev; -} - -static int nilfs_test_bdev_super2(struct super_block *s, void *data) -{ - struct nilfs_super_data *sd = data; int ret; if (s->s_bdev != sd->bdev) @@ -1096,8 +1089,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { struct nilfs_super_data sd; - struct super_block *s, *s2; - struct the_nilfs *nilfs = NULL; + struct super_block *s; + struct the_nilfs *nilfs; int err, need_to_close = 1; sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type); @@ -1118,11 +1111,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, goto failed; } - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ + nilfs = find_or_create_nilfs(sd.bdev); + if (!nilfs) { + err = -ENOMEM; + goto failed; + } + down(&sd.bdev->bd_mount_sem); if (!sd.cno && (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { @@ -1131,51 +1125,22 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, } /* - * Phase-1: search any existent instance and get the_nilfs + * Search specified snapshot or R/W mode super_block */ - s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); - if (IS_ERR(s)) - goto error_s; - - if (!s->s_root) { - err = -ENOMEM; - nilfs = alloc_nilfs(sd.bdev); - if (!nilfs) - goto cancel_new; - } else { - struct nilfs_sb_info *sbi = NILFS_SB(s); + if (!sd.cno) + /* trying to get the latest checkpoint. */ + sd.cno = nilfs_last_cno(nilfs); - /* - * s_umount protects super_block from unmount process; - * It covers pointers of nilfs_sb_info and the_nilfs. - */ - nilfs = sbi->s_nilfs; - get_nilfs(nilfs); - up_write(&s->s_umount); - - /* - * Phase-2: search specified snapshot or R/W mode super_block - */ - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); - - s2 = sget(fs_type, nilfs_test_bdev_super2, - nilfs_set_bdev_super, &sd); - deactivate_super(s); - /* - * Although deactivate_super() invokes close_bdev_exclusive() at - * kill_block_super(). Here, s is an existent mount; we need - * one more close_bdev_exclusive() call. - */ - s = s2; - if (IS_ERR(s)) - goto error_s; + s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); + if (IS_ERR(s)) { + err = PTR_ERR(s); + goto failed_unlock; } if (!s->s_root) { char b[BDEVNAME_SIZE]; + /* New superblock instance created */ s->s_flags = flags; strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(sd.bdev)); @@ -1195,15 +1160,9 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, simple_set_mnt(mnt, s); return 0; - error_s: - up(&sd.bdev->bd_mount_sem); - if (nilfs) - put_nilfs(nilfs); - close_bdev_exclusive(sd.bdev, flags); - return PTR_ERR(s); - failed_unlock: up(&sd.bdev->bd_mount_sem); + put_nilfs(nilfs); failed: close_bdev_exclusive(sd.bdev, flags); @@ -1212,8 +1171,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, cancel_new: /* Abandoning the newly allocated superblock */ up(&sd.bdev->bd_mount_sem); - if (nilfs) - put_nilfs(nilfs); + put_nilfs(nilfs); up_write(&s->s_umount); deactivate_super(s); /* diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index a91f15b..45dbf6a 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -35,6 +35,10 @@ #include "seglist.h" #include "segbuf.h" + +static LIST_HEAD(nilfs_objects); +static DEFINE_SPINLOCK(nilfs_lock); + void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { @@ -55,7 +59,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs, * Return Value: On success, pointer to the_nilfs is returned. * On error, NULL is returned. */ -struct the_nilfs *alloc_nilfs(struct block_device *bdev) +static struct the_nilfs *alloc_nilfs(struct block_device *bdev) { struct the_nilfs *nilfs; @@ -69,6 +73,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); mutex_init(&nilfs->ns_writer_mutex); + INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); nilfs->ns_gc_inodes_h = NULL; @@ -78,6 +83,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) } /** + * find_or_create_nilfs - find or create nilfs object + * @bdev: block device to which the_nilfs is related + * + * find_nilfs() looks up an existent nilfs object created on the + * device and gets the reference count of the object. If no nilfs object + * is found on the device, a new nilfs object is allocated. + * + * Return Value: On success, pointer to the nilfs object is returned. + * On error, NULL is returned. + */ +struct the_nilfs *find_or_create_nilfs(struct block_device *bdev) +{ + struct the_nilfs *nilfs, *new = NULL; + + retry: + spin_lock(&nilfs_lock); + list_for_each_entry(nilfs, &nilfs_objects, ns_list) { + if (nilfs->ns_bdev == bdev) { + get_nilfs(nilfs); + spin_unlock(&nilfs_lock); + if (new) + put_nilfs(new); + return nilfs; /* existing object */ + } + } + if (new) { + list_add_tail(&new->ns_list, &nilfs_objects); + spin_unlock(&nilfs_lock); + return new; /* new object */ + } + spin_unlock(&nilfs_lock); + + new = alloc_nilfs(bdev); + if (new) + goto retry; + return NULL; /* insufficient memory */ +} + +/** * put_nilfs - release a reference to the_nilfs * @nilfs: the_nilfs structure to be released * @@ -86,13 +130,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) */ void put_nilfs(struct the_nilfs *nilfs) { - if (!atomic_dec_and_test(&nilfs->ns_count)) + spin_lock(&nilfs_lock); + if (!atomic_dec_and_test(&nilfs->ns_count)) { + spin_unlock(&nilfs_lock); return; + } + list_del_init(&nilfs->ns_list); + spin_unlock(&nilfs_lock); + /* - * Increment of ns_count never occur below because the caller + * Increment of ns_count never occurs below because the caller * of get_nilfs() holds at least one reference to the_nilfs. * Thus its exclusion control is not required here. */ + might_sleep(); if (nilfs_loaded(nilfs)) { nilfs_mdt_clear(nilfs->ns_sufile); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 30fe587..116caf9 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -43,6 +43,7 @@ enum { * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags * @ns_count: reference count + * @ns_list: list head for nilfs_list * @ns_bdev: block device * @ns_bdi: backing dev info * @ns_writer: back pointer to writable nilfs_sb_info @@ -88,6 +89,7 @@ enum { struct the_nilfs { unsigned long ns_flags; atomic_t ns_count; + struct list_head ns_list; struct block_device *ns_bdev; struct backing_dev_info *ns_bdi; @@ -191,7 +193,7 @@ THE_NILFS_FNS(DISCONTINUED, discontinued) #define NILFS_ALTSB_FREQ 60 /* spare superblock */ void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); -struct the_nilfs *alloc_nilfs(struct block_device *); +struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); -- cgit v1.1 From 3f82ff55168e92859119bf348e9e0bd6714d2fea Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 8 Jun 2009 01:39:30 +0900 Subject: nilfs2: get rid of sget use for checking if current mount is present This stops using sget() for checking if an r/w-mount or an r/o-mount exists on the device. This elimination uses a back pointer to the current mount added to nilfs object. Signed-off-by: Ryusuke Konishi Cc: Al Viro Signed-off-by: Al Viro --- fs/nilfs2/super.c | 92 ++++++++++++++++++--------------------------------- fs/nilfs2/the_nilfs.h | 3 ++ 2 files changed, 35 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3c9833e..5a8c5e4 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -67,8 +67,6 @@ MODULE_LICENSE("GPL"); static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); -static int test_exclusive_mount(struct file_system_type *fs_type, - struct block_device *bdev, int flags); /** * nilfs_error() - report failure condition on a filesystem @@ -329,6 +327,10 @@ static void nilfs_put_super(struct super_block *sb) nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); } + down_write(&nilfs->ns_sem); + if (nilfs->ns_current == sbi) + nilfs->ns_current = NULL; + up_write(&nilfs->ns_sem); nilfs_detach_checkpoint(sbi); put_nilfs(sbi->s_nilfs); @@ -880,6 +882,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, goto failed_root; } + down_write(&nilfs->ns_sem); + if (!nilfs_test_opt(sbi, SNAPSHOT)) + nilfs->ns_current = sbi; + up_write(&nilfs->ns_sem); + return 0; failed_root: @@ -958,14 +965,16 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * by fsck since we originally mounted the partition.) */ down(&sb->s_bdev->bd_mount_sem); - /* Check existing RW-mount */ - if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) { + down_read(&nilfs->ns_sem); + if (nilfs->ns_current && nilfs->ns_current != sbi) { printk(KERN_WARNING "NILFS (device %s): couldn't " - "remount because a RW-mount exists.\n", + "remount because an RW-mount exists.\n", sb->s_id); + up_read(&nilfs->ns_sem); err = -EBUSY; goto rw_remount_failed; } + up_read(&nilfs->ns_sem); if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount because the current RO-mount is not " @@ -984,6 +993,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); + nilfs->ns_current = sbi; up_write(&nilfs->ns_sem); up(&sb->s_bdev->bd_mount_sem); @@ -1118,10 +1128,23 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, } down(&sd.bdev->bd_mount_sem); - if (!sd.cno && - (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) { - err = (err < 0) ? : -EBUSY; - goto failed_unlock; + + if (!sd.cno) { + /* + * Check if an exclusive mount exists or not. + * Snapshot mounts coexist with a current mount + * (i.e. rw-mount or ro-mount), whereas rw-mount and + * ro-mount are mutually exclusive. + */ + down_read(&nilfs->ns_sem); + if (nilfs->ns_current && + ((nilfs->ns_current->s_super->s_flags ^ flags) + & MS_RDONLY)) { + up_read(&nilfs->ns_sem); + err = -EBUSY; + goto failed_unlock; + } + up_read(&nilfs->ns_sem); } /* @@ -1182,57 +1205,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, return err; } -static int nilfs_test_bdev_super3(struct super_block *s, void *data) -{ - struct nilfs_super_data *sd = data; - int ret; - - if (s->s_bdev != sd->bdev) - return 0; - if (down_read_trylock(&s->s_umount)) { - ret = (s->s_flags & MS_RDONLY) && s->s_root && - nilfs_test_opt(NILFS_SB(s), SNAPSHOT); - up_read(&s->s_umount); - if (ret) - return 0; /* ignore snapshot mounts */ - } - return !((sd->flags ^ s->s_flags) & MS_RDONLY); -} - -static int __false_bdev_super(struct super_block *s, void *data) -{ -#if 0 /* XXX: workaround for lock debug. This is not good idea */ - up_write(&s->s_umount); -#endif - return -EFAULT; -} - -/** - * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not. - * fs_type: filesystem type - * bdev: block device - * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount) - * res: pointer to an integer to store result - * - * This function must be called within a section protected by bd_mount_mutex. - */ -static int test_exclusive_mount(struct file_system_type *fs_type, - struct block_device *bdev, int flags) -{ - struct super_block *s; - struct nilfs_super_data sd = { .flags = flags, .bdev = bdev }; - - s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd); - if (IS_ERR(s)) { - if (PTR_ERR(s) != -EFAULT) - return PTR_ERR(s); - return 0; /* Not found */ - } - up_write(&s->s_umount); - deactivate_super(s); - return 1; /* Found */ -} - struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 116caf9..99f7e29 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -50,6 +50,7 @@ enum { * @ns_sem: semaphore for shared states * @ns_writer_mutex: mutex protecting ns_writer attach/detach * @ns_writer_refcount: number of referrers on ns_writer + * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data * @ns_sbwtime: previous write time of super blocks @@ -98,6 +99,8 @@ struct the_nilfs { struct mutex ns_writer_mutex; atomic_t ns_writer_refcount; + struct nilfs_sb_info *ns_current; + /* * used for * - loading the latest checkpoint exclusively. -- cgit v1.1 From 6dd4740662405a68bb229ac2b9e0aeaaf2188bf2 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 8 Jun 2009 01:39:31 +0900 Subject: nilfs2: simplify remaining sget() use This simplifies the test function passed on the remaining sget() callsite in nilfs. Instead of checking mount type (i.e. ro-mount/rw-mount/snapshot mount) in the test function passed to sget(), this patch first looks up the nilfs_sb_info struct which the given mount type matches, and then acquires the super block instance holding the nilfs_sb_info. Signed-off-by: Ryusuke Konishi Cc: Al Viro Signed-off-by: Al Viro --- fs/nilfs2/sb.h | 1 + fs/nilfs2/super.c | 42 +++++++++++++++++------------------------- fs/nilfs2/the_nilfs.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nilfs2/the_nilfs.h | 7 +++++++ 4 files changed, 75 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h index adccd4f..0776ccc 100644 --- a/fs/nilfs2/sb.h +++ b/fs/nilfs2/sb.h @@ -60,6 +60,7 @@ struct nilfs_sb_info { struct super_block *s_super; /* reverse pointer to super_block */ struct the_nilfs *s_nilfs; struct list_head s_list; /* list head for nilfs->ns_supers */ + atomic_t s_count; /* reference count */ /* Segment constructor */ struct list_head s_dirty_files; /* dirty files list */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 5a8c5e4..1d1b6e1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -336,7 +336,7 @@ static void nilfs_put_super(struct super_block *sb) put_nilfs(sbi->s_nilfs); sbi->s_super = NULL; sb->s_fs_info = NULL; - kfree(sbi); + nilfs_put_sbinfo(sbi); unlock_kernel(); } @@ -785,6 +785,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, get_nilfs(nilfs); sbi->s_nilfs = nilfs; sbi->s_super = sb; + atomic_set(&sbi->s_count, 1); err = init_nilfs(nilfs, sbi, (char *)data); if (err) @@ -902,7 +903,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, failed_sbi: put_nilfs(nilfs); sb->s_fs_info = NULL; - kfree(sbi); + nilfs_put_sbinfo(sbi); return err; } @@ -1014,6 +1015,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) struct nilfs_super_data { struct block_device *bdev; + struct nilfs_sb_info *sbi; __u64 cno; int flags; }; @@ -1071,27 +1073,8 @@ static int nilfs_set_bdev_super(struct super_block *s, void *data) static int nilfs_test_bdev_super(struct super_block *s, void *data) { struct nilfs_super_data *sd = data; - int ret; - - if (s->s_bdev != sd->bdev) - return 0; - - if (!((s->s_flags | sd->flags) & MS_RDONLY)) - return 1; /* Reuse an old R/W-mode super_block */ - - if (s->s_flags & sd->flags & MS_RDONLY) { - if (down_read_trylock(&s->s_umount)) { - ret = s->s_root && - (sd->cno == NILFS_SB(s)->s_snapshot_cno); - up_read(&s->s_umount); - /* - * This path is locked with sb_lock by sget(). - * So, drop_super() causes deadlock. - */ - return ret; - } - } - return 0; + + return sd->sbi && s->s_fs_info == (void *)sd->sbi; } static int @@ -1112,7 +1095,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, * much more information than normal filesystems to identify mount * instance. For snapshot mounts, not only a mount type (ro-mount * or rw-mount) but also a checkpoint number is required. - * The results are passed in sget() using nilfs_super_data. */ sd.cno = 0; sd.flags = flags; @@ -1148,13 +1130,23 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, } /* - * Search specified snapshot or R/W mode super_block + * Find existing nilfs_sb_info struct */ + sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); + if (!sd.cno) /* trying to get the latest checkpoint. */ sd.cno = nilfs_last_cno(nilfs); + /* + * Get super block instance holding the nilfs_sb_info struct. + * A new instance is allocated if no existing mount is present or + * existing instance has been unmounted. + */ s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd); + if (sd.sbi) + nilfs_put_sbinfo(sd.sbi); + if (IS_ERR(s)) { err = PTR_ERR(s); goto failed_unlock; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 45dbf6a..221953b 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -664,6 +664,56 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs) return ret; } +/** + * nilfs_find_sbinfo - find existing nilfs_sb_info structure + * @nilfs: nilfs object + * @rw_mount: mount type (non-zero value for read/write mount) + * @cno: checkpoint number (zero for read-only mount) + * + * nilfs_find_sbinfo() returns the nilfs_sb_info structure which + * @rw_mount and @cno (in case of snapshots) matched. If no instance + * was found, NULL is returned. Although the super block instance can + * be unmounted after this function returns, the nilfs_sb_info struct + * is kept on memory until nilfs_put_sbinfo() is called. + */ +struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs, + int rw_mount, __u64 cno) +{ + struct nilfs_sb_info *sbi; + + down_read(&nilfs->ns_sem); + /* + * The SNAPSHOT flag and sb->s_flags are supposed to be + * protected with nilfs->ns_sem. + */ + sbi = nilfs->ns_current; + if (rw_mount) { + if (sbi && !(sbi->s_super->s_flags & MS_RDONLY)) + goto found; /* read/write mount */ + else + goto out; + } else if (cno == 0) { + if (sbi && (sbi->s_super->s_flags & MS_RDONLY)) + goto found; /* read-only mount */ + else + goto out; + } + + list_for_each_entry(sbi, &nilfs->ns_supers, s_list) { + if (nilfs_test_opt(sbi, SNAPSHOT) && + sbi->s_snapshot_cno == cno) + goto found; /* snapshot mount */ + } + out: + up_read(&nilfs->ns_sem); + return NULL; + + found: + atomic_inc(&sbi->s_count); + up_read(&nilfs->ns_sem); + return sbi; +} + int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, int snapshot_mount) { diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 99f7e29..be4c040 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -201,6 +201,7 @@ void put_nilfs(struct the_nilfs *); int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); +struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64); int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int); int nilfs_near_disk_full(struct the_nilfs *); void nilfs_fall_back_super_block(struct the_nilfs *); @@ -243,6 +244,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) mutex_unlock(&nilfs->ns_writer_mutex); } +static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) +{ + if (!atomic_dec_and_test(&sbi->s_count)) + kfree(sbi); +} + static inline void nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum, sector_t *seg_start, sector_t *seg_end) -- cgit v1.1 From e59399d0102c1813cec48db5cebe1750313f88a0 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 8 Jun 2009 01:39:32 +0900 Subject: nilfs2: correct exclusion control in nilfs_remount function nilfs_remount() changes mount state of a superblock instance. Even though nilfs accesses other superblock instances during mount or remount, the mount state was not properly protected in nilfs_remount(). Moreover, nilfs_remount() has a lock order reversal problem; nilfs_get_sb() holds: 1. bdev->bd_mount_sem 2. sb->s_umount (sget acquires) and nilfs_remount() holds: 1. sb->s_umount (locked by the caller in vfs) 2. bdev->bd_mount_sem To avoid these problems, this patch divides a semaphore protecting super block instances from nilfs->ns_sem, and applies it to the mount state protection in nilfs_remount(). With this change, bd_mount_sem use is removed from nilfs_remount() and the lock order reversal will be resolved. And the new rw-semaphore, nilfs->ns_super_sem will properly protect the mount state except the modification from nilfs_error function. Signed-off-by: Ryusuke Konishi Signed-off-by: Al Viro --- fs/nilfs2/super.c | 44 ++++++++++++++++++++------------------------ fs/nilfs2/the_nilfs.c | 13 +++++++------ fs/nilfs2/the_nilfs.h | 7 ++++++- 3 files changed, 33 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1d1b6e1..f02762f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -327,10 +327,10 @@ static void nilfs_put_super(struct super_block *sb) nilfs_commit_super(sbi, 1); up_write(&nilfs->ns_sem); } - down_write(&nilfs->ns_sem); + down_write(&nilfs->ns_super_sem); if (nilfs->ns_current == sbi) nilfs->ns_current = NULL; - up_write(&nilfs->ns_sem); + up_write(&nilfs->ns_super_sem); nilfs_detach_checkpoint(sbi); put_nilfs(sbi->s_nilfs); @@ -408,9 +408,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) struct buffer_head *bh_cp; int err; - down_write(&nilfs->ns_sem); + down_write(&nilfs->ns_super_sem); list_add(&sbi->s_list, &nilfs->ns_supers); - up_write(&nilfs->ns_sem); + up_write(&nilfs->ns_super_sem); sbi->s_ifile = nilfs_mdt_new( nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); @@ -448,9 +448,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; - down_write(&nilfs->ns_sem); + down_write(&nilfs->ns_super_sem); list_del_init(&sbi->s_list); - up_write(&nilfs->ns_sem); + up_write(&nilfs->ns_super_sem); return err; } @@ -462,9 +462,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi) nilfs_mdt_clear(sbi->s_ifile); nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; - down_write(&nilfs->ns_sem); + down_write(&nilfs->ns_super_sem); list_del_init(&sbi->s_list); - up_write(&nilfs->ns_sem); + up_write(&nilfs->ns_super_sem); } static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi) @@ -883,10 +883,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, goto failed_root; } - down_write(&nilfs->ns_sem); + down_write(&nilfs->ns_super_sem); if (!nilfs_test_opt(sbi, SNAPSHOT)) nilfs->ns_current = sbi; - up_write(&nilfs->ns_sem); + up_write(&nilfs->ns_super_sem); return 0; @@ -918,6 +918,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) lock_kernel(); + down_write(&nilfs->ns_super_sem); old_sb_flags = sb->s_flags; old_opts.mount_opt = sbi->s_mount_opt; old_opts.snapshot_cno = sbi->s_snapshot_cno; @@ -965,24 +966,20 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ - down(&sb->s_bdev->bd_mount_sem); - down_read(&nilfs->ns_sem); if (nilfs->ns_current && nilfs->ns_current != sbi) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount because an RW-mount exists.\n", sb->s_id); - up_read(&nilfs->ns_sem); err = -EBUSY; - goto rw_remount_failed; + goto restore_opts; } - up_read(&nilfs->ns_sem); if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) { printk(KERN_WARNING "NILFS (device %s): couldn't " "remount because the current RO-mount is not " "the latest one.\n", sb->s_id); err = -EINVAL; - goto rw_remount_failed; + goto restore_opts; } sb->s_flags &= ~MS_RDONLY; nilfs_clear_opt(sbi, SNAPSHOT); @@ -990,25 +987,24 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = nilfs_attach_segment_constructor(sbi); if (err) - goto rw_remount_failed; + goto restore_opts; down_write(&nilfs->ns_sem); nilfs_setup_super(sbi); - nilfs->ns_current = sbi; up_write(&nilfs->ns_sem); - up(&sb->s_bdev->bd_mount_sem); + nilfs->ns_current = sbi; } out: + up_write(&nilfs->ns_super_sem); unlock_kernel(); return 0; - rw_remount_failed: - up(&sb->s_bdev->bd_mount_sem); restore_opts: sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.mount_opt; sbi->s_snapshot_cno = old_opts.snapshot_cno; + up_write(&nilfs->ns_super_sem); unlock_kernel(); return err; } @@ -1118,15 +1114,15 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, * (i.e. rw-mount or ro-mount), whereas rw-mount and * ro-mount are mutually exclusive. */ - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_super_sem); if (nilfs->ns_current && ((nilfs->ns_current->s_super->s_flags ^ flags) & MS_RDONLY)) { - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_super_sem); err = -EBUSY; goto failed_unlock; } - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_super_sem); } /* diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 221953b..06e8dfd 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -72,6 +72,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); + init_rwsem(&nilfs->ns_super_sem); mutex_init(&nilfs->ns_writer_mutex); INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); @@ -681,10 +682,10 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs, { struct nilfs_sb_info *sbi; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_super_sem); /* * The SNAPSHOT flag and sb->s_flags are supposed to be - * protected with nilfs->ns_sem. + * protected with nilfs->ns_super_sem. */ sbi = nilfs->ns_current; if (rw_mount) { @@ -705,12 +706,12 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs, goto found; /* snapshot mount */ } out: - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_super_sem); return NULL; found: atomic_inc(&sbi->s_count); - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_super_sem); return sbi; } @@ -720,7 +721,7 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, struct nilfs_sb_info *sbi; int ret = 0; - down_read(&nilfs->ns_sem); + down_read(&nilfs->ns_super_sem); if (cno == 0 || cno > nilfs->ns_cno) goto out_unlock; @@ -737,6 +738,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno, ret++; out_unlock: - up_read(&nilfs->ns_sem); + up_read(&nilfs->ns_super_sem); return ret; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index be4c040..d0cf4fb 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -48,6 +48,7 @@ enum { * @ns_bdi: backing dev info * @ns_writer: back pointer to writable nilfs_sb_info * @ns_sem: semaphore for shared states + * @ns_super_sem: semaphore for global operations across super block instances * @ns_writer_mutex: mutex protecting ns_writer attach/detach * @ns_writer_refcount: number of referrers on ns_writer * @ns_current: back pointer to current mount @@ -96,10 +97,15 @@ struct the_nilfs { struct backing_dev_info *ns_bdi; struct nilfs_sb_info *ns_writer; struct rw_semaphore ns_sem; + struct rw_semaphore ns_super_sem; struct mutex ns_writer_mutex; atomic_t ns_writer_refcount; + /* + * components protected by ns_super_sem + */ struct nilfs_sb_info *ns_current; + struct list_head ns_supers; /* * used for @@ -113,7 +119,6 @@ struct the_nilfs { time_t ns_sbwtime[2]; unsigned ns_sbsize; unsigned ns_mount_state; - struct list_head ns_supers; /* * Following fields are dedicated to a writable FS-instance. -- cgit v1.1 From aa7dfb8954ccf49e026ba13d12991a4eb7defb96 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 8 Jun 2009 01:39:33 +0900 Subject: nilfs2: get rid of bd_mount_sem use from nilfs This will remove every bd_mount_sem use in nilfs. The intended exclusion control was replaced by the previous patch ("nilfs2: correct exclusion control in nilfs_remount function") for nilfs_remount(), and this patch will replace remains with a new mutex that this inserts in nilfs object. Signed-off-by: Ryusuke Konishi Cc: Christoph Hellwig Signed-off-by: Al Viro --- fs/nilfs2/cpfile.c | 6 +++--- fs/nilfs2/super.c | 12 ++++++------ fs/nilfs2/the_nilfs.c | 1 + fs/nilfs2/the_nilfs.h | 2 ++ 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 300f1cd..cadd36b 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) case NILFS_CHECKPOINT: /* * Check for protecting existing snapshot mounts: - * bd_mount_sem is used to make this operation atomic and + * ns_mount_mutex is used to make this operation atomic and * exclusive with a new mount job. Though it doesn't cover * umount, it's enough for the purpose. */ - down(&nilfs->ns_bdev->bd_mount_sem); + mutex_lock(&nilfs->ns_mount_mutex); if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) { /* Current implementation does not have to protect plain read-only mounts since they are exclusive @@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode) ret = -EBUSY; } else ret = nilfs_cpfile_clear_snapshot(cpfile, cno); - up(&nilfs->ns_bdev->bd_mount_sem); + mutex_unlock(&nilfs->ns_mount_mutex); return ret; case NILFS_SNAPSHOT: return nilfs_cpfile_set_snapshot(cpfile, cno); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f02762f..1777a34 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -764,7 +764,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, * @silent: silent mode flag * @nilfs: the_nilfs struct * - * This function is called exclusively by bd_mount_mutex. + * This function is called exclusively by nilfs->ns_mount_mutex. * So, the recovery process is protected from other simultaneous mounts. */ static int @@ -1105,7 +1105,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, goto failed; } - down(&sd.bdev->bd_mount_sem); + mutex_lock(&nilfs->ns_mount_mutex); if (!sd.cno) { /* @@ -1164,7 +1164,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, need_to_close = 0; } - up(&sd.bdev->bd_mount_sem); + mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); if (need_to_close) close_bdev_exclusive(sd.bdev, flags); @@ -1172,7 +1172,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, return 0; failed_unlock: - up(&sd.bdev->bd_mount_sem); + mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); failed: close_bdev_exclusive(sd.bdev, flags); @@ -1181,14 +1181,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, cancel_new: /* Abandoning the newly allocated superblock */ - up(&sd.bdev->bd_mount_sem); + mutex_unlock(&nilfs->ns_mount_mutex); put_nilfs(nilfs); up_write(&s->s_umount); deactivate_super(s); /* * deactivate_super() invokes close_bdev_exclusive(). * We must finish all post-cleaning before this call; - * put_nilfs() and unlocking bd_mount_sem need the block device. + * put_nilfs() needs the block device. */ return err; } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 06e8dfd..e4e5c78 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -73,6 +73,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); init_rwsem(&nilfs->ns_super_sem); + mutex_init(&nilfs->ns_mount_mutex); mutex_init(&nilfs->ns_writer_mutex); INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index d0cf4fb..e8adbff 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -49,6 +49,7 @@ enum { * @ns_writer: back pointer to writable nilfs_sb_info * @ns_sem: semaphore for shared states * @ns_super_sem: semaphore for global operations across super block instances + * @ns_mount_mutex: mutex protecting mount process of nilfs * @ns_writer_mutex: mutex protecting ns_writer attach/detach * @ns_writer_refcount: number of referrers on ns_writer * @ns_current: back pointer to current mount @@ -98,6 +99,7 @@ struct the_nilfs { struct nilfs_sb_info *ns_writer; struct rw_semaphore ns_sem; struct rw_semaphore ns_super_sem; + struct mutex ns_mount_mutex; struct mutex ns_writer_mutex; atomic_t ns_writer_refcount; -- cgit v1.1