From 0671e704658b9f26f85e78d51176daa861f955c7 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 10 May 2010 00:00:00 -0400
Subject: ext4: check missed return value in ext4_sync_file()

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/fsync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c323..42bd94a 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		    (journal->j_fs_dev != journal->j_dev) &&
 		    (journal->j_flags & JBD2_BARRIER))
 			blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
-		jbd2_log_wait_commit(journal, commit_tid);
+		ret = jbd2_log_wait_commit(journal, commit_tid);
 	} else if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
 	return ret;
-- 
cgit v1.1


From c26d0bad3d0e951487e5dee36632dd3817f42b10 Mon Sep 17 00:00:00 2001
From: Steven Liu <lingjiujianke@gmail.com>
Date: Tue, 11 May 2010 00:00:00 -0400
Subject: ext4: Fix coding style in fs/ext4/move_extent.c

Making sure ee_block is initialized to zero to prevent gcc from
kvetching.  It's harmless (although it's not obvious that it's
harmless) from code inspection:

fs/ext4/move_extent.c:478: warning: 'start_ext.ee_block' may be used
uninitialized in this function

Thanks to Stefan Richter for first bringing this to the attention of
linux-ext4@vger.kernel.org.

Signed-off-by: LiuQi <lingjiujianke@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 fs/ext4/move_extent.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662..82621a3 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	int depth = ext_depth(orig_inode);
 	int ret;
 
+	start_ext.ee_block = end_ext.ee_block = 0;
 	o_start = o_end = oext = orig_path[depth].p_ext;
 	oext_alen = ext4_ext_get_actual_len(oext);
 	start_ext.ee_len = end_ext.ee_len = 0;
-- 
cgit v1.1


From b720303df7352d4a7a1f61e467e0a124913c0d41 Mon Sep 17 00:00:00 2001
From: Jing Zhang <zj.barak@gmail.com>
Date: Wed, 12 May 2010 00:00:00 -0400
Subject: ext4: fix memory leaks in error path handling of ext4_ext_zeroout()

When EIO occurs after bio is submitted, there is no memory free
operation for bio, which results in memory leakage. And there is also
no check against bio_alloc() for bio.

Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Signed-off-by: Jing Zhang <zj.barak@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834..228eeaf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
-	int ret = -EIO;
+	int ret;
 	struct bio *bio;
 	int blkbits, blocksize;
 	sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 			len = ee_len;
 
 		bio = bio_alloc(GFP_NOIO, len);
+		if (!bio)
+			return -ENOMEM;
+
 		bio->bi_sector = ee_pblock;
 		bio->bi_bdev   = inode->i_sb->s_bdev;
 
@@ -2595,17 +2598,15 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 		submit_bio(WRITE, bio);
 		wait_for_completion(&event);
 
-		if (test_bit(BIO_UPTODATE, &bio->bi_flags))
-			ret = 0;
-		else {
-			ret = -EIO;
-			break;
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+			bio_put(bio);
+			return -EIO;
 		}
 		bio_put(bio);
 		ee_len    -= done;
 		ee_pblock += done  << (blkbits - 9);
 	}
-	return ret;
+	return 0;
 }
 
 #define EXT4_EXT_ZERO_LEN 7
-- 
cgit v1.1


From 62e823a2cba18509ee826d775270e8ef9071b5bc Mon Sep 17 00:00:00 2001
From: Jing Zhang <zj.barak@gmail.com>
Date: Thu, 13 May 2010 00:00:00 -0400
Subject: ext4: Remove unnecessary call to ext4_get_group_desc() in mballoc

Signed-off-by: Jing Zhang <zj.barak@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a36..0550ea3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2025,7 +2025,6 @@ repeat:
 
 		for (i = 0; i < ngroups; group++, i++) {
 			struct ext4_group_info *grp;
-			struct ext4_group_desc *desc;
 
 			if (group == ngroups)
 				group = 0;
@@ -2048,7 +2047,6 @@ repeat:
 			}
 
 			ac->ac_groups_scanned++;
-			desc = ext4_get_group_desc(sb, group, NULL);
 			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
-- 
cgit v1.1


From e39e07fdfd98be8650385f12a7b81d6adc547510 Mon Sep 17 00:00:00 2001
From: Jing Zhang <zj.barak@gmail.com>
Date: Fri, 14 May 2010 00:00:00 -0400
Subject: ext4: rename ext4_mb_release_desc() to ext4_mb_unload_buddy()

This function cleans up after ext4_mb_load_buddy(), so the renaming
makes the code clearer.

Signed-off-by: Jing Zhang <zj.barak@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0550ea3..8639d5a6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1150,7 +1150,7 @@ err:
 	return ret;
 }
 
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
@@ -1617,7 +1617,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 	}
 
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);
 
 	return 0;
 }
@@ -1672,7 +1672,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 		ext4_mb_use_best_found(ac, e4b);
 	}
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);
 
 	return 0;
 }
@@ -2042,7 +2042,7 @@ repeat:
 			if (!ext4_mb_good_group(ac, group, cr)) {
 				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
-				ext4_mb_release_desc(&e4b);
+				ext4_mb_unload_buddy(&e4b);
 				continue;
 			}
 
@@ -2056,7 +2056,7 @@ repeat:
 				ext4_mb_complex_scan_group(ac, &e4b);
 
 			ext4_unlock_group(sb, group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 
 			if (ac->ac_status != AC_STATUS_CONTINUE)
 				break;
@@ -2146,7 +2146,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_lock_group(sb, group);
 	memcpy(&sg, ext4_get_group_info(sb, group), i);
 	ext4_unlock_group(sb, group);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 
 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2566,7 +2566,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		}
 		ext4_unlock_group(sb, entry->group);
 		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 	}
 
 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -3695,7 +3695,7 @@ out:
 	ext4_unlock_group(sb, group);
 	if (ac)
 		kmem_cache_free(ext4_ac_cachep, ac);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
 }
@@ -3799,7 +3799,7 @@ repeat:
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, "Error reading block bitmap for %u",
 					group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 			continue;
 		}
 
@@ -3808,7 +3808,7 @@ repeat:
 		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
 		ext4_unlock_group(sb, group);
 
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		put_bh(bitmap_bh);
 
 		list_del(&pa->u.pa_tmp_list);
@@ -4072,7 +4072,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_mb_release_group_pa(&e4b, pa, ac);
 		ext4_unlock_group(sb, group);
 
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
@@ -4608,7 +4608,7 @@ do_more:
 		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}
 
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 
 	freed += count;
 
-- 
cgit v1.1


From b684b2ee9409f2890a8b3aea98525bbe5f84e276 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Sat, 15 May 2010 00:00:00 -0400
Subject: ext4: allow defrag (EXT4_IOC_MOVE_EXT) in 32bit compat mode

I have an x86_64 kernel with i386 userspace. e4defrag fails on the
EXT4_IOC_MOVE_EXT ioctl because it is not wired up for the compat
case. It seems that struct move_extent is compat save, only types
with fixed widths are used:
{
        __u32 reserved;         /* should be zero */
        __u32 donor_fd;         /* donor file descriptor */
        __u64 orig_start;       /* logical start offset in block for orig */
        __u64 donor_start;      /* logical start offset in block for donor */
        __u64 len;              /* block length to be moved */
        __u64 moved_len;        /* moved block length */
};

Lets just wire up EXT4_IOC_MOVE_EXT for the compat case.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
CC: Akira Fujita <a-fujita@rs.jp.nec.com>
---
 fs/ext4/ioctl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d024..66fa0b0 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -375,6 +375,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		break;
 	case EXT4_IOC_GROUP_ADD:
 		break;
+	case EXT4_IOC_MOVE_EXT:
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
-- 
cgit v1.1


From 35121c9860316d7799cea0fbc359a9186e7c2747 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 16 May 2010 00:00:00 -0400
Subject: ext4: fix quota accounting in case of fallocate

allocated_meta_data is already included in 'used' variable.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 81d6054..55bfcd9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1126,7 +1126,8 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		 */
 		if (allocated_meta_blocks)
 			dquot_claim_block(inode, allocated_meta_blocks);
-		dquot_release_reservation_block(inode, mdb_free + used);
+		dquot_release_reservation_block(inode, mdb_free + used -
+						allocated_meta_blocks);
 	}
 
 	/*
-- 
cgit v1.1


From 42007efd569f1cf3bfb9a61da60ef6c2179508ca Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 01:00:00 -0400
Subject: ext4: check s_log_groups_per_flex in online resize code

If groups_per_flex < 2, sbi->s_flex_groups[] doesn't get filled out,
and every other access to this first tests s_log_groups_per_flex;
same thing needs to happen in resize or we'll wander off into
a null pointer when doing an online resize of the file system.

Thanks to Christoph Biedl, who came up with the trivial testcase:

# truncate --size 128M fsfile
# mkfs.ext3 -F fsfile
# tune2fs -O extents,uninit_bg,dir_index,flex_bg,huge_file,dir_nlink,extra_isize fsfile
# e2fsck -yDf -C0 fsfile
# truncate --size 132M fsfile
# losetup /dev/loop0 fsfile
# mount /dev/loop0 mnt
# resize2fs -p /dev/loop0

	https://bugzilla.kernel.org/show_bug.cgi?id=13549

Reported-by: Alessandro Polverini <alex@nibbles.it>
Test-case-by: Christoph Biedl  <bugzilla.kernel.bpeb@manchmal.in-ulm.de>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/resize.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48..6df797e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));
 
-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
 		atomic_add(input->free_blocks_count,
-- 
cgit v1.1


From 256a453546f1cf2c29a639c79fe0d603761a8e25 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 16 May 2010 02:00:00 -0400
Subject: ext4: symlink must be handled via filesystem specific operation

generic setattr implementation is no longer responsible for
quota transfer so synlinks must be handled via ext4_setattr.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/symlink.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb..ed9354a 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
-- 
cgit v1.1


From 6b0310fbf087ad6e9e3b8392adca97cd77184084 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 02:00:00 -0400
Subject: ext4: don't return to userspace after freezing the fs with a mutex
 held

ext4_freeze() used jbd2_journal_lock_updates() which takes
the j_barrier mutex, and then returns to userspace.  The
kernel does not like this:

================================================
[ BUG: lock held when returning to user space! ]
------------------------------------------------
lvcreate/1075 is leaving the kernel with locks still held!
1 lock held by lvcreate/1075:
 #0:  (&journal->j_barrier){+.+...}, at: [<ffffffff811c6214>]
jbd2_journal_lock_updates+0xe1/0xf0

Use vfs_check_frozen() added to ext4_journal_start_sb() and
ext4_force_commit() instead.

Addresses-Red-Hat-Bugzilla: #568503

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c..00d09f5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -3485,8 +3486,10 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal)
+	if (journal) {
+		vfs_check_frozen(sb, SB_FREEZE_WRITE);
 		ret = ext4_journal_force_commit(journal);
+	}
 
 	return ret;
 }
@@ -3535,18 +3538,16 @@ static int ext4_freeze(struct super_block *sb)
 	 * the journal.
 	 */
 	error = jbd2_journal_flush(journal);
-	if (error < 0) {
-	out:
-		jbd2_journal_unlock_updates(journal);
-		return error;
-	}
+	if (error < 0)
+		goto out;
 
 	/* Journal blocked and flushed, clear needs_recovery flag. */
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
-	if (error)
-		goto out;
-	return 0;
+out:
+	/* we rely on s_frozen to stop further updates */
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	return error;
 }
 
 /*
@@ -3563,7 +3564,6 @@ static int ext4_unfreeze(struct super_block *sb)
 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	ext4_commit_super(sb, 1);
 	unlock_super(sb);
-	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }
 
-- 
cgit v1.1


From a30eec2a8650a77f754e84b2e15f062fe652baa7 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 03:00:00 -0400
Subject: ext4: stop issuing discards if not supported by device

Turn off issuance of discard requests if the device does
not support it - similar to the action we take for barriers.
This will save a little computation time if a non-discardable
device is mounted with -o discard, and also makes it obvious
that it's not doing what was asked at mount time ...

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8639d5a6..4f2d3a9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2534,6 +2534,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			 entry->count, entry->group, entry);
 
 		if (test_opt(sb, DISCARD)) {
+			int ret;
 			ext4_fsblk_t discard_block;
 
 			discard_block = entry->start_blk +
@@ -2541,7 +2542,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			trace_ext4_discard_blocks(sb,
 					(unsigned long long)discard_block,
 					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
+			ret = sb_issue_discard(sb, discard_block, entry->count);
+			if (ret == EOPNOTSUPP) {
+				ext4_warning(sb,
+					"discard not supported, disabling");
+				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+			}
 		}
 
 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
-- 
cgit v1.1


From c445e3e0a5c2804524dec6e55f66d63f6bc5bc3e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 04:00:00 -0400
Subject: ext4: don't scan/accumulate more pages than mballoc will allocate

There was a bug reported on RHEL5 that a 10G dd on a 12G box
had a very, very slow sync after that.

At issue was the loop in write_cache_pages scanning all the way
to the end of the 10G file, even though the subsequent call
to mpage_da_submit_io would only actually write a smallish amt; then
we went back to the write_cache_pages loop ... wasting tons of time
in calling __mpage_da_writepage for thousands of pages we would
just revisit (many times) later.

Upstream it's not such a big issue for sys_sync because we get
to the loop with a much smaller nr_to_write, which limits the loop.

However, talking with Aneesh he realized that fsync upstream still
gets here with a very large nr_to_write and we face the same problem.

This patch makes mpage_add_bh_to_extent stop the loop after we've
accumulated 2048 pages, by setting mpd->io_done = 1; which ultimately
causes the write_cache_pages loop to break.

Repeating the test with a dirty_ratio of 80 (to leave something for
fsync to do), I don't see huge IO performance gains, but the reduction
in cpu usage is striking: 80% usage with stock, and 2% with the
below patch.  Instrumenting the loop in write_cache_pages clearly
shows that we are wasting time here.

Eventually we need to change mpage_da_map_pages() also submit its I/O
to the block layer, subsuming mpage_da_submit_io(), and then change it
call ext4_get_blocks() multiple times.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 55bfcd9..89a31e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2350,6 +2350,15 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	sector_t next;
 	int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
 
+	/*
+	 * XXX Don't go larger than mballoc is willing to allocate
+	 * This is a stopgap solution.  We eventually need to fold
+	 * mpage_da_submit_io() into this function and then call
+	 * ext4_get_blocks() multiple times in a loop
+	 */
+	if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+		goto flush_it;
+
 	/* check if thereserved journal credits might overflow */
 	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
-- 
cgit v1.1


From c35a56a090eacefca07afeb994029b57d8dd8025 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 May 2010 05:00:00 -0400
Subject: jbd2: Improve scalability by not taking j_state_lock in
 jbd2_journal_stop()

One of the most contended locks in the jbd2 layer is j_state_lock when
running dbench.  This is especially true if using the real-time kernel
with its "sleeping spinlocks" patch that replaces spinlocks with
priority inheriting mutexes --- but it also shows up on large SMP
benchmarks.

Thanks to John Stultz for pointing this out.

Reviewed by Mingming Cao and Jan Kara.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/transaction.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f5..e214d68 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_sync)
 		transaction->t_synchronous_commit = 1;
 	current->journal_info = NULL;
-	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
 	transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
 		jbd_debug(2, "transaction too old, requesting commit for "
 					"handle %p\n", handle);
 		/* This is non-blocking */
-		__jbd2_log_start_commit(journal, transaction->t_tid);
-		spin_unlock(&journal->j_state_lock);
+		jbd2_log_start_commit(journal, transaction->t_tid);
 
 		/*
 		 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
 			err = jbd2_log_wait_commit(journal, tid);
 	} else {
 		spin_unlock(&transaction->t_handle_lock);
-		spin_unlock(&journal->j_state_lock);
 	}
 
 	lock_map_release(&handle->h_lockdep_map);
-- 
cgit v1.1


From 21ca087a3891efab4d45488db8febee474d26c68 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 16 May 2010 06:00:00 -0400
Subject: ext4: Do not zero out uninitialized extents beyond i_size

The extents code will sometimes zero out blocks and mark them as
initialized instead of splitting an extent into several smaller ones.
This optimization however, causes problems if the extent is beyond
i_size because fsck will complain if there are uninitialized blocks
after i_size as this can not be distinguished from an inode that has
an incorrect i_size field.

https://bugzilla.kernel.org/show_bug.cgi?id=15742

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 67 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 228eeaf..ee611da 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2631,11 +2631,21 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
 	int ret = 0;
+	int may_zeroout;
+
+	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)iblock, max_blocks);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < iblock + max_blocks)
+		eof_block = iblock + max_blocks;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
@@ -2644,16 +2654,23 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (iblock - ee_block);
 	newblock = iblock - ee_block + ext_pblock(ex);
+
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
 	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
 
+	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully insde i_size or new_size.
+	 */
+	may_zeroout = ee_block + ee_len <= eof_block;
+
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
 	/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-	if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+	if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2684,7 +2701,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	if (allocated > max_blocks) {
 		unsigned int newdepth;
 		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
-		if (allocated <= EXT4_EXT_ZERO_LEN) {
+		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
 			/*
 			 * iblock == ee_block is handled by the zerouout
 			 * at the beginning.
@@ -2760,7 +2777,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
-		if (err == -ENOSPC) {
+		if (err == -ENOSPC && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2784,8 +2801,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 * update the extent length after successful insert of the
 		 * split extent
 		 */
-		orig_ex.ee_len = cpu_to_le16(ee_len -
-						ext4_ext_get_actual_len(ex3));
+		ee_len -= ext4_ext_get_actual_len(ex3);
+		orig_ex.ee_len = cpu_to_le16(ee_len);
+		may_zeroout = ee_block + ee_len <= eof_block;
+
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode, iblock, path);
@@ -2809,7 +2828,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		 * otherwise give the extent a chance to merge to left
 		 */
 		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-							iblock != ee_block) {
+			iblock != ee_block && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2878,7 +2897,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
-	if (err == -ENOSPC) {
+	if (err == -ENOSPC && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
@@ -2938,14 +2957,21 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	struct ext4_extent *ex2 = NULL;
 	struct ext4_extent *ex3 = NULL;
 	struct ext4_extent_header *eh;
-	ext4_lblk_t ee_block;
+	ext4_lblk_t ee_block, eof_block;
 	unsigned int allocated, ee_len, depth;
 	ext4_fsblk_t newblock;
 	int err = 0;
+	int may_zeroout;
+
+	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+		"block %llu, max_blocks %u\n", inode->i_ino,
+		(unsigned long long)iblock, max_blocks);
+
+	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+		inode->i_sb->s_blocksize_bits;
+	if (eof_block < iblock + max_blocks)
+		eof_block = iblock + max_blocks;
 
-	ext_debug("ext4_split_unwritten_extents: inode %lu,"
-		  "iblock %llu, max_blocks %u\n", inode->i_ino,
-		  (unsigned long long)iblock, max_blocks);
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
@@ -2953,12 +2979,19 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	ee_len = ext4_ext_get_actual_len(ex);
 	allocated = ee_len - (iblock - ee_block);
 	newblock = iblock - ee_block + ext_pblock(ex);
+
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
 	orig_ex.ee_len   = cpu_to_le16(ee_len);
 	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
 
 	/*
+	 * It is safe to convert extent to initialized via explicit
+	 * zeroout only if extent is fully insde i_size or new_size.
+	 */
+	may_zeroout = ee_block + ee_len <= eof_block;
+
+	/*
  	 * If the uninitialized extent begins at the same logical
  	 * block where the write begins, and the write completely
  	 * covers the extent, then we don't need to split it.
@@ -2992,7 +3025,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
-		if (err == -ENOSPC) {
+		if (err == -ENOSPC && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -3016,8 +3049,10 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		 * update the extent length after successful insert of the
 		 * split extent
 		 */
-		orig_ex.ee_len = cpu_to_le16(ee_len -
-						ext4_ext_get_actual_len(ex3));
+		ee_len -= ext4_ext_get_actual_len(ex3);
+		orig_ex.ee_len = cpu_to_le16(ee_len);
+		may_zeroout = ee_block + ee_len <= eof_block;
+
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
 		path = ext4_ext_find_extent(inode, iblock, path);
@@ -3063,7 +3098,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	goto out;
 insert:
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-	if (err == -ENOSPC) {
+	if (err == -ENOSPC && may_zeroout) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
 			goto fix_extent_len;
-- 
cgit v1.1


From d17413c08cd2b1dd2bf2cfdbb0f7b736b2b2b15c Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 16 May 2010 07:00:00 -0400
Subject: ext4: clean up inode bitmaps manipulation in ext4_free_inode

- Reorganize locking scheme to batch two atomic operation in to one.
  This also allow us to state what healthy group must obey following rule
  ext4_free_inodes_count(sb, gdp) == ext4_count_free(inode_bitmap, NUM);
- Fix possible undefined pointer dereference.
- Even if group descriptor stats aren't accessible we have to update
  inode bitmaps.
- Move non-group members update out of group_lock.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 85 ++++++++++++++++++++++++++------------------------------
 1 file changed, 39 insertions(+), 46 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef..52618d5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	if (fatal)
 		goto error_return;
 
-	/* Ok, now we can actually update the inode bitmaps.. */
-	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
-					bit, bitmap_bh->b_data);
-	if (!cleared)
-		ext4_error(sb, "bit already cleared for inode %lu", ino);
-	else {
-		gdp = ext4_get_group_desc(sb, block_group, &bh2);
-
+	fatal = -ESRCH;
+	gdp = ext4_get_group_desc(sb, block_group, &bh2);
+	if (gdp) {
 		BUFFER_TRACE(bh2, "get_write_access");
 		fatal = ext4_journal_get_write_access(handle, bh2);
-		if (fatal) goto error_return;
-
-		if (gdp) {
-			ext4_lock_group(sb, block_group);
-			count = ext4_free_inodes_count(sb, gdp) + 1;
-			ext4_free_inodes_set(sb, gdp, count);
-			if (is_directory) {
-				count = ext4_used_dirs_count(sb, gdp) - 1;
-				ext4_used_dirs_set(sb, gdp, count);
-				if (sbi->s_log_groups_per_flex) {
-					ext4_group_t f;
-
-					f = ext4_flex_group(sbi, block_group);
-					atomic_dec(&sbi->s_flex_groups[f].used_dirs);
-				}
+	}
+	ext4_lock_group(sb, block_group);
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	if (fatal || !cleared) {
+		ext4_unlock_group(sb, block_group);
+		goto out;
+	}
 
-			}
-			gdp->bg_checksum = ext4_group_desc_csum(sbi,
-							block_group, gdp);
-			ext4_unlock_group(sb, block_group);
-			percpu_counter_inc(&sbi->s_freeinodes_counter);
-			if (is_directory)
-				percpu_counter_dec(&sbi->s_dirs_counter);
-
-			if (sbi->s_log_groups_per_flex) {
-				ext4_group_t f;
-
-				f = ext4_flex_group(sbi, block_group);
-				atomic_inc(&sbi->s_flex_groups[f].free_inodes);
-			}
-		}
-		BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
-		err = ext4_handle_dirty_metadata(handle, NULL, bh2);
-		if (!fatal) fatal = err;
+	count = ext4_free_inodes_count(sb, gdp) + 1;
+	ext4_free_inodes_set(sb, gdp, count);
+	if (is_directory) {
+		count = ext4_used_dirs_count(sb, gdp) - 1;
+		ext4_used_dirs_set(sb, gdp, count);
+		percpu_counter_dec(&sbi->s_dirs_counter);
 	}
-	BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
-	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
-	if (!fatal)
-		fatal = err;
-	sb->s_dirt = 1;
+	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+	ext4_unlock_group(sb, block_group);
+
+	percpu_counter_inc(&sbi->s_freeinodes_counter);
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t f = ext4_flex_group(sbi, block_group);
+
+		atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+		if (is_directory)
+			atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+	}
+	BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+	fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+	if (cleared) {
+		BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+		if (!fatal)
+			fatal = err;
+		sb->s_dirt = 1;
+	} else
+		ext4_error(sb, "bit already cleared for inode %lu", ino);
+
 error_return:
 	brelse(bitmap_bh);
 	ext4_std_error(sb, fatal);
-- 
cgit v1.1


From 84061e07c5fbbbf9dc8aef8fb750fc3a2dfc31f3 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 16 May 2010 08:00:00 -0400
Subject: ext4: init statistics after journal recovery

Currently block/inode/dir counters initialized before journal was
recovered. In fact after journal recovery this info will probably
change. And freeblocks it critical for correct delalloc mode
accounting.

https://bugzilla.kernel.org/show_bug.cgi?id=15768

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 40 ++++++++++++++++++----------------------
 1 file changed, 18 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 00d09f5..d83cdcb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2794,24 +2794,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext4_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext4_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext4_count_dirs(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	}
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount3;
-	}
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;
 
@@ -2911,6 +2893,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
 no_journal:
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+				  ext4_count_free_blocks(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+					  ext4_count_free_inodes(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+					  ext4_count_dirs(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount_wq;
+	}
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3060,6 +3056,10 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3067,10 +3067,6 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
-- 
cgit v1.1


From 56246f9ae4cfa95b460f9dfbcfb1b772d85db046 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 09:00:00 -0400
Subject: quota: use flags interface for dquot alloc/free space

Switch __dquot_alloc_space and __dquot_free_space to take flags
to indicate whether to warn and/or to reserve (or free reserve).

This is slightly more readable at the callpoints, and makes it
cleaner to add a "nofail" option in the next patch.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/quota/dquot.c         |  8 +++++---
 include/linux/quotaops.h | 25 ++++++++++++++-----------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 788b580..8c40c07 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1488,11 +1488,12 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 /*
  * This operation can block, but only after everything is updated
  */
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
-		int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
 	int cnt, ret = 0;
 	char warntype[MAXQUOTAS];
+	int warn = flags & DQUOT_SPACE_WARN;
+	int reserve = flags & DQUOT_SPACE_RESERVE;
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
@@ -1612,10 +1613,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
 /*
  * This operation can block, but only after everything is updated
  */
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	int reserve = flags & DQUOT_SPACE_RESERVE;
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index e6fa7ac..9edd53c 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -9,6 +9,9 @@
 
 #include <linux/fs.h>
 
+#define DQUOT_SPACE_WARN	0x1
+#define DQUOT_SPACE_RESERVE	0x2
+
 static inline struct quota_info *sb_dqopt(struct super_block *sb)
 {
 	return &sb->s_dquot;
@@ -33,9 +36,8 @@ int dquot_scan_active(struct super_block *sb,
 struct dquot *dquot_alloc(struct super_block *sb, int type);
 void dquot_destroy(struct dquot *dquot);
 
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
-		int warn, int reserve);
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve);
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags);
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags);
 
 int dquot_alloc_inode(const struct inode *inode);
 
@@ -231,17 +233,17 @@ static inline int dquot_transfer(struct inode *inode, struct iattr *iattr)
 }
 
 static inline int __dquot_alloc_space(struct inode *inode, qsize_t number,
-		int warn, int reserve)
+		int flags)
 {
-	if (!reserve)
+	if (!(flags & DQUOT_SPACE_RESERVE))
 		inode_add_bytes(inode, number);
 	return 0;
 }
 
 static inline void __dquot_free_space(struct inode *inode, qsize_t number,
-		int reserve)
+		int flags)
 {
-	if (!reserve)
+	if (!(flags & DQUOT_SPACE_RESERVE))
 		inode_sub_bytes(inode, number);
 }
 
@@ -257,7 +259,7 @@ static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 
 static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
 {
-	return __dquot_alloc_space(inode, nr, 1, 0);
+	return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);
 }
 
 static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
@@ -282,7 +284,7 @@ static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
 
 static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
 {
-	return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0, 0);
+	return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0);
 }
 
 static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
@@ -297,7 +299,8 @@ static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
 
 static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
 {
-	return __dquot_alloc_space(inode, nr << inode->i_blkbits, 1, 1);
+	return __dquot_alloc_space(inode, nr << inode->i_blkbits,
+				DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE);
 }
 
 static inline int dquot_claim_block(struct inode *inode, qsize_t nr)
@@ -334,7 +337,7 @@ static inline void dquot_free_block(struct inode *inode, qsize_t nr)
 static inline void dquot_release_reservation_block(struct inode *inode,
 		qsize_t nr)
 {
-	__dquot_free_space(inode, nr << inode->i_blkbits, 1);
+	__dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE);
 }
 
 #endif /* _LINUX_QUOTAOPS_ */
-- 
cgit v1.1


From 0e05842bc117ea70ceb979cca798fd026879951b Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 10:00:00 -0400
Subject: quota: add the option to not fail with EDQUOT in block

To simplify metadata tracking for delalloc writes, ext4
will simply claim metadata blocks at allocation time, without
first speculatively reserving the worst case and then freeing
what was not used.

To do this, we need a mechanism to track allocations in
the quota subsystem, but potentially allow that allocation
to actually go over quota.

This patch adds a DQUOT_SPACE_NOFAIL flag and function
variants for this purpose.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/quota/dquot.c         |  3 ++-
 include/linux/quotaops.h | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8c40c07..df6832e 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1494,6 +1494,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 	char warntype[MAXQUOTAS];
 	int warn = flags & DQUOT_SPACE_WARN;
 	int reserve = flags & DQUOT_SPACE_RESERVE;
+	int nofail = flags & DQUOT_SPACE_NOFAIL;
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
@@ -1514,7 +1515,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 			continue;
 		ret = check_bdq(inode->i_dquot[cnt], number, !warn,
 				warntype+cnt);
-		if (ret) {
+		if (ret && !nofail) {
 			spin_unlock(&dq_data_lock);
 			goto out_flush_warn;
 		}
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 9edd53c..f8dbeb0 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -11,6 +11,7 @@
 
 #define DQUOT_SPACE_WARN	0x1
 #define DQUOT_SPACE_RESERVE	0x2
+#define DQUOT_SPACE_NOFAIL	0x4
 
 static inline struct quota_info *sb_dqopt(struct super_block *sb)
 {
@@ -262,6 +263,12 @@ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
 	return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);
 }
 
+static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr)
+{
+	__dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL);
+	mark_inode_dirty(inode);
+}
+
 static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
 {
 	int ret;
@@ -277,6 +284,11 @@ static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr)
 	return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits);
 }
 
+static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr)
+{
+	dquot_alloc_space_nofail(inode, nr << inode->i_blkbits);
+}
+
 static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
 {
 	return dquot_alloc_space(inode, nr << inode->i_blkbits);
-- 
cgit v1.1


From 72b8ab9dde211ea518ff27e631b2046ef90c29a2 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Sun, 16 May 2010 11:00:00 -0400
Subject: ext4: don't use quota reservation for speculative metadata

Because we can badly over-reserve metadata when we
calculate worst-case, it complicates things for quota, since
we must reserve and then claim later, retry on EDQUOT, etc.
Quota is also a generally smaller pool than fs free blocks,
so this over-reservation hurts more, and more often.

I'm of the opinion that it's not the worst thing to allow
metadata to push a user slightly over quota.  This simplifies
the code and avoids the false quota rejections that result
from worst-case speculation.

This patch stops the speculative quota-charging for
worst-case metadata requirements, and just charges quota
when the blocks are allocated at writeout.  It also is
able to remove the try-again loop on EDQUOT.

This patch has been tested indirectly by running the xfstests
suite with a hack to mount & enable quota prior to the test.

I also did a more specific test of fragmenting freespace
and then doing a large delalloc write under quota; quota
stopped me at the right amount of file IO, and then the
writeout generated enough metadata (due to the fragmentation)
that it put me slightly over quota, as expected.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c |  5 ++--
 fs/ext4/inode.c  | 69 +++++++++++++++++++-------------------------------------
 2 files changed, 26 insertions(+), 48 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5..95b7594 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
 	if (count)
 		*count = ar.len;
-
 	/*
-	 * Account for the allocated meta blocks
+	 * Account for the allocated meta blocks.  We will never
+	 * fail EDQUOT for metdata, but we do account for it.
 	 */
 	if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
 		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 		EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		dquot_alloc_block_nofail(inode, ar.len);
 	}
 	return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89a31e8..df43217 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1076,7 +1076,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int mdb_free = 0, allocated_meta_blocks = 0;
 
 	spin_lock(&ei->i_block_reservation_lock);
 	trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1090,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
 
 	/* Update per-inode reservations */
 	ei->i_reserved_data_blocks -= used;
-	used += ei->i_allocated_meta_blocks;
 	ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
-	allocated_meta_blocks = ei->i_allocated_meta_blocks;
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+			   used + ei->i_allocated_meta_blocks);
 	ei->i_allocated_meta_blocks = 0;
-	percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
 
 	if (ei->i_reserved_data_blocks == 0) {
 		/*
@@ -1103,31 +1101,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
-		mdb_free = ei->i_reserved_meta_blocks;
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
-		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
 	}
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 
-	/* Update quota subsystem */
-	if (quota_claim) {
+	/* Update quota subsystem for data blocks */
+	if (quota_claim)
 		dquot_claim_block(inode, used);
-		if (mdb_free)
-			dquot_release_reservation_block(inode, mdb_free);
-	} else {
+	else {
 		/*
 		 * We did fallocate with an offset that is already delayed
 		 * allocated. So on delayed allocated writeback we should
-		 * not update the quota for allocated blocks. But then
-		 * converting an fallocate region to initialized region would
-		 * have caused a metadata allocation. So claim quota for
-		 * that
+		 * not re-claim the quota for fallocated blocks.
 		 */
-		if (allocated_meta_blocks)
-			dquot_claim_block(inode, allocated_meta_blocks);
-		dquot_release_reservation_block(inode, mdb_free + used -
-						allocated_meta_blocks);
+		dquot_release_reservation_block(inode, used);
 	}
 
 	/*
@@ -1861,7 +1851,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned long md_needed, md_reserved;
+	unsigned long md_needed;
 	int ret;
 
 	/*
@@ -1871,22 +1861,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 	 */
 repeat:
 	spin_lock(&ei->i_block_reservation_lock);
-	md_reserved = ei->i_reserved_meta_blocks;
 	md_needed = ext4_calc_metadata_amount(inode, lblock);
 	trace_ext4_da_reserve_space(inode, md_needed);
 	spin_unlock(&ei->i_block_reservation_lock);
 
 	/*
-	 * Make quota reservation here to prevent quota overflow
-	 * later. Real quota accounting is done at pages writeout
-	 * time.
+	 * We will charge metadata quota at writeout time; this saves
+	 * us from metadata over-estimation, though we may go over by
+	 * a small amount in the end.  Here we just reserve for data.
 	 */
-	ret = dquot_reserve_block(inode, md_needed + 1);
+	ret = dquot_reserve_block(inode, 1);
 	if (ret)
 		return ret;
-
+	/*
+	 * We do still charge estimated metadata to the sb though;
+	 * we cannot afford to run out of free blocks.
+	 */
 	if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
-		dquot_release_reservation_block(inode, md_needed + 1);
+		dquot_release_reservation_block(inode, 1);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
@@ -1933,12 +1925,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 		 * only when we have written all of the delayed
 		 * allocation blocks.
 		 */
-		to_free += ei->i_reserved_meta_blocks;
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+				   ei->i_reserved_meta_blocks);
 		ei->i_reserved_meta_blocks = 0;
 		ei->i_da_metadata_calc_len = 0;
 	}
 
-	/* update fs dirty blocks counter */
+	/* update fs dirty data blocks counter */
 	percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
 
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -3086,7 +3079,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 			       loff_t pos, unsigned len, unsigned flags,
 			       struct page **pagep, void **fsdata)
 {
-	int ret, retries = 0, quota_retries = 0;
+	int ret, retries = 0;
 	struct page *page;
 	pgoff_t index;
 	unsigned from, to;
@@ -3145,22 +3138,6 @@ retry:
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
-
-	if ((ret == -EDQUOT) &&
-	    EXT4_I(inode)->i_reserved_meta_blocks &&
-	    (quota_retries++ < 3)) {
-		/*
-		 * Since we often over-estimate the number of meta
-		 * data blocks required, we may sometimes get a
-		 * spurios out of quota error even though there would
-		 * be enough space once we write the data blocks and
-		 * find out how many meta data blocks were _really_
-		 * required.  So try forcing the inode write to see if
-		 * that helps.
-		 */
-		write_inode_now(inode, (quota_retries == 3));
-		goto retry;
-	}
 out:
 	return ret;
 }
-- 
cgit v1.1


From d4c402d9fd97a54bb7ed89f9e9869897eb4f6e1f Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Sun, 16 May 2010 12:00:00 -0400
Subject: ext4: Print mount options in when mounting and add a remount message

This adds a "re-mounted" message to ext4_remount(), and both it and
the mount message in ext4_fill_super() now have the original mount
options data string.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d83cdcb..8e655f5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2431,6 +2431,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
 {
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi;
@@ -3037,9 +3038,11 @@ no_journal:
 	} else
 		descr = "out journal";
 
-	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+		"Opts: %s", descr, orig_data);
 
 	lock_kernel();
+	kfree(orig_data);
 	return 0;
 
 cantfind_ext4:
@@ -3086,6 +3089,7 @@ out_fail:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
+	kfree(orig_data);
 	return ret;
 }
 
@@ -3576,6 +3580,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 
 	lock_kernel();
 
@@ -3709,6 +3714,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+
+	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	kfree(orig_data);
 	return 0;
 
 restore_opts:
@@ -3730,6 +3738,7 @@ restore_opts:
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	kfree(orig_data);
 	return err;
 }
 
-- 
cgit v1.1


From fbe845ddf368f77f86aa7500f8fd2690f54c66a8 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Sun, 16 May 2010 13:00:00 -0400
Subject: ext4: Remove extraneous newlines in ext4_msg() calls

Addresses-Google-Bug: #2562325

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 6 +++---
 fs/ext4/super.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index df43217..6aa0442 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2276,7 +2276,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		ext4_msg(mpd->inode->i_sb, KERN_CRIT,
 			 "delayed block allocation failed for inode %lu at "
 			 "logical offset %llu with max blocks %zd with "
-			 "error %d\n", mpd->inode->i_ino,
+			 "error %d", mpd->inode->i_ino,
 			 (unsigned long long) next,
 			 mpd->b_size >> mpd->inode->i_blkbits, err);
 		printk(KERN_CRIT "This should not happen!!  "
@@ -2944,7 +2944,7 @@ retry:
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
-			       "%ld pages, ino %lu; err %d\n", __func__,
+			       "%ld pages, ino %lu; err %d", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
 			goto out_writepages;
 		}
@@ -3019,7 +3019,7 @@ retry:
 	if (pages_skipped != wbc->pages_skipped)
 		ext4_msg(inode->i_sb, KERN_CRIT,
 			 "This should not happen leaving %s "
-			 "with nr_to_write = %ld ret = %d\n",
+			 "with nr_to_write = %ld ret = %d",
 			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8e655f5..c597ec7 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2999,7 +2999,7 @@ no_journal:
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
-			 "zone (%d)\n", err);
+			 "zone (%d)", err);
 		goto failed_mount4;
 	}
 
-- 
cgit v1.1


From 6d19c42b7cf81c39632b6d4dbc514e8449bcd346 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Sun, 16 May 2010 14:00:00 -0400
Subject: ext4: Prevent creation of files larger than RLIMIT_FSIZE using
 fallocate

Currently using posix_fallocate one can bypass an RLIMIT_FSIZE limit
and create a file larger than the limit. Add a check for that.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Amit Arora <aarora@in.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ee611da..8a8f9f0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3708,6 +3708,11 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	 */
 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, (len + offset));
+	if (ret) {
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
 retry:
 	while (ret >= 0 && ret < max_blocks) {
 		block = block + ret;
-- 
cgit v1.1


From 8a57d9d61a6e361c7bb159dda797672c1df1a691 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Sun, 16 May 2010 15:00:00 -0400
Subject: ext4: check for a good block group before loading buddy pages

This adds a new field in ext4_group_info to cache the largest available
block range in a block group; and don't load the buddy pages until *after*
we've done a sanity check on the block group.

With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
partitions, it's easy to have no block groups with a block extent large
enough to satisfy the input request length.  This currently causes the loop
during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
for EVERY block group.  That can be a lot of pages.  The patch below allows
us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
have check again after we lock the block group).

Addresses-Google-Bug: #2578108
Addresses-Google-Bug: #2704453

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/mballoc.c | 70 ++++++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf..d266003 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1678,6 +1678,7 @@ struct ext4_group_info {
 	ext4_grpblk_t	bb_first_free;	/* first free block */
 	ext4_grpblk_t	bb_free;	/* total free blocks */
 	ext4_grpblk_t	bb_fragments;	/* nr of freespace fragments */
+	ext4_grpblk_t	bb_largest_free_order;/* order of largest frag in BG */
 	struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
 	void            *bb_bitmap;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4f2d3a9..aa499fe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
 	}
 }
 
+/*
+ * Cache the order of the largest free extent we have available in this block
+ * group.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+	int i;
+	int bits;
+
+	grp->bb_largest_free_order = -1; /* uninit */
+
+	bits = sb->s_blocksize_bits + 1;
+	for (i = bits; i >= 0; i--) {
+		if (grp->bb_counters[i] > 0) {
+			grp->bb_largest_free_order = i;
+			break;
+		}
+	}
+}
+
 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
 				void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 		 */
 		grp->bb_free = free;
 	}
+	mb_set_largest_free_order(sb, grp);
 
 	clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -910,6 +935,11 @@ out:
 	return err;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1034,11 @@ err:
 	return ret;
 }
 
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1299,6 +1334,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }
 
@@ -1427,6 +1463,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1821,16 +1858,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }
 
+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}
 
 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1843,17 +1886,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
 
+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;
 
-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -2024,14 +2066,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;
 
 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-
 			if (group == ngroups)
 				group = 0;
 
-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;
 
 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2039,8 +2078,12 @@ repeat:
 				goto out;
 
 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
 				ext4_mb_unload_buddy(&e4b);
 				continue;
@@ -2253,6 +2296,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 
 #ifdef DOUBLE_CHECK
 	{
-- 
cgit v1.1


From 291dae472a8976ff461f24c848fee2a03f9ea3c8 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Sun, 16 May 2010 16:00:00 -0400
Subject: ext4: Fix for ext4_mb_collect_stats()

Fix ext4_mb_collect_stats() to use the correct test for s_bal_success; it
should be testing "best-extent.fe_len >= orig-extent.fe_len" , not
"orig-extent.fe_len >= goal-extent.fe_len" .

Signed-off-by: Curt Wohlgemuth <curtw@google.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index aa499fe..e5dcdc9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3029,7 +3029,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
 		atomic_inc(&sbi->s_bal_reqs);
 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
-		if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
 			atomic_inc(&sbi->s_bal_success);
 		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
 		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
-- 
cgit v1.1


From 39a4bade8c1826b658316d66ee81c09b0a4d7d42 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sun, 16 May 2010 17:00:00 -0400
Subject: ext4: Show journal_checksum option

We failed to show journal_checksum option in /proc/mounts. Fix it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c597ec7..2e8790e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -942,6 +942,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
 		seq_puts(seq, ",journal_async_commit");
+	else if (test_opt(sb, JOURNAL_CHECKSUM))
+		seq_puts(seq, ",journal_checksum");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
-- 
cgit v1.1


From 8e48dcfbd7c0892b4cfd064d682cc4c95a29df32 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 May 2010 18:00:00 -0400
Subject: ext4: Use our own write_cache_pages()

Make a copy of write_cache_pages() for the benefit of
ext4_da_writepages().  This allows us to simplify the code some, and
will allow us to further customize the code in future patches.

There are some nasty hacks in write_cache_pages(), which Linus has
(correctly) characterized as vile.  I've just copied it into
write_cache_pages_da(), without trying to clean those bits up lest I
break something in the ext4's delalloc implementation, which is a bit
fragile right now.  This will allow Dave Chinner to clean up
write_cache_pages() in mm/page-writeback.c, without worrying about
breaking ext4.  Eventually write_cache_pages_da() will go away when I
rewrite ext4's delayed allocation and create a general
ext4_writepages() which is used for all of ext4's writeback.  Until
now this is the lowest risk way to clean up the core
write_cache_pages() function.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Dave Chinner <david@fromorbit.com>
---
 fs/ext4/inode.c | 141 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 119 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6aa0442..830336d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2426,17 +2426,6 @@ static int __mpage_da_writepage(struct page *page,
 	struct buffer_head *bh, *head;
 	sector_t logical;
 
-	if (mpd->io_done) {
-		/*
-		 * Rest of the page in the page_vec
-		 * redirty then and skip then. We will
-		 * try to write them again after
-		 * starting a new transaction
-		 */
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return MPAGE_DA_EXTENT_TAIL;
-	}
 	/*
 	 * Can we merge this page to current extent?
 	 */
@@ -2831,6 +2820,124 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 	return ext4_chunk_trans_blocks(inode, max_blocks);
 }
 
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call the callback function (which usually writes
+ * the pages).
+ *
+ * This is a forked version of write_cache_pages().  Differences:
+ *	Range cyclic is ignored.
+ *	no_nrwrite_index_update is always presumed true
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+				struct writeback_control *wbc,
+				struct mpage_da_data *mpd)
+{
+	int ret = 0;
+	int done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	long nr_to_write = wbc->nr_to_write;
+
+	pagevec_init(&pvec, 0);
+	index = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	while (!done && (index <= end)) {
+		int i;
+
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			      PAGECACHE_TAG_DIRTY,
+			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point, the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or
+			 * even swizzled back from swapper_space to tmpfs file
+			 * mapping. However, page->index will not change
+			 * because we have a reference on the page.
+			 */
+			if (page->index > end) {
+				done = 1;
+				break;
+			}
+
+			lock_page(page);
+
+			/*
+			 * Page truncated or invalidated. We can freely skip it
+			 * then, even for data integrity operations: the page
+			 * has disappeared concurrently, so there could be no
+			 * real expectation of this data interity operation
+			 * even if there is now a new, dirty page at the same
+			 * pagecache address.
+			 */
+			if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+				unlock_page(page);
+				continue;
+			}
+
+			if (!PageDirty(page)) {
+				/* someone wrote it for us */
+				goto continue_unlock;
+			}
+
+			if (PageWriteback(page)) {
+				if (wbc->sync_mode != WB_SYNC_NONE)
+					wait_on_page_writeback(page);
+				else
+					goto continue_unlock;
+			}
+
+			BUG_ON(PageWriteback(page));
+			if (!clear_page_dirty_for_io(page))
+				goto continue_unlock;
+
+			ret = __mpage_da_writepage(page, wbc, mpd);
+			if (unlikely(ret)) {
+				if (ret == AOP_WRITEPAGE_ACTIVATE) {
+					unlock_page(page);
+					ret = 0;
+				} else {
+					done = 1;
+					break;
+				}
+			}
+
+			if (nr_to_write > 0) {
+				nr_to_write--;
+				if (nr_to_write == 0 &&
+				    wbc->sync_mode == WB_SYNC_NONE) {
+					/*
+					 * We stop writing back only if we are
+					 * not doing integrity sync. In case of
+					 * integrity sync we have to keep going
+					 * because someone may be concurrently
+					 * dirtying pages, and we might have
+					 * synced a lot of newly appeared dirty
+					 * pages, but have not synced all of the
+					 * old dirty pages.
+					 */
+					done = 1;
+					break;
+				}
+			}
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	return ret;
+}
+
+
 static int ext4_da_writepages(struct address_space *mapping,
 			      struct writeback_control *wbc)
 {
@@ -2839,7 +2946,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	handle_t *handle = NULL;
 	struct mpage_da_data mpd;
 	struct inode *inode = mapping->host;
-	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
 	unsigned int max_pages;
@@ -2919,12 +3025,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
-	/*
-	 * we don't want write_cache_pages to update
-	 * nr_to_write and writeback_index
-	 */
-	no_nrwrite_index_update = wbc->no_nrwrite_index_update;
-	wbc->no_nrwrite_index_update = 1;
 	pages_skipped = wbc->pages_skipped;
 
 retry:
@@ -2966,8 +3066,7 @@ retry:
 		mpd.io_done = 0;
 		mpd.pages_written = 0;
 		mpd.retval = 0;
-		ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
-					&mpd);
+		ret = write_cache_pages_da(mapping, wbc, &mpd);
 		/*
 		 * If we have a contiguous extent of pages and we
 		 * haven't done the I/O yet, map the blocks and submit
@@ -3033,8 +3132,6 @@ retry:
 		mapping->writeback_index = index;
 
 out_writepages:
-	if (!no_nrwrite_index_update)
-		wbc->no_nrwrite_index_update = 0;
 	wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
-- 
cgit v1.1


From e35fd6609b2fee54484d520deccb8f18bf7d38f3 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 May 2010 19:00:00 -0400
Subject: ext4: Add new abstraction ext4_map_blocks() underneath
 ext4_get_blocks()

Jack up ext4_get_blocks() and add a new function, ext4_map_blocks()
which uses a much smaller structure, struct ext4_map_blocks which is
20 bytes, as opposed to a struct buffer_head, which nearly 5 times
bigger on an x86_64 machine.  By switching things to use
ext4_map_blocks(), we can save stack space by using ext4_map_blocks()
since we can avoid allocating a struct buffer_head on the stack.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  30 ++++++-
 fs/ext4/extents.c | 237 ++++++++++++++++++++++++++----------------------------
 fs/ext4/inode.c   | 102 +++++++++++++----------
 3 files changed, 199 insertions(+), 170 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d266003..57fc0e5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -126,6 +126,29 @@ struct ext4_allocation_request {
 };
 
 /*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks().  It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW		(1 << BH_New)
+#define EXT4_MAP_MAPPED		(1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN	(1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY	(1 << BH_Boundary)
+#define EXT4_MAP_UNINIT		(1 << BH_Uninit)
+#define EXT4_MAP_FLAGS		(EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+				 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+				 EXT4_MAP_UNINIT)
+
+struct ext4_map_blocks {
+	ext4_fsblk_t m_pblk;
+	ext4_lblk_t m_lblk;
+	unsigned int m_len;
+	unsigned int m_flags;
+};
+
+/*
  * For delayed allocation tracking
  */
 struct mpage_da_data {
@@ -1773,9 +1796,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
-extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t iblock, unsigned int max_blocks,
-			       struct buffer_head *bh_result, int flags);
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
@@ -1783,6 +1805,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			  ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+			   struct ext4_map_blocks *map, int flags);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 8a8f9f0..37f9387 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2611,7 +2611,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 
 #define EXT4_EXT_ZERO_LEN 7
 /*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
  * extent into multiple extents (upto three - one initialized and two
  * uninitialized).
@@ -2621,10 +2621,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
  *   c> Splits in three extents: Somone is writing in middle of the extent
  */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
-						struct inode *inode,
-						struct ext4_ext_path *path,
-						ext4_lblk_t iblock,
-						unsigned int max_blocks)
+					   struct inode *inode,
+					   struct ext4_map_blocks *map,
+					   struct ext4_ext_path *path)
 {
 	struct ext4_extent *ex, newex, orig_ex;
 	struct ext4_extent *ex1 = NULL;
@@ -2640,20 +2639,20 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 	ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)iblock, max_blocks);
+		(unsigned long long)map->m_lblk, map->m_len);
 
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
-	if (eof_block < iblock + max_blocks)
-		eof_block = iblock + max_blocks;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (iblock - ee_block);
-	newblock = iblock - ee_block + ext_pblock(ex);
+	allocated = ee_len - (map->m_lblk - ee_block);
+	newblock = map->m_lblk - ee_block + ext_pblock(ex);
 
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
@@ -2683,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		return allocated;
 	}
 
-	/* ex1: ee_block to iblock - 1 : uninitialized */
-	if (iblock > ee_block) {
+	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+	if (map->m_lblk > ee_block) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
@@ -2695,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
 	 * overlap of blocks.
 	 */
-	if (!ex1 && allocated > max_blocks)
-		ex2->ee_len = cpu_to_le16(max_blocks);
+	if (!ex1 && allocated > map->m_len)
+		ex2->ee_len = cpu_to_le16(map->m_len);
 	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unsigned int newdepth;
 		/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
 		if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
 			/*
-			 * iblock == ee_block is handled by the zerouout
+			 * map->m_lblk == ee_block is handled by the zerouout
 			 * at the beginning.
 			 * Mark first half uninitialized.
 			 * Mark second half initialized and zero out the
@@ -2716,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_dirty(handle, inode, path + depth);
 
 			ex3 = &newex;
-			ex3->ee_block = cpu_to_le32(iblock);
+			ex3->ee_block = cpu_to_le32(map->m_lblk);
 			ext4_ext_store_pblock(ex3, newblock);
 			ex3->ee_len = cpu_to_le16(allocated);
 			err = ext4_ext_insert_extent(handle, inode, path,
@@ -2729,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				ex->ee_len   = orig_ex.ee_len;
 				ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 				ext4_ext_dirty(handle, inode, path + depth);
-				/* blocks available from iblock */
+				/* blocks available from map->m_lblk */
 				return allocated;
 
 			} else if (err)
@@ -2751,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 				 */
 				depth = ext_depth(inode);
 				ext4_ext_drop_refs(path);
-				path = ext4_ext_find_extent(inode,
-								iblock, path);
+				path = ext4_ext_find_extent(inode, map->m_lblk,
+							    path);
 				if (IS_ERR(path)) {
 					err = PTR_ERR(path);
 					return err;
@@ -2772,9 +2771,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			return allocated;
 		}
 		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
-		ext4_ext_store_pblock(ex3, newblock + max_blocks);
-		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+		ext4_ext_store_pblock(ex3, newblock + map->m_len);
+		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
 		if (err == -ENOSPC && may_zeroout) {
@@ -2787,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 
 		} else if (err)
@@ -2807,7 +2806,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, iblock, path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -2821,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		if (err)
 			goto out;
 
-		allocated = max_blocks;
+		allocated = map->m_len;
 
 		/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
 		 * to insert a extent in the middle zerout directly
 		 * otherwise give the extent a chance to merge to left
 		 */
 		if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-			iblock != ee_block && may_zeroout) {
+			map->m_lblk != ee_block && may_zeroout) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
 				goto fix_extent_len;
@@ -2838,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zero out the first half */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 		}
 	}
@@ -2849,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	 */
 	if (ex1 && ex1 != ex) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
-	/* ex2: iblock to iblock + maxblocks-1 : initialised */
-	ex2->ee_block = cpu_to_le32(iblock);
+	/* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
+	ex2->ee_block = cpu_to_le32(map->m_lblk);
 	ext4_ext_store_pblock(ex2, newblock);
 	ex2->ee_len = cpu_to_le16(allocated);
 	if (ex2 != ex)
@@ -2924,7 +2923,7 @@ fix_extent_len:
 }
 
 /*
- * This function is called by ext4_ext_get_blocks() from
+ * This function is called by ext4_ext_map_blocks() from
  * ext4_get_blocks_dio_write() when DIO to write
  * to an uninitialized extent.
  *
@@ -2947,9 +2946,8 @@ fix_extent_len:
  */
 static int ext4_split_unwritten_extents(handle_t *handle,
 					struct inode *inode,
+					struct ext4_map_blocks *map,
 					struct ext4_ext_path *path,
-					ext4_lblk_t iblock,
-					unsigned int max_blocks,
 					int flags)
 {
 	struct ext4_extent *ex, newex, orig_ex;
@@ -2965,20 +2963,20 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 
 	ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
 		"block %llu, max_blocks %u\n", inode->i_ino,
-		(unsigned long long)iblock, max_blocks);
+		(unsigned long long)map->m_lblk, map->m_len);
 
 	eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
 		inode->i_sb->s_blocksize_bits;
-	if (eof_block < iblock + max_blocks)
-		eof_block = iblock + max_blocks;
+	if (eof_block < map->m_lblk + map->m_len)
+		eof_block = map->m_lblk + map->m_len;
 
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 	ee_block = le32_to_cpu(ex->ee_block);
 	ee_len = ext4_ext_get_actual_len(ex);
-	allocated = ee_len - (iblock - ee_block);
-	newblock = iblock - ee_block + ext_pblock(ex);
+	allocated = ee_len - (map->m_lblk - ee_block);
+	newblock = map->m_lblk - ee_block + ext_pblock(ex);
 
 	ex2 = ex;
 	orig_ex.ee_block = ex->ee_block;
@@ -2996,16 +2994,16 @@ static int ext4_split_unwritten_extents(handle_t *handle,
  	 * block where the write begins, and the write completely
  	 * covers the extent, then we don't need to split it.
  	 */
-	if ((iblock == ee_block) && (allocated <= max_blocks))
+	if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
 		return allocated;
 
 	err = ext4_ext_get_access(handle, inode, path + depth);
 	if (err)
 		goto out;
-	/* ex1: ee_block to iblock - 1 : uninitialized */
-	if (iblock > ee_block) {
+	/* ex1: ee_block to map->m_lblk - 1 : uninitialized */
+	if (map->m_lblk > ee_block) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
@@ -3014,15 +3012,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
 	 * overlap of blocks.
 	 */
-	if (!ex1 && allocated > max_blocks)
-		ex2->ee_len = cpu_to_le16(max_blocks);
+	if (!ex1 && allocated > map->m_len)
+		ex2->ee_len = cpu_to_le16(map->m_len);
 	/* ex3: to ee_block + ee_len : uninitialised */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unsigned int newdepth;
 		ex3 = &newex;
-		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
-		ext4_ext_store_pblock(ex3, newblock + max_blocks);
-		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
+		ext4_ext_store_pblock(ex3, newblock + map->m_len);
+		ex3->ee_len = cpu_to_le16(allocated - map->m_len);
 		ext4_ext_mark_uninitialized(ex3);
 		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
 		if (err == -ENOSPC && may_zeroout) {
@@ -3035,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
 			ext4_ext_dirty(handle, inode, path + depth);
 			/* zeroed the full extent */
-			/* blocks available from iblock */
+			/* blocks available from map->m_lblk */
 			return allocated;
 
 		} else if (err)
@@ -3055,7 +3053,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 
 		depth = newdepth;
 		ext4_ext_drop_refs(path);
-		path = ext4_ext_find_extent(inode, iblock, path);
+		path = ext4_ext_find_extent(inode, map->m_lblk, path);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
 			goto out;
@@ -3069,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 		if (err)
 			goto out;
 
-		allocated = max_blocks;
+		allocated = map->m_len;
 	}
 	/*
 	 * If there was a change of depth as part of the
@@ -3078,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
 	 */
 	if (ex1 && ex1 != ex) {
 		ex1 = ex;
-		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
 		ext4_ext_mark_uninitialized(ex1);
 		ex2 = &newex;
 	}
 	/*
-	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
-	 * uninitialised still.
+	 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
+	 * using direct I/O, uninitialised still.
 	 */
-	ex2->ee_block = cpu_to_le32(iblock);
+	ex2->ee_block = cpu_to_le32(map->m_lblk);
 	ext4_ext_store_pblock(ex2, newblock);
 	ex2->ee_len = cpu_to_le16(allocated);
 	ext4_ext_mark_uninitialized(ex2);
@@ -3188,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
 
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock, unsigned int max_blocks,
+			struct ext4_map_blocks *map,
 			struct ext4_ext_path *path, int flags,
-			unsigned int allocated, struct buffer_head *bh_result,
-			ext4_fsblk_t newblock)
+			unsigned int allocated, ext4_fsblk_t newblock)
 {
 	int ret = 0;
 	int err = 0;
@@ -3199,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 
 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
 		  "block %llu, max_blocks %u, flags %d, allocated %u",
-		  inode->i_ino, (unsigned long long)iblock, max_blocks,
+		  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
 		  flags, allocated);
 	ext4_ext_show_leaf(inode, path);
 
 	/* get_block() before submit the IO, split the extent */
 	if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-		ret = ext4_split_unwritten_extents(handle,
-						inode, path, iblock,
-						max_blocks, flags);
+		ret = ext4_split_unwritten_extents(handle, inode, map,
+						   path, flags);
 		/*
 		 * Flag the inode(non aio case) or end_io struct (aio case)
 		 * that this IO needs to convertion to written when IO is
@@ -3218,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		else
 			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		if (ext4_should_dioread_nolock(inode))
-			set_buffer_uninit(bh_result);
+			map->m_flags |= EXT4_MAP_UNINIT;
 		goto out;
 	}
 	/* IO end_io complete, convert the filled extent to written */
@@ -3246,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		 * the buffer head will be unmapped so that
 		 * a read from the block returns 0s.
 		 */
-		set_buffer_unwritten(bh_result);
+		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		goto out1;
 	}
 
 	/* buffered write, writepage time, convert*/
-	ret = ext4_ext_convert_to_initialized(handle, inode,
-						path, iblock,
-						max_blocks);
+	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
 	if (ret >= 0)
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -3262,7 +3256,7 @@ out:
 		goto out2;
 	} else
 		allocated = ret;
-	set_buffer_new(bh_result);
+	map->m_flags |= EXT4_MAP_NEW;
 	/*
 	 * if we allocated more blocks than requested
 	 * we need to make sure we unmap the extra block
@@ -3270,11 +3264,11 @@ out:
 	 * unmapped later when we find the buffer_head marked
 	 * new.
 	 */
-	if (allocated > max_blocks) {
+	if (allocated > map->m_len) {
 		unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
-					newblock + max_blocks,
-					allocated - max_blocks);
-		allocated = max_blocks;
+					newblock + map->m_len,
+					allocated - map->m_len);
+		allocated = map->m_len;
 	}
 
 	/*
@@ -3288,13 +3282,13 @@ out:
 		ext4_da_update_reserve_space(inode, allocated, 0);
 
 map_out:
-	set_buffer_mapped(bh_result);
+	map->m_flags |= EXT4_MAP_MAPPED;
 out1:
-	if (allocated > max_blocks)
-		allocated = max_blocks;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
-	bh_result->b_bdev = inode->i_sb->s_bdev;
-	bh_result->b_blocknr = newblock;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
 out2:
 	if (path) {
 		ext4_ext_drop_refs(path);
@@ -3320,10 +3314,8 @@ out2:
  *
  * return < 0, error case.
  */
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
-			ext4_lblk_t iblock,
-			unsigned int max_blocks, struct buffer_head *bh_result,
-			int flags)
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+			struct ext4_map_blocks *map, int flags)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
@@ -3334,12 +3326,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
-	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %lu\n",
-			iblock, max_blocks, inode->i_ino);
+		  map->m_lblk, map->m_len, inode->i_ino);
 
 	/* check in cache */
-	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+	cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
 	if (cache_type) {
 		if (cache_type == EXT4_EXT_CACHE_GAP) {
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3352,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			/* we should allocate requested block */
 		} else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
 			/* block is already allocated */
-			newblock = iblock
+			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
 				   + ext_pblock(&newex);
 			/* number of remaining blocks in the extent */
 			allocated = ext4_ext_get_actual_len(&newex) -
-					(iblock - le32_to_cpu(newex.ee_block));
+				(map->m_lblk - le32_to_cpu(newex.ee_block));
 			goto out;
 		} else {
 			BUG();
@@ -3365,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	/* find extent for this block */
-	path = ext4_ext_find_extent(inode, iblock, NULL);
+	path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
 	if (IS_ERR(path)) {
 		err = PTR_ERR(path);
 		path = NULL;
@@ -3382,7 +3373,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
 		EXT4_ERROR_INODE(inode, "bad extent address "
 				 "iblock: %d, depth: %d pblock %lld",
-				 iblock, depth, path[depth].p_block);
+				 map->m_lblk, depth, path[depth].p_block);
 		err = -EIO;
 		goto out2;
 	}
@@ -3400,12 +3391,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 */
 		ee_len = ext4_ext_get_actual_len(ex);
 		/* if found extent covers block, simply return it */
-		if (in_range(iblock, ee_block, ee_len)) {
-			newblock = iblock - ee_block + ee_start;
+		if (in_range(map->m_lblk, ee_block, ee_len)) {
+			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
-			allocated = ee_len - (iblock - ee_block);
-			ext_debug("%u fit into %u:%d -> %llu\n", iblock,
-					ee_block, ee_len, newblock);
+			allocated = ee_len - (map->m_lblk - ee_block);
+			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
+				  ee_block, ee_len, newblock);
 
 			/* Do not put uninitialized extent in the cache */
 			if (!ext4_ext_is_uninitialized(ex)) {
@@ -3415,8 +3406,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				goto out;
 			}
 			ret = ext4_ext_handle_uninitialized_extents(handle,
-					inode, iblock, max_blocks, path,
-					flags, allocated, bh_result, newblock);
+					inode, map, path, flags, allocated,
+					newblock);
 			return ret;
 		}
 	}
@@ -3430,7 +3421,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 		 * put just found gap into cache to speed up
 		 * subsequent requests
 		 */
-		ext4_ext_put_gap_in_cache(inode, path, iblock);
+		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
 		goto out2;
 	}
 	/*
@@ -3438,11 +3429,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 */
 
 	/* find neighbour allocated blocks */
-	ar.lleft = iblock;
+	ar.lleft = map->m_lblk;
 	err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
 	if (err)
 		goto out2;
-	ar.lright = iblock;
+	ar.lright = map->m_lblk;
 	err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
 	if (err)
 		goto out2;
@@ -3453,26 +3444,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
 	 * EXT_UNINIT_MAX_LEN.
 	 */
-	if (max_blocks > EXT_INIT_MAX_LEN &&
+	if (map->m_len > EXT_INIT_MAX_LEN &&
 	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		max_blocks = EXT_INIT_MAX_LEN;
-	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+		map->m_len = EXT_INIT_MAX_LEN;
+	else if (map->m_len > EXT_UNINIT_MAX_LEN &&
 		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-		max_blocks = EXT_UNINIT_MAX_LEN;
+		map->m_len = EXT_UNINIT_MAX_LEN;
 
-	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
-	newex.ee_block = cpu_to_le32(iblock);
-	newex.ee_len = cpu_to_le16(max_blocks);
+	/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
+	newex.ee_block = cpu_to_le32(map->m_lblk);
+	newex.ee_len = cpu_to_le16(map->m_len);
 	err = ext4_ext_check_overlap(inode, &newex, path);
 	if (err)
 		allocated = ext4_ext_get_actual_len(&newex);
 	else
-		allocated = max_blocks;
+		allocated = map->m_len;
 
 	/* allocate new block */
 	ar.inode = inode;
-	ar.goal = ext4_ext_find_goal(inode, path, iblock);
-	ar.logical = iblock;
+	ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
+	ar.logical = map->m_lblk;
 	ar.len = allocated;
 	if (S_ISREG(inode->i_mode))
 		ar.flags = EXT4_MB_HINT_DATA;
@@ -3506,7 +3497,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 						     EXT4_STATE_DIO_UNWRITTEN);
 		}
 		if (ext4_should_dioread_nolock(inode))
-			set_buffer_uninit(bh_result);
+			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
 	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
@@ -3518,7 +3509,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			goto out2;
 		}
 		last_ex = EXT_LAST_EXTENT(eh);
-		if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+		if (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block)
 		    + ext4_ext_get_actual_len(last_ex))
 			EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
 	}
@@ -3536,9 +3527,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
-	if (allocated > max_blocks)
-		allocated = max_blocks;
-	set_buffer_new(bh_result);
+	if (allocated > map->m_len)
+		allocated = map->m_len;
+	map->m_flags |= EXT4_MAP_NEW;
 
 	/*
 	 * Update reserved blocks/metadata blocks after successful
@@ -3552,18 +3543,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * when it is _not_ an uninitialized extent.
 	 */
 	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+		ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
 		ext4_update_inode_fsync_trans(handle, inode, 1);
 	} else
 		ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
-	if (allocated > max_blocks)
-		allocated = max_blocks;
+	if (allocated > map->m_len)
+		allocated = map->m_len;
 	ext4_ext_show_leaf(inode, path);
-	set_buffer_mapped(bh_result);
-	bh_result->b_bdev = inode->i_sb->s_bdev;
-	bh_result->b_blocknr = newblock;
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = newblock;
+	map->m_len = allocated;
 out2:
 	if (path) {
 		ext4_ext_drop_refs(path);
@@ -3729,7 +3720,7 @@ retry:
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
 				    inode->i_ino, block, max_blocks);
@@ -3806,7 +3797,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
 				    inode->i_ino, block, max_blocks);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 830336d..ff2f5fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
 	int ret;
 
 	/*
-	 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+	 * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
 	 * moment, get_block can be called only for blocks inside i_size since
 	 * page cache has been already dropped and writes are blocked by
 	 * i_mutex. So we can safely drop the i_data_sem here.
@@ -890,9 +890,9 @@ err_out:
 }
 
 /*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
  * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
  *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +917,8 @@ err_out:
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
  * blocks.
  */
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
-			       ext4_lblk_t iblock, unsigned int maxblocks,
-			       struct buffer_head *bh_result,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map,
 			       int flags)
 {
 	int err = -EIO;
@@ -935,7 +934,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
-	depth = ext4_block_to_path(inode, iblock, offsets,
+	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 				   &blocks_to_boundary);
 
 	if (depth == 0)
@@ -946,10 +945,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
 		first_block = le32_to_cpu(chain[depth - 1].key);
-		clear_buffer_new(bh_result);
 		count++;
 		/*map more blocks*/
-		while (count < maxblocks && count <= blocks_to_boundary) {
+		while (count < map->m_len && count <= blocks_to_boundary) {
 			ext4_fsblk_t blk;
 
 			blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +967,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Okay, we need to do block allocation.
 	*/
-	goal = ext4_find_goal(inode, iblock, partial);
+	goal = ext4_find_goal(inode, map->m_lblk, partial);
 
 	/* the number of blocks need to allocate for [d,t]indirect blocks */
 	indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +977,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * direct blocks to allocate for this branch.
 	 */
 	count = ext4_blks_to_allocate(partial, indirect_blks,
-					maxblocks, blocks_to_boundary);
+				      map->m_len, blocks_to_boundary);
 	/*
 	 * Block out ext4_truncate while we alter the tree
 	 */
-	err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+	err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
 				&count, goal,
 				offsets + (partial - chain), partial);
 
@@ -995,18 +993,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * may need to return -EAGAIN upwards in the worst case.  --sct
 	 */
 	if (!err)
-		err = ext4_splice_branch(handle, inode, iblock,
+		err = ext4_splice_branch(handle, inode, map->m_lblk,
 					 partial, indirect_blks, count);
 	if (err)
 		goto cleanup;
 
-	set_buffer_new(bh_result);
+	map->m_flags |= EXT4_MAP_NEW;
 
 	ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
-	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+	map->m_flags |= EXT4_MAP_MAPPED;
+	map->m_pblk = le32_to_cpu(chain[depth-1].key);
+	map->m_len = count;
 	if (count > blocks_to_boundary)
-		set_buffer_boundary(bh_result);
+		map->m_flags |= EXT4_MAP_BOUNDARY;
 	err = count;
 	/* Clean up and exit */
 	partial = chain + depth - 1;	/* the whole chain */
@@ -1016,7 +1016,6 @@ cleanup:
 		brelse(partial->bh);
 		partial--;
 	}
-	BUFFER_TRACE(bh_result, "returned");
 out:
 	return err;
 }
@@ -1203,15 +1202,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 }
 
 /*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
  * and store the allocated blocks in the result buffer head and mark it
  * mapped.
  *
- * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
+ * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
@@ -1224,35 +1223,30 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
  *
  * It returns the error in case of allocation failure.
  */
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
-		    unsigned int max_blocks, struct buffer_head *bh,
-		    int flags)
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
+		    struct ext4_map_blocks *map, int flags)
 {
 	int retval;
 
-	clear_buffer_mapped(bh);
-	clear_buffer_unwritten(bh);
-
-	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
-		  "logical block %lu\n", inode->i_ino, flags, max_blocks,
-		  (unsigned long)block);
+	map->m_flags = 0;
+	ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, flags, map->m_len,
+		  (unsigned long) map->m_lblk);
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, 0);
+		retval = ext4_ext_map_blocks(handle, inode, map, 0);
 	} else {
-		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
-					     bh, 0);
+		retval = ext4_ind_map_blocks(handle, inode, map, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
-	if (retval > 0 && buffer_mapped(bh)) {
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, "file system corruption",
-					       block, bh->b_blocknr, retval);
+					map->m_lblk, map->m_pblk, retval);
 		if (ret != 0)
 			return ret;
 	}
@@ -1268,7 +1262,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * ext4_ext_get_block() returns th create = 0
 	 * with buffer head unmapped.
 	 */
-	if (retval > 0 && buffer_mapped(bh))
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 		return retval;
 
 	/*
@@ -1281,7 +1275,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * of BH_Unwritten and BH_Mapped flags being simultaneously
 	 * set on the buffer_head.
 	 */
-	clear_buffer_unwritten(bh);
+	map->m_flags &= ~EXT4_MAP_UNWRITTEN;
 
 	/*
 	 * New blocks allocate and/or writing to uninitialized extent
@@ -1304,13 +1298,11 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * could have changed the inode type in between
 	 */
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-					      bh, flags);
+		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	} else {
-		retval = ext4_ind_get_blocks(handle, inode, block,
-					     max_blocks, bh, flags);
+		retval = ext4_ind_map_blocks(handle, inode, map, flags);
 
-		if (retval > 0 && buffer_new(bh)) {
+		if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
 			/*
 			 * We allocated new blocks which will result in
 			 * i_data's format changing.  Force the migrate
@@ -1333,16 +1325,38 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
 
 	up_write((&EXT4_I(inode)->i_data_sem));
-	if (retval > 0 && buffer_mapped(bh)) {
+	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		int ret = check_block_validity(inode, "file system "
 					       "corruption after allocation",
-					       block, bh->b_blocknr, retval);
+					       map->m_lblk, map->m_pblk,
+					       retval);
 		if (ret != 0)
 			return ret;
 	}
 	return retval;
 }
 
+int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+		    unsigned int max_blocks, struct buffer_head *bh,
+		    int flags)
+{
+	struct ext4_map_blocks map;
+	int ret;
+
+	map.m_lblk = block;
+	map.m_len = max_blocks;
+
+	ret = ext4_map_blocks(handle, inode, &map, flags);
+	if (ret < 0)
+		return ret;
+
+	bh->b_blocknr = map.m_pblk;
+	bh->b_size = inode->i_sb->s_blocksize * map.m_len;
+	bh->b_bdev = inode->i_sb->s_bdev;
+	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+	return ret;
+}
+
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-- 
cgit v1.1


From 2ed886852adfcb070bf350e66a0da0d98b2f3ab5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 May 2010 20:00:00 -0400
Subject: ext4: Convert callers of ext4_get_blocks() to use ext4_map_blocks()

This saves a huge amount of stack space by avoiding unnecesary struct
buffer_head's from being allocated on the stack.

In addition, to make the code easier to understand, collapse and
refactor ext4_get_block(), ext4_get_block_write(),
noalloc_get_block_write(), into a single function.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c     |  12 +-
 fs/ext4/extents.c |  40 +++----
 fs/ext4/inode.c   | 327 +++++++++++++++++++++---------------------------------
 3 files changed, 150 insertions(+), 229 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d8..e3f2700 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -128,14 +128,14 @@ static int ext4_readdir(struct file *filp,
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
-		ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
-		struct buffer_head map_bh;
+		struct ext4_map_blocks map;
 		struct buffer_head *bh = NULL;
 
-		map_bh.b_state = 0;
-		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
+		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_len = 1;
+		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
-			pgoff_t index = map_bh.b_blocknr >>
+			pgoff_t index = map.m_pblk >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&filp->f_ra, index))
 				page_cache_sync_readahead(
@@ -143,7 +143,7 @@ static int ext4_readdir(struct file *filp,
 					&filp->f_ra, filp,
 					index, 1);
 			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-			bh = ext4_bread(NULL, inode, blk, 0, &err);
+			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
 		}
 
 		/*
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 37f9387..d6801eb7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3667,13 +3667,12 @@ static void ext4_falloc_update_inode(struct inode *inode,
 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 {
 	handle_t *handle;
-	ext4_lblk_t block;
 	loff_t new_size;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
 	int retries = 0;
-	struct buffer_head map_bh;
+	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
 	/*
@@ -3687,13 +3686,13 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	if (S_ISDIR(inode->i_mode))
 		return -ENODEV;
 
-	block = offset >> blkbits;
+	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
 	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-							- block;
+		- map.m_lblk;
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
@@ -3706,16 +3705,14 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	}
 retry:
 	while (ret >= 0 && ret < max_blocks) {
-		block = block + ret;
-		max_blocks = max_blocks - ret;
+		map.m_lblk = map.m_lblk + ret;
+		map.m_len = max_blocks = max_blocks - ret;
 		handle = ext4_journal_start(inode, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
-		map_bh.b_state = 0;
-		ret = ext4_get_blocks(handle, inode, block,
-				      max_blocks, &map_bh,
+		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
@@ -3729,14 +3726,14 @@ retry:
 			ret2 = ext4_journal_stop(handle);
 			break;
 		}
-		if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+		if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
 						blkbits) >> blkbits))
 			new_size = offset + len;
 		else
-			new_size = (block + ret) << blkbits;
+			new_size = (map.m_lblk + ret) << blkbits;
 
 		ext4_falloc_update_inode(inode, mode, new_size,
-						buffer_new(&map_bh));
+					 (map.m_flags & EXT4_MAP_NEW));
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
 		if (ret2)
@@ -3765,42 +3762,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 				    ssize_t len)
 {
 	handle_t *handle;
-	ext4_lblk_t block;
 	unsigned int max_blocks;
 	int ret = 0;
 	int ret2 = 0;
-	struct buffer_head map_bh;
+	struct ext4_map_blocks map;
 	unsigned int credits, blkbits = inode->i_blkbits;
 
-	block = offset >> blkbits;
+	map.m_lblk = offset >> blkbits;
 	/*
 	 * We can't just convert len to max_blocks because
 	 * If blocksize = 4096 offset = 3072 and len = 2048
 	 */
-	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-							- block;
+	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
+		      map.m_lblk);
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
 	credits = ext4_chunk_trans_blocks(inode, max_blocks);
 	while (ret >= 0 && ret < max_blocks) {
-		block = block + ret;
-		max_blocks = max_blocks - ret;
+		map.m_lblk += ret;
+		map.m_len = (max_blocks -= ret);
 		handle = ext4_journal_start(inode, credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
 			break;
 		}
-		map_bh.b_state = 0;
-		ret = ext4_get_blocks(handle, inode, block,
-				      max_blocks, &map_bh,
+		ret = ext4_map_blocks(handle, inode, &map,
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
 			printk(KERN_ERR "%s: ext4_ext_map_blocks "
 				    "returned error inode#%lu, block=%u, "
 				    "max_blocks=%u", __func__,
-				    inode->i_ino, block, max_blocks);
+				    inode->i_ino, map.m_lblk, map.m_len);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ff2f5fd..0b1d7c8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1336,133 +1336,112 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	return retval;
 }
 
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
-		    unsigned int max_blocks, struct buffer_head *bh,
-		    int flags)
-{
-	struct ext4_map_blocks map;
-	int ret;
-
-	map.m_lblk = block;
-	map.m_len = max_blocks;
-
-	ret = ext4_map_blocks(handle, inode, &map, flags);
-	if (ret < 0)
-		return ret;
-
-	bh->b_blocknr = map.m_pblk;
-	bh->b_size = inode->i_sb->s_blocksize * map.m_len;
-	bh->b_bdev = inode->i_sb->s_bdev;
-	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-	return ret;
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-int ext4_get_block(struct inode *inode, sector_t iblock,
-		   struct buffer_head *bh_result, int create)
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
+			   struct buffer_head *bh, int flags)
 {
 	handle_t *handle = ext4_journal_current_handle();
+	struct ext4_map_blocks map;
 	int ret = 0, started = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
 
-	if (create && !handle) {
+	map.m_lblk = iblock;
+	map.m_len = bh->b_size >> inode->i_blkbits;
+
+	if (flags && !handle) {
 		/* Direct IO write... */
-		if (max_blocks > DIO_MAX_BLOCKS)
-			max_blocks = DIO_MAX_BLOCKS;
-		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+		if (map.m_len > DIO_MAX_BLOCKS)
+			map.m_len = DIO_MAX_BLOCKS;
+		dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
 		handle = ext4_journal_start(inode, dio_credits);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			goto out;
+			return ret;
 		}
 		started = 1;
 	}
 
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+	ret = ext4_map_blocks(handle, inode, &map, flags);
 	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
+		map_bh(bh, inode->i_sb, map.m_pblk);
+		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
 		ret = 0;
 	}
 	if (started)
 		ext4_journal_stop(handle);
-out:
 	return ret;
 }
 
+int ext4_get_block(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh, int create)
+{
+	return _ext4_get_block(inode, iblock, bh,
+			       create ? EXT4_GET_BLOCKS_CREATE : 0);
+}
+
 /*
  * `handle' can be NULL if create is zero
  */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 				ext4_lblk_t block, int create, int *errp)
 {
-	struct buffer_head dummy;
+	struct ext4_map_blocks map;
+	struct buffer_head *bh;
 	int fatal = 0, err;
-	int flags = 0;
 
 	J_ASSERT(handle != NULL || create == 0);
 
-	dummy.b_state = 0;
-	dummy.b_blocknr = -1000;
-	buffer_trace_init(&dummy.b_history);
-	if (create)
-		flags |= EXT4_GET_BLOCKS_CREATE;
-	err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
-	/*
-	 * ext4_get_blocks() returns number of blocks mapped. 0 in
-	 * case of a HOLE.
-	 */
-	if (err > 0) {
-		if (err > 1)
-			WARN_ON(1);
-		err = 0;
+	map.m_lblk = block;
+	map.m_len = 1;
+	err = ext4_map_blocks(handle, inode, &map,
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
+
+	if (err < 0)
+		*errp = err;
+	if (err <= 0)
+		return NULL;
+	*errp = 0;
+
+	bh = sb_getblk(inode->i_sb, map.m_pblk);
+	if (!bh) {
+		*errp = -EIO;
+		return NULL;
 	}
-	*errp = err;
-	if (!err && buffer_mapped(&dummy)) {
-		struct buffer_head *bh;
-		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-		if (!bh) {
-			*errp = -EIO;
-			goto err;
-		}
-		if (buffer_new(&dummy)) {
-			J_ASSERT(create != 0);
-			J_ASSERT(handle != NULL);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		J_ASSERT(create != 0);
+		J_ASSERT(handle != NULL);
 
-			/*
-			 * Now that we do not always journal data, we should
-			 * keep in mind whether this should always journal the
-			 * new buffer as metadata.  For now, regular file
-			 * writes use ext4_get_block instead, so it's not a
-			 * problem.
-			 */
-			lock_buffer(bh);
-			BUFFER_TRACE(bh, "call get_create_access");
-			fatal = ext4_journal_get_create_access(handle, bh);
-			if (!fatal && !buffer_uptodate(bh)) {
-				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-				set_buffer_uptodate(bh);
-			}
-			unlock_buffer(bh);
-			BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-			err = ext4_handle_dirty_metadata(handle, inode, bh);
-			if (!fatal)
-				fatal = err;
-		} else {
-			BUFFER_TRACE(bh, "not a new buffer");
-		}
-		if (fatal) {
-			*errp = fatal;
-			brelse(bh);
-			bh = NULL;
+		/*
+		 * Now that we do not always journal data, we should
+		 * keep in mind whether this should always journal the
+		 * new buffer as metadata.  For now, regular file
+		 * writes use ext4_get_block instead, so it's not a
+		 * problem.
+		 */
+		lock_buffer(bh);
+		BUFFER_TRACE(bh, "call get_create_access");
+		fatal = ext4_journal_get_create_access(handle, bh);
+		if (!fatal && !buffer_uptodate(bh)) {
+			memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+			set_buffer_uptodate(bh);
 		}
-		return bh;
+		unlock_buffer(bh);
+		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+		err = ext4_handle_dirty_metadata(handle, inode, bh);
+		if (!fatal)
+			fatal = err;
+	} else {
+		BUFFER_TRACE(bh, "not a new buffer");
 	}
-err:
-	return NULL;
+	if (fatal) {
+		*errp = fatal;
+		brelse(bh);
+		bh = NULL;
+	}
+	return bh;
 }
 
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -2050,28 +2029,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 /*
  * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
  *
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
  * the function goes through all passed space and put actual disk
  * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
  */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
-				 struct buffer_head *exbh)
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
+				 struct ext4_map_blocks *map)
 {
 	struct inode *inode = mpd->inode;
 	struct address_space *mapping = inode->i_mapping;
-	int blocks = exbh->b_size >> inode->i_blkbits;
-	sector_t pblock = exbh->b_blocknr, cur_logical;
+	int blocks = map->m_len;
+	sector_t pblock = map->m_pblk, cur_logical;
 	struct buffer_head *head, *bh;
 	pgoff_t index, end;
 	struct pagevec pvec;
 	int nr_pages, i;
 
-	index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
 	cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
 
 	pagevec_init(&pvec, 0);
@@ -2098,17 +2072,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 
 			/* skip blocks out of the range */
 			do {
-				if (cur_logical >= logical)
+				if (cur_logical >= map->m_lblk)
 					break;
 				cur_logical++;
 			} while ((bh = bh->b_this_page) != head);
 
 			do {
-				if (cur_logical >= logical + blocks)
+				if (cur_logical >= map->m_lblk + blocks)
 					break;
 
-				if (buffer_delay(bh) ||
-						buffer_unwritten(bh)) {
+				if (buffer_delay(bh) || buffer_unwritten(bh)) {
 
 					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
 
@@ -2127,7 +2100,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
-				if (buffer_uninit(exbh))
+				if (map->m_flags & EXT4_MAP_UNINIT)
 					set_buffer_uninit(bh);
 				cur_logical++;
 				pblock++;
@@ -2138,21 +2111,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 }
 
 
-/*
- * __unmap_underlying_blocks - just a helper function to unmap
- * set of blocks described by @bh
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
-					     struct buffer_head *bh)
-{
-	struct block_device *bdev = inode->i_sb->s_bdev;
-	int blocks, i;
-
-	blocks = bh->b_size >> inode->i_blkbits;
-	for (i = 0; i < blocks; i++)
-		unmap_underlying_metadata(bdev, bh->b_blocknr + i);
-}
-
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 					sector_t logical, long blk_cnt)
 {
@@ -2214,7 +2172,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
 	int err, blks, get_blocks_flags;
-	struct buffer_head new;
+	struct ext4_map_blocks map;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
 	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2255,15 +2213,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
 	 * variables are updated after the blocks have been allocated.
 	 */
-	new.b_state = 0;
+	map.m_lblk = next;
+	map.m_len = max_blocks;
 	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
 	if (ext4_should_dioread_nolock(mpd->inode))
 		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
 	if (mpd->b_state & (1 << BH_Delay))
 		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
 
-	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
-			       &new, get_blocks_flags);
+	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
 	if (blks < 0) {
 		err = blks;
 		/*
@@ -2305,10 +2263,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 	BUG_ON(blks == 0);
 
-	new.b_size = (blks << mpd->inode->i_blkbits);
+	if (map.m_flags & EXT4_MAP_NEW) {
+		struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+		int i;
 
-	if (buffer_new(&new))
-		__unmap_underlying_blocks(mpd->inode, &new);
+		for (i = 0; i < map.m_len; i++)
+			unmap_underlying_metadata(bdev, map.m_pblk + i);
+	}
 
 	/*
 	 * If blocks are delayed marked, we need to
@@ -2316,7 +2277,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 */
 	if ((mpd->b_state & (1 << BH_Delay)) ||
 	    (mpd->b_state & (1 << BH_Unwritten)))
-		mpage_put_bnr_to_bhs(mpd, next, &new);
+		mpage_put_bnr_to_bhs(mpd, &map);
 
 	if (ext4_should_order_data(mpd->inode)) {
 		err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2534,8 +2495,9 @@ static int __mpage_da_writepage(struct page *page,
  * initialized properly.
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-				  struct buffer_head *bh_result, int create)
+				  struct buffer_head *bh, int create)
 {
+	struct ext4_map_blocks map;
 	int ret = 0;
 	sector_t invalid_block = ~((sector_t) 0xffff);
 
@@ -2543,16 +2505,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		invalid_block = ~0;
 
 	BUG_ON(create == 0);
-	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+	BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+
+	map.m_lblk = iblock;
+	map.m_len = 1;
 
 	/*
 	 * first, we need to know whether the block is allocated already
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
-	if ((ret == 0) && !buffer_delay(bh_result)) {
-		/* the block isn't (pre)allocated yet, let's reserve space */
+	ret = ext4_map_blocks(NULL, inode, &map, 0);
+	if (ret < 0)
+		return ret;
+	if (ret == 0) {
+		if (buffer_delay(bh))
+			return 0; /* Not sure this could or should happen */
 		/*
 		 * XXX: __block_prepare_write() unmaps passed block,
 		 * is it OK?
@@ -2562,26 +2530,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 			/* not enough space to reserve */
 			return ret;
 
-		map_bh(bh_result, inode->i_sb, invalid_block);
-		set_buffer_new(bh_result);
-		set_buffer_delay(bh_result);
-	} else if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		if (buffer_unwritten(bh_result)) {
-			/* A delayed write to unwritten bh should
-			 * be marked new and mapped.  Mapped ensures
-			 * that we don't do get_block multiple times
-			 * when we write to the same offset and new
-			 * ensures that we do proper zero out for
-			 * partial write.
-			 */
-			set_buffer_new(bh_result);
-			set_buffer_mapped(bh_result);
-		}
-		ret = 0;
+		map_bh(bh, inode->i_sb, invalid_block);
+		set_buffer_new(bh);
+		set_buffer_delay(bh);
+		return 0;
 	}
 
-	return ret;
+	map_bh(bh, inode->i_sb, map.m_pblk);
+	bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+
+	if (buffer_unwritten(bh)) {
+		/* A delayed write to unwritten bh should be marked
+		 * new and mapped.  Mapped ensures that we don't do
+		 * get_block multiple times when we write to the same
+		 * offset and new ensures that we do proper zero out
+		 * for partial write.
+		 */
+		set_buffer_new(bh);
+		set_buffer_mapped(bh);
+	}
+	return 0;
 }
 
 /*
@@ -2603,21 +2571,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
-	int ret = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-
 	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
-
-	/*
-	 * we don't want to do block allocation in writepage
-	 * so call get_block_wrap with create = 0
-	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result, 0);
 }
 
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -3644,46 +3599,18 @@ out:
 	return ret;
 }
 
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uinitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	handle_t *handle = ext4_journal_current_handle();
-	int ret = 0;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	int dio_credits;
-	int started = 0;
-
 	ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	/*
-	 * ext4_get_block in prepare for a DIO write or buffer write.
-	 * We allocate an uinitialized extent if blocks haven't been allocated.
-	 * The extent will be converted to initialized after IO complete.
-	 */
-	create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
-
-	if (!handle) {
-		if (max_blocks > DIO_MAX_BLOCKS)
-			max_blocks = DIO_MAX_BLOCKS;
-		dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-		handle = ext4_journal_start(inode, dio_credits);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			goto out;
-		}
-		started = 1;
-	}
-
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create);
-	if (ret > 0) {
-		bh_result->b_size = (ret << inode->i_blkbits);
-		ret = 0;
-	}
-	if (started)
-		ext4_journal_stop(handle);
-out:
-	return ret;
+	return _ext4_get_block(inode, iblock, bh_result,
+			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
 static void dump_completed_IO(struct inode * inode)
-- 
cgit v1.1


From 24676da469f50f433baa347845639662c561d1f6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 May 2010 21:00:00 -0400
Subject: ext4: Convert calls of ext4_error() to EXT4_ERROR_INODE()

EXT4_ERROR_INODE() tends to provide better error information and in a
more consistent format.  Some errors were not even identifying the inode
or directory which was corrupted, which made them not very useful.

Addresses-Google-Bug: #2507977

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c         | 10 +++----
 fs/ext4/ext4.h        |  4 +--
 fs/ext4/extents.c     | 10 +++----
 fs/ext4/inode.c       | 83 +++++++++++++++++++++++----------------------------
 fs/ext4/move_extent.c |  8 ++---
 fs/ext4/namei.c       | 39 +++++++++++-------------
 fs/ext4/xattr.c       | 35 ++++++++++------------
 7 files changed, 86 insertions(+), 103 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index e3f2700..6a2b8bc 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
 		error_msg = "inode out of bounds";
 
 	if (error_msg != NULL)
-		__ext4_error(dir->i_sb, function,
-			"bad entry in directory #%lu: %s - block=%llu"
+		ext4_error_inode(function, dir,
+			"bad entry in directory: %s - block=%llu"
 			"offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-			dir->i_ino, error_msg, 
-			(unsigned long long) bh->b_blocknr,     
+			error_msg, (unsigned long long) bh->b_blocknr,
 			(unsigned) (offset%bh->b_size), offset,
 			le32_to_cpu(de->inode),
 			rlen, de->name_len);
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				ext4_error(sb, "directory #%lu "
+				EXT4_ERROR_INODE(inode, "directory "
 					   "contains a hole at offset %Lu",
-					   inode->i_ino,
 					   (unsigned long long) filp->f_pos);
 				dir_has_error = 1;
 			}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 57fc0e5..413321f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -54,10 +54,10 @@
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
-	ext4_error_inode(__func__, (inode), (fmt), ## a);
+	ext4_error_inode(__func__, (inode), (fmt), ## a)
 
 #define EXT4_ERROR_FILE(file, fmt, a...)	\
-	ext4_error_file(__func__, (file), (fmt), ## a);
+	ext4_error_file(__func__, (file), (fmt), ## a)
 
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d6801eb7..91c3873 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -439,10 +439,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
 	return 0;
 
 corrupted:
-	__ext4_error(inode->i_sb, function,
-			"bad header/extent in inode #%lu: %s - magic %x, "
+	ext4_error_inode(function, inode,
+			"bad header/extent: %s - magic %x, "
 			"entries %u, max %u(%u), depth %u(%u)",
-			inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+			error_msg, le16_to_cpu(eh->eh_magic),
 			le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
 			max, le16_to_cpu(eh->eh_depth), depth);
 
@@ -1622,9 +1622,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
 		merge_done = 1;
 		WARN_ON(eh->eh_entries == 0);
 		if (!eh->eh_entries)
-			ext4_error(inode->i_sb,
-				   "inode#%lu, eh->eh_entries = 0!",
-				   inode->i_ino);
+			EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
 	}
 
 	return merge_done;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0b1d7c8..3b68772 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
 		if (blk &&
 		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						    blk, 1))) {
-			__ext4_error(inode->i_sb, function,
-				   "invalid block reference %u "
-				   "in inode #%lu", blk, inode->i_ino);
+			ext4_error_inode(function, inode,
+					 "invalid block reference %u", blk);
 			return -EIO;
 		}
 	}
@@ -1129,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
 		ext4_discard_preallocations(inode);
 }
 
-static int check_block_validity(struct inode *inode, const char *msg,
-				sector_t logical, sector_t phys, int len)
+static int check_block_validity(struct inode *inode, const char *func,
+				struct ext4_map_blocks *map)
 {
-	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
-		__ext4_error(inode->i_sb, msg,
-			   "inode #%lu logical block %llu mapped to %llu "
-			   "(size %d)", inode->i_ino,
-			   (unsigned long long) logical,
-			   (unsigned long long) phys, len);
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
+				   map->m_len)) {
+		ext4_error_inode(func, inode,
+			   "lblock %lu mapped to illegal pblock %llu "
+			   "(length %d)", (unsigned long) map->m_lblk,
+				 map->m_pblk, map->m_len);
 		return -EIO;
 	}
 	return 0;
@@ -1245,8 +1244,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, "file system corruption",
-					map->m_lblk, map->m_pblk, retval);
+		int ret = check_block_validity(inode, __func__, map);
 		if (ret != 0)
 			return ret;
 	}
@@ -1326,10 +1324,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-		int ret = check_block_validity(inode, "file system "
-					       "corruption after allocation",
-					       map->m_lblk, map->m_pblk,
-					       retval);
+		int ret = check_block_validity(inode,
+					       "ext4_map_blocks_after_alloc",
+					       map);
 		if (ret != 0)
 			return ret;
 	}
@@ -4327,10 +4324,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
 
 	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
 				   count)) {
-		ext4_error(inode->i_sb, "inode #%lu: "
-			   "attempt to clear blocks %llu len %lu, invalid",
-			   inode->i_ino, (unsigned long long) block_to_free,
-			   count);
+		EXT4_ERROR_INODE(inode, "attempt to clear invalid "
+				 "blocks %llu len %lu",
+				 (unsigned long long) block_to_free, count);
 		return 1;
 	}
 
@@ -4435,11 +4431,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
 		if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
 			ext4_handle_dirty_metadata(handle, inode, this_bh);
 		else
-			ext4_error(inode->i_sb,
-				   "circular indirect block detected, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long) this_bh->b_blocknr);
+			EXT4_ERROR_INODE(inode,
+					 "circular indirect block detected at "
+					 "block %llu",
+				(unsigned long long) this_bh->b_blocknr);
 	}
 }
 
@@ -4477,11 +4472,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 
 			if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
 						   nr, 1)) {
-				ext4_error(inode->i_sb,
-					   "indirect mapped block in inode "
-					   "#%lu invalid (level %d, blk #%lu)",
-					   inode->i_ino, depth,
-					   (unsigned long) nr);
+				EXT4_ERROR_INODE(inode,
+						 "invalid indirect mapped "
+						 "block %lu (level %d)",
+						 (unsigned long) nr, depth);
 				break;
 			}
 
@@ -4493,9 +4487,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
 			 * (should be rare).
 			 */
 			if (!bh) {
-				ext4_error(inode->i_sb,
-					   "Read failure, inode=%lu, block=%llu",
-					   inode->i_ino, nr);
+				EXT4_ERROR_INODE(inode,
+						 "Read failure block=%llu",
+						 (unsigned long long) nr);
 				continue;
 			}
 
@@ -4810,8 +4804,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
 
 	bh = sb_getblk(sb, block);
 	if (!bh) {
-		ext4_error(sb, "unable to read inode block - "
-			   "inode=%lu, block=%llu", inode->i_ino, block);
+		EXT4_ERROR_INODE(inode, "unable to read inode block - "
+				 "block %llu", block);
 		return -EIO;
 	}
 	if (!buffer_uptodate(bh)) {
@@ -4909,8 +4903,8 @@ make_io:
 		submit_bh(READ_META, bh);
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
-			ext4_error(sb, "unable to read inode block - inode=%lu,"
-				   " block=%llu", inode->i_ino, block);
+			EXT4_ERROR_INODE(inode, "unable to read inode "
+					 "block %llu", block);
 			brelse(bh);
 			return -EIO;
 		}
@@ -5121,8 +5115,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	ret = 0;
 	if (ei->i_file_acl &&
 	    !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-		ext4_error(sb, "bad extended attribute block %llu inode #%lu",
-			   ei->i_file_acl, inode->i_ino);
+		EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
+				 ei->i_file_acl);
 		ret = -EIO;
 		goto bad_inode;
 	} else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5167,8 +5161,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			   new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
 	} else {
 		ret = -EIO;
-		ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
-			   inode->i_mode, inode->i_ino);
+		EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
 		goto bad_inode;
 	}
 	brelse(iloc.bh);
@@ -5406,9 +5399,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 		if (wbc->sync_mode == WB_SYNC_ALL)
 			sync_dirty_buffer(iloc.bh);
 		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			ext4_error(inode->i_sb, "IO error syncing inode, "
-				   "inode=%lu, block=%llu", inode->i_ino,
-				   (unsigned long long)iloc.bh->b_blocknr);
+			EXT4_ERROR_INODE(inode,
+				"IO error syncing inode (block=%llu)",
+				(unsigned long long) iloc.bh->b_blocknr);
 			err = -EIO;
 		}
 		brelse(iloc.bh);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 82621a3..b8b8ea1 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -530,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 * new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb,
+		EXT4_ERROR_INODE(orig_inode,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
@@ -693,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	while (1) {
 		/* The extent for donor must be found. */
 		if (!dext) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				   "The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
@@ -1355,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			if (ret1 < 0)
 				break;
 			if (*moved_len > len) {
-				ext4_error(orig_inode->i_sb,
+				EXT4_ERROR_INODE(orig_inode,
 					"We replaced blocks too much! "
 					"sum of replaced: %llu requested: %llu",
 					*moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fa..bff77b0 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -943,8 +943,8 @@ restart:
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
-			ext4_error(sb, "reading directory #%lu offset %lu",
-				   dir->i_ino, (unsigned long)block);
+			EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+					 (unsigned long) block);
 			brelse(bh);
 			goto next;
 		}
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
-			ext4_error(dir->i_sb, "bad inode number: %u", ino);
+			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
 		if (unlikely(IS_ERR(inode))) {
 			if (PTR_ERR(inode) == -ESTALE) {
-				ext4_error(dir->i_sb,
-						"deleted inode referenced: %u",
-						ino);
+				EXT4_ERROR_INODE(dir,
+						 "deleted inode referenced: %u",
+						 ino);
 				return ERR_PTR(-EIO);
 			} else {
 				return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	brelse(bh);
 
 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-		ext4_error(child->d_inode->i_sb,
-			   "bad inode number: %u", ino);
+		EXT4_ERROR_INODE(child->d_inode,
+				 "bad parent inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}
 
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext4_error(dir->i_sb,
-			   "invalid rec_len for '..' in inode %lu",
-			   dir->i_ino);
+		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
 		brelse(bh);
 		return -EIO;
 	}
@@ -1915,9 +1913,8 @@ static int empty_dir(struct inode *inode)
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
-			ext4_error(inode->i_sb,
-				   "error %d reading directory #%lu offset 0",
-				   err, inode->i_ino);
+			EXT4_ERROR_INODE(inode,
+				"error %d reading directory lblock 0", err);
 		else
 			ext4_warning(inode->i_sb,
 				     "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1938,17 @@ static int empty_dir(struct inode *inode)
 	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
-			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			unsigned int lblock;
 			err = 0;
 			brelse(bh);
-			bh = ext4_bread(NULL, inode,
-				offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_bread(NULL, inode, lblock, 0, &err);
 			if (!bh) {
 				if (err)
-					ext4_error(sb,
-						   "error %d reading directory"
-						   " #%lu offset %u",
-						   err, inode->i_ino, offset);
+					EXT4_ERROR_INODE(inode,
+						"error %d reading directory "
+						"lblock %u", err, lblock);
 				offset += sb->s_blocksize;
 				continue;
 			}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8..0a09a2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
 bad_block:
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -880,8 +878,8 @@ cleanup_dquot:
 	goto cleanup;
 
 bad_block:
-	ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-		   inode->i_ino, EXT4_I(inode)->i_file_acl);
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
 	goto cleanup;
 
 #undef header
@@ -1194,8 +1192,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, "inode %lu: block %llu read error",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb,
-				"inode %lu: block %lu read error",
-				inode->i_ino, (unsigned long) ce->e_block);
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT4_XATTR_REFCOUNT_MAX) {
 			ea_idebug(inode, "block %lu refcount %d>=%d",
-- 
cgit v1.1


From 12e9b892002d9af057655d35b44db8ee9243b0dc Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Sun, 16 May 2010 22:00:00 -0400
Subject: ext4: Use bitops to read/modify i_flags in struct ext4_inode_info

At several places we modify EXT4_I(inode)->i_flags without holding
i_mutex (ext4_do_update_inode, ...). These modifications are racy and
we can lose updates to i_flags. So convert handling of i_flags to use
bitops which are atomic.

https://bugzilla.kernel.org/show_bug.cgi?id=15792

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c         |   4 +-
 fs/ext4/ext4.h        | 109 +++++++++++++++++++++++++++++++++++++++++++-------
 fs/ext4/ext4_jbd2.h   |   8 ++--
 fs/ext4/extents.c     |  10 ++---
 fs/ext4/file.c        |   2 +-
 fs/ext4/ialloc.c      |   4 +-
 fs/ext4/inode.c       |  30 +++++++-------
 fs/ext4/mballoc.c     |   4 +-
 fs/ext4/migrate.c     |   2 +-
 fs/ext4/move_extent.c |   4 +-
 fs/ext4/namei.c       |  10 ++---
 fs/ext4/super.c       |   1 +
 fs/ext4/xattr.c       |   4 +-
 13 files changed, 136 insertions(+), 56 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 6a2b8bc..ea5e6cb 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -110,7 +110,7 @@ static int ext4_readdir(struct file *filp,
 
 	if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				    EXT4_FEATURE_COMPAT_DIR_INDEX) &&
-	    ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+	    ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
 	     ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
 		err = ext4_dx_readdir(filp, dirent, filldir);
 		if (err != ERR_BAD_DX_DIR) {
@@ -121,7 +121,7 @@ static int ext4_readdir(struct file *filp,
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
 	}
 	stored = 0;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 413321f..7441488 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -344,6 +344,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
 		return flags & EXT4_OTHER_FLMASK;
 }
 
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+	EXT4_INODE_SECRM	= 0,	/* Secure deletion */
+	EXT4_INODE_UNRM		= 1,	/* Undelete */
+	EXT4_INODE_COMPR	= 2,	/* Compress file */
+	EXT4_INODE_SYNC		= 3,	/* Synchronous updates */
+	EXT4_INODE_IMMUTABLE	= 4,	/* Immutable file */
+	EXT4_INODE_APPEND	= 5,	/* writes to file may only append */
+	EXT4_INODE_NODUMP	= 6,	/* do not dump file */
+	EXT4_INODE_NOATIME	= 7,	/* do not update atime */
+/* Reserved for compression usage... */
+	EXT4_INODE_DIRTY	= 8,
+	EXT4_INODE_COMPRBLK	= 9,	/* One or more compressed clusters */
+	EXT4_INODE_NOCOMPR	= 10,	/* Don't compress */
+	EXT4_INODE_ECOMPR	= 11,	/* Compression error */
+/* End compression flags --- maybe not all used */
+	EXT4_INODE_INDEX	= 12,	/* hash-indexed directory */
+	EXT4_INODE_IMAGIC	= 13,	/* AFS directory */
+	EXT4_INODE_JOURNAL_DATA	= 14,	/* file data should be journaled */
+	EXT4_INODE_NOTAIL	= 15,	/* file tail should not be merged */
+	EXT4_INODE_DIRSYNC	= 16,	/* dirsync behaviour (directories only) */
+	EXT4_INODE_TOPDIR	= 17,	/* Top of directory hierarchies*/
+	EXT4_INODE_HUGE_FILE	= 18,	/* Set to each huge file */
+	EXT4_INODE_EXTENTS	= 19,	/* Inode uses extents */
+	EXT4_INODE_EA_INODE	= 21,	/* Inode used for large EA */
+	EXT4_INODE_EOFBLOCKS	= 22,	/* Blocks allocated beyond EOF */
+	EXT4_INODE_RESERVED	= 31,	/* reserved for ext4 lib */
+};
+
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+	printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+		EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX.  If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+	CHECK_FLAG_VALUE(SECRM);
+	CHECK_FLAG_VALUE(UNRM);
+	CHECK_FLAG_VALUE(COMPR);
+	CHECK_FLAG_VALUE(SYNC);
+	CHECK_FLAG_VALUE(IMMUTABLE);
+	CHECK_FLAG_VALUE(APPEND);
+	CHECK_FLAG_VALUE(NODUMP);
+	CHECK_FLAG_VALUE(NOATIME);
+	CHECK_FLAG_VALUE(DIRTY);
+	CHECK_FLAG_VALUE(COMPRBLK);
+	CHECK_FLAG_VALUE(NOCOMPR);
+	CHECK_FLAG_VALUE(ECOMPR);
+	CHECK_FLAG_VALUE(INDEX);
+	CHECK_FLAG_VALUE(IMAGIC);
+	CHECK_FLAG_VALUE(JOURNAL_DATA);
+	CHECK_FLAG_VALUE(NOTAIL);
+	CHECK_FLAG_VALUE(DIRSYNC);
+	CHECK_FLAG_VALUE(TOPDIR);
+	CHECK_FLAG_VALUE(HUGE_FILE);
+	CHECK_FLAG_VALUE(EXTENTS);
+	CHECK_FLAG_VALUE(EA_INODE);
+	CHECK_FLAG_VALUE(EOFBLOCKS);
+	CHECK_FLAG_VALUE(RESERVED);
+}
+
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
 	__u32 group;		/* Group number for this data */
@@ -639,9 +716,8 @@ struct ext4_ext_cache {
  */
 struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
-	__u32	i_flags;
-	ext4_fsblk_t	i_file_acl;
 	__u32	i_dtime;
+	ext4_fsblk_t	i_file_acl;
 
 	/*
 	 * i_block_group is the number of the block group which contains
@@ -652,6 +728,7 @@ struct ext4_inode_info {
 	 */
 	ext4_group_t	i_block_group;
 	unsigned long	i_state_flags;		/* Dynamic state flags */
+	unsigned long	i_flags;
 
 	ext4_lblk_t		i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -1087,20 +1164,22 @@ enum {
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 };
 
-static inline int ext4_test_inode_state(struct inode *inode, int bit)
-{
-	return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+#define EXT4_INODE_BIT_FNS(name, field)					\
+static inline int ext4_test_inode_##name(struct inode *inode, int bit)	\
+{									\
+	return test_bit(bit, &EXT4_I(inode)->i_##field);		\
+}									\
+static inline void ext4_set_inode_##name(struct inode *inode, int bit)	\
+{									\
+	set_bit(bit, &EXT4_I(inode)->i_##field);			\
+}									\
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{									\
+	clear_bit(bit, &EXT4_I(inode)->i_##field);			\
 }
 
-static inline void ext4_set_inode_state(struct inode *inode, int bit)
-{
-	set_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
-
-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
-{
-	clear_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
+EXT4_INODE_BIT_FNS(flag, flags)
+EXT4_INODE_BIT_FNS(state, state_flags)
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1287,7 +1366,7 @@ struct ext4_dir_entry_2 {
 
 #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
 				      EXT4_FEATURE_COMPAT_DIR_INDEX) && \
-		      (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+		    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
 #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
 #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad51..dade0c0 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
 		return 1;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
 		return 1;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 1;
 	return 0;
 }
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
 		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
 		return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
 		return 0;
 	if (EXT4_JOURNAL(inode) == NULL)
 		return 1;
-	if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
 		return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
 		return 0;
 	if (!S_ISREG(inode->i_mode))
 		return 0;
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return 0;
 	if (ext4_should_journal_data(inode))
 		return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 91c3873..42d8ce9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3498,7 +3498,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			map->m_flags |= EXT4_MAP_UNINIT;
 	}
 
-	if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
 		if (unlikely(!eh->eh_entries)) {
 			EXT4_ERROR_INODE(inode,
 					 "eh->eh_entries == 0 ee_block %d",
@@ -3509,7 +3509,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		last_ex = EXT_LAST_EXTENT(eh);
 		if (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block)
 		    + ext4_ext_get_actual_len(last_ex))
-			EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
@@ -3650,7 +3650,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
 		 * can proceed even if the new size is the same as i_size.
 		 */
 		if (new_size > i_size_read(inode))
-			EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+			ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 
 }
@@ -3677,7 +3677,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 	 * currently supporting (pre)allocate mode for extent-based
 	 * files _only_
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EOPNOTSUPP;
 
 	/* preallocation to directories is currently not supported */
@@ -3922,7 +3922,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	int error = 0;
 
 	/* fallback to generic here if not in extents fmt */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return generic_block_fiemap(inode, fieinfo, start, len,
 			ext4_get_block);
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e4..5313ae4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 	 * is smaller than s_maxbytes, which is for extent-mapped files.
 	 */
 
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 		struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 		size_t length = iov_length(iov, nr_segs);
 
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 52618d5..7f6b582 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -492,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 
 	if (S_ISDIR(mode) &&
 	    ((parent == sb->s_root->d_inode) ||
-	     (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+	     (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
 		int best_ndir = inodes_per_group;
 		int ret = -1;
 
@@ -1038,7 +1038,7 @@ got:
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 		/* set extent flag only for directory, file and normal symlink*/
 		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
-			EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
 			ext4_ext_tree_init(handle, inode);
 		}
 	}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3b68772..c2b0724 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -931,7 +931,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
 
-	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+	J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, map->m_lblk, offsets,
 				   &blocks_to_boundary);
@@ -1059,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
  */
 static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_calc_metadata_amount(inode, lblock);
 
 	return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1236,7 +1236,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * file system block.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, 0);
 	} else {
 		retval = ext4_ind_map_blocks(handle, inode, map, 0);
@@ -1295,7 +1295,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * We need to check for EXT4 here because migrate
 	 * could have changed the inode type in between
 	 */
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags);
 	} else {
 		retval = ext4_ind_map_blocks(handle, inode, map, flags);
@@ -2325,7 +2325,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 		goto flush_it;
 
 	/* check if thereserved journal credits might overflow */
-	if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
 		if (nrblocks >= EXT4_MAX_TRANS_DATA) {
 			/*
 			 * With non-extent format we are limited by the journal
@@ -2779,7 +2779,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
 	 * number of contiguous block. So we will limit
 	 * number of contiguous block to a sane value
 	 */
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
 	    (max_blocks > EXT4_MAX_TRANS_DATA))
 		max_blocks = EXT4_MAX_TRANS_DATA;
 
@@ -3995,7 +3995,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
 
 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4631,12 +4631,12 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
-	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ext4_ext_truncate(inode);
 		return;
 	}
@@ -5473,7 +5473,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 			if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5486,7 +5486,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (S_ISREG(inode->i_mode) &&
 	    attr->ia_valid & ATTR_SIZE &&
 	    (attr->ia_size < inode->i_size ||
-	     (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+	     (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
 		handle_t *handle;
 
 		handle = ext4_journal_start(inode, 3);
@@ -5518,7 +5518,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			}
 		}
 		/* ext4_truncate will clear the flag */
-		if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+		if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
 			ext4_truncate(inode);
 	}
 
@@ -5594,7 +5594,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
 	return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
@@ -5929,9 +5929,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 */
 
 	if (val)
-		EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	else
-		EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e5dcdc9..0bdc018 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2006,7 +2006,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
 	/* non-extent files are limited to low blocks/groups */
-	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
 		ngroups = sbi->s_blockfile_groups;
 
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -3171,7 +3171,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			continue;
 
 		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
 			pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
 			continue;
 
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc5..6f3a27e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 */
 	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
 				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;
 
 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b8b8ea1..3a6c92a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -977,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	/* Ext4 move extent supports only extent based file */
-	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: orig file is not extents "
 			"based file [ino:orig %lu]\n", orig_inode->i_ino);
 		return -EOPNOTSUPP;
-	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: donor file is not extents "
 			"based file [ino:donor %lu]\n", donor_inode->i_ino);
 		return -EOPNOTSUPP;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index bff77b0..7b4bf8f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -656,7 +656,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
 		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
-	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
 			hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				     EXT4_FEATURE_COMPAT_DIR_INDEX))
-		EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }
 
 /*
@@ -1416,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		brelse(bh);
 		return retval;
 	}
-	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
 	data1 = bh2->b_data;
 
 	memcpy (data1, de, len);
@@ -1489,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		retval = ext4_dx_add_entry(handle, dentry, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			return retval;
-		EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);
 	}
@@ -2294,7 +2294,7 @@ retry:
 		}
 	} else {
 		/* clear the extent format for fast symlink */
-		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2e8790e..9766b2a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4148,6 +4148,7 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	ext4_check_flag_values();
 	err = init_ext4_system_zone();
 	if (err)
 		return err;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 0a09a2e..47431bc 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -818,7 +818,7 @@ inserted:
 						EXT4_I(inode)->i_block_group);
 
 			/* non-extent files can't have physical blocks past 2^32 */
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
 
 			block = ext4_new_meta_blocks(handle, inode,
@@ -826,7 +826,7 @@ inserted:
 			if (error)
 				goto cleanup;
 
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
 
 			ea_idebug(inode, "creating block %d", block);
-- 
cgit v1.1


From f70f362b4a6fe47c239dbfb3efc0cc2c10e4f09c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 16 May 2010 23:00:00 -0400
Subject: ext4: Avoid crashing on NULL ptr dereference on a filesystem error

If the EOFBLOCK_FL flag is set when it should not be and the inode is
zero length, then eh_entries is zero, and ex is NULL, so dereferencing
ex to print ex->ee_block causes a kernel OOPS in
ext4_ext_map_blocks().

On top of that, the error message which is printed isn't very helpful.
So we fix this by printing something more explanatory which doesn't
involve trying to print ex->ee_block.

Addresses-Google-Bug: #2655740

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 42d8ce9..e40d2b7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3370,8 +3370,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
 		EXT4_ERROR_INODE(inode, "bad extent address "
-				 "iblock: %d, depth: %d pblock %lld",
-				 map->m_lblk, depth, path[depth].p_block);
+				 "lblock: %lu, depth: %d pblock %lld",
+				 (unsigned long) map->m_lblk, depth,
+				 path[depth].p_block);
 		err = -EIO;
 		goto out2;
 	}
@@ -3501,8 +3502,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
 		if (unlikely(!eh->eh_entries)) {
 			EXT4_ERROR_INODE(inode,
-					 "eh->eh_entries == 0 ee_block %d",
-					 ex->ee_block);
+					 "eh->eh_entries == 0 and "
+					 "EOFBLOCKS_FL set");
 			err = -EIO;
 			goto out2;
 		}
-- 
cgit v1.1


From 786ec7915e530936b9eb2e3d12274145cab7aa7d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 17 May 2010 00:00:00 -0400
Subject: ext4: Clear the EXT4_EOFBLOCKS_FL flag only when warranted

Dimitry Monakhov discovered an edge case where it was possible for the
EXT4_EOFBLOCKS_FL flag could get cleared unnecessarily.  This is true;
I have a test case that can be exercised via downloading and
decompressing the file:

wget ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/ext4-testcases/eofblocks-fl-test-case.img.bz2
bunzip2 eofblocks-fl-test-case.img
dd if=/dev/zero of=eofblocks-fl-test-case.img bs=1k seek=17925 bs=1k count=1 conv=notrunc

However, triggering it in real life is highly unlikely since it
requires an extremely fragmented sparse file with a hole in exactly
the right place in the extent tree.  (It actually took quite a bit of
work to generate this test case.)  Still, it's nice to get even
extreme corner cases to be correct, so this patch makes sure that we
don't clear the EXT4_EOFBLOCKS_FL incorrectly even in this corner
case.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e40d2b7..ffcaa11 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3319,7 +3319,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	struct ext4_extent_header *eh;
 	struct ext4_extent newex, *ex, *last_ex;
 	ext4_fsblk_t newblock;
-	int err = 0, depth, ret, cache_type;
+	int i, err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
@@ -3508,8 +3508,20 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			goto out2;
 		}
 		last_ex = EXT_LAST_EXTENT(eh);
-		if (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block)
-		    + ext4_ext_get_actual_len(last_ex))
+		/*
+		 * If the current leaf block was reached by looking at
+		 * the last index block all the way down the tree, and
+		 * we are extending the inode beyond the last extent
+		 * in the current leaf block, then clear the
+		 * EOFBLOCKS_FL flag.
+		 */
+		for (i = depth-1; i >= 0; i--) {
+			if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+				break;
+		}
+		if ((i < 0) &&
+		    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
+		     ext4_ext_get_actual_len(last_ex)))
 			ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-- 
cgit v1.1


From 0617b83fa239db9743a18ce6cc0e556f4d0fd567 Mon Sep 17 00:00:00 2001
From: Dmitry Monakhov <dmonakhov@openvz.org>
Date: Mon, 17 May 2010 01:00:00 -0400
Subject: ext4: restart ext4_ext_remove_space() after transaction restart

If i_data_sem was internally dropped due to transaction restart, it is
necessary to restart path look-up because extents tree was possibly
modified by ext4_get_block().

https://bugzilla.kernel.org/show_bug.cgi?id=15827

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/extents.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ffcaa11..0a47bec 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
 	if (err <= 0)
 		return err;
 	err = ext4_truncate_restart_trans(handle, inode, needed);
-	/*
-	 * We have dropped i_data_sem so someone might have cached again
-	 * an extent we are going to truncate.
-	 */
-	ext4_ext_invalidate_cache(inode);
+	if (err == 0)
+		err = -EAGAIN;
 
 	return err;
 }
@@ -2359,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	int depth = ext_depth(inode);
 	struct ext4_ext_path *path;
 	handle_t *handle;
-	int i = 0, err = 0;
+	int i, err;
 
 	ext_debug("truncate since %u\n", start);
 
@@ -2368,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
+again:
 	ext4_ext_invalidate_cache(inode);
 
 	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
+	depth = ext_depth(inode);
 	path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
 	if (path == NULL) {
 		ext4_journal_stop(handle);
 		return -ENOMEM;
 	}
+	path[0].p_depth = depth;
 	path[0].p_hdr = ext_inode_hdr(inode);
 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
 	}
-	path[0].p_depth = depth;
+	i = err = 0;
 
 	while (i >= 0 && err == 0) {
 		if (i == depth) {
@@ -2478,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 out:
 	ext4_ext_drop_refs(path);
 	kfree(path);
+	if (err == -EAGAIN)
+		goto again;
 	ext4_journal_stop(handle);
 
 	return err;
-- 
cgit v1.1


From 5a58ec8766e0ce98fd585eb404b3e56935afafe6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 May 2010 02:00:00 -0400
Subject: ext4: Add a missing trace hook

Commit f8ec9d6837241865cf99bed97bb99f4399fd5a03 added a
trace event ext4_da_release_space, but didn't add some
corresponding trace hook.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2b0724..8e45331 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1893,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
 
+	trace_ext4_da_release_space(inode, to_free);
 	if (unlikely(to_free > ei->i_reserved_data_blocks)) {
 		/*
 		 * if there aren't enough reserved blocks, then the
-- 
cgit v1.1


From f307333e14f6b18045eb4198fe646d9b6028f3ed Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 17 May 2010 03:00:00 -0400
Subject: ext4: Add new tracepoints to track mballoc's buddy bitmap loads

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c           |  2 ++
 include/trace/events/ext4.h | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0bdc018..a93f554 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -890,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_buddy_bitmap_load(sb, group);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
@@ -907,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore != NULL);
 			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_bitmap_load(sb, group);
 
 			/* see comments in ext4_mb_put_pa() */
 			ext4_lock_group(sb, group);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 2aa6aa3..99fbf1d 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -974,6 +974,39 @@ TRACE_EVENT(ext4_da_release_space,
 		  __entry->reserved_meta_blocks, __entry->allocated_meta_blocks)
 );
 
+DECLARE_EVENT_CLASS(ext4__bitmap_load,
+	TP_PROTO(struct super_block *sb, unsigned long group),
+
+	TP_ARGS(sb, group),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	__u32,	group			)
+
+	),
+
+	TP_fast_assign(
+		__entry->dev	= sb->s_dev;
+		__entry->group	= group;
+	),
+
+	TP_printk("dev %s group %u",
+		  jbd2_dev_to_name(__entry->dev), __entry->group)
+);
+
+DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
+
+	TP_PROTO(struct super_block *sb, unsigned long group),
+
+	TP_ARGS(sb, group)
+);
+
+DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,
+
+	TP_PROTO(struct super_block *sb, unsigned long group),
+
+	TP_ARGS(sb, group)
+);
 
 #endif /* _TRACE_EXT4_H */
 
-- 
cgit v1.1


From f084db932e6fe877bf8362bc256fc850de196deb Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 17 May 2010 04:00:00 -0400
Subject: tracing: Convert more ext4 events to DEFINE_EVENT

Use DECLARE_EVENT_CLASS, and save ~2.7K:

   text    data     bss     dec     hex filename
 274441    7200     260  281901   44d2d fs/ext4/ext4.o.orig
 271881    7040     256  279177   44289 fs/ext4/ext4.o

4 events are converted:

  ext4__mb_new_pa: ext4_mb_new_inode_pa, ext4_mb_new_group_pa
  ext4__mballoc:   ext4_mballoc_discard, ext4_mballoc_free

No change in functionality.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 include/trace/events/ext4.h | 61 +++++++++++++--------------------------------
 1 file changed, 17 insertions(+), 44 deletions(-)

diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 99fbf1d..5d60ad4 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -353,7 +353,7 @@ TRACE_EVENT(ext4_discard_blocks,
 		  jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count)
 );
 
-TRACE_EVENT(ext4_mb_new_inode_pa,
+DECLARE_EVENT_CLASS(ext4__mb_new_pa,
 	TP_PROTO(struct ext4_allocation_context *ac,
 		 struct ext4_prealloc_space *pa),
 
@@ -381,32 +381,20 @@ TRACE_EVENT(ext4_mb_new_inode_pa,
 		  __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
 );
 
-TRACE_EVENT(ext4_mb_new_group_pa,
+DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
+
 	TP_PROTO(struct ext4_allocation_context *ac,
 		 struct ext4_prealloc_space *pa),
 
-	TP_ARGS(ac, pa),
-
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	__u64,	pa_pstart		)
-		__field(	__u32,	pa_len			)
-		__field(	__u64,	pa_lstart		)
+	TP_ARGS(ac, pa)
+);
 
-	),
+DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
 
-	TP_fast_assign(
-		__entry->dev		= ac->ac_sb->s_dev;
-		__entry->ino		= ac->ac_inode->i_ino;
-		__entry->pa_pstart	= pa->pa_pstart;
-		__entry->pa_len		= pa->pa_len;
-		__entry->pa_lstart	= pa->pa_lstart;
-	),
+	TP_PROTO(struct ext4_allocation_context *ac,
+		 struct ext4_prealloc_space *pa),
 
-	TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu",
-		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-		  __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
+	TP_ARGS(ac, pa)
 );
 
 TRACE_EVENT(ext4_mb_release_inode_pa,
@@ -790,7 +778,7 @@ TRACE_EVENT(ext4_mballoc_prealloc,
 		  __entry->result_len, __entry->result_logical)
 );
 
-TRACE_EVENT(ext4_mballoc_discard,
+DECLARE_EVENT_CLASS(ext4__mballoc,
 	TP_PROTO(struct ext4_allocation_context *ac),
 
 	TP_ARGS(ac),
@@ -819,33 +807,18 @@ TRACE_EVENT(ext4_mballoc_discard,
 		  __entry->result_len, __entry->result_logical)
 );
 
-TRACE_EVENT(ext4_mballoc_free,
+DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
+
 	TP_PROTO(struct ext4_allocation_context *ac),
 
-	TP_ARGS(ac),
+	TP_ARGS(ac)
+);
 
-	TP_STRUCT__entry(
-		__field(	dev_t,	dev			)
-		__field(	ino_t,	ino			)
-		__field(	__u32, 	result_logical		)
-		__field(	  int,	result_start		)
-		__field(	__u32, 	result_group		)
-		__field(	  int,	result_len		)
-	),
+DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
 
-	TP_fast_assign(
-		__entry->dev		= ac->ac_inode->i_sb->s_dev;
-		__entry->ino		= ac->ac_inode->i_ino;
-		__entry->result_logical	= ac->ac_b_ex.fe_logical;
-		__entry->result_start	= ac->ac_b_ex.fe_start;
-		__entry->result_group	= ac->ac_b_ex.fe_group;
-		__entry->result_len	= ac->ac_b_ex.fe_len;
-	),
+	TP_PROTO(struct ext4_allocation_context *ac),
 
-	TP_printk("dev %s inode %lu extent %u/%d/%u@%u ",
-		  jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
-		  __entry->result_group, __entry->result_start,
-		  __entry->result_len, __entry->result_logical)
+	TP_ARGS(ac)
 );
 
 TRACE_EVENT(ext4_forget,
-- 
cgit v1.1


From 899ad0cea6ad7ff4ba24b16318edbc3cbbe03fad Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 17 May 2010 05:00:00 -0400
Subject: ext4: Conditionally define compat ioctl numbers

It is unnecessary, and in general impossible, to define the compat
ioctl numbers except when building the filesystem with CONFIG_COMPAT
defined.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7441488..d8c28f6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -498,6 +498,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
 
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
  */
@@ -513,6 +514,7 @@ struct ext4_new_group_data {
 #endif
 #define EXT4_IOC32_GETVERSION_OLD	FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD	FS_IOC32_SETVERSION
+#endif
 
 
 /*
-- 
cgit v1.1


From 4d92dc0f00a775dc2e1267b0e00befb783902fe7 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Mon, 17 May 2010 06:00:00 -0400
Subject: ext4: Fix compat EXT4_IOC_ADD_GROUP

struct ext4_new_group_input needs to be converted because u64 has
only 32-bit alignment on some 32-bit architectures, notably i386.

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 16 ++++++++++++++++
 fs/ext4/ioctl.c | 25 +++++++++++++++++++++++--
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d8c28f6..2c1165f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
 
 /*
  * The fourth extended filesystem constants/structures
@@ -432,6 +435,18 @@ struct ext4_new_group_input {
 	__u16 unused;
 };
 
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+	u32 group;
+	compat_u64 block_bitmap;
+	compat_u64 inode_bitmap;
+	compat_u64 inode_table;
+	u32 blocks_count;
+	u16 reserved_blocks;
+	u16 unused;
+};
+#endif
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -509,6 +524,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETRSVSZ		_IOR('f', 5, int)
 #define EXT4_IOC32_SETRSVSZ		_IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND		_IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD		_IOW('f', 8, struct compat_ext4_new_group_input)
 #ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC32_WAIT_FOR_READONLY	_IOR('f', 99, int)
 #endif
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 66fa0b0..6ddec84 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -373,8 +373,29 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC32_SETRSVSZ:
 		cmd = EXT4_IOC_SETRSVSZ;
 		break;
-	case EXT4_IOC_GROUP_ADD:
-		break;
+	case EXT4_IOC32_GROUP_ADD: {
+		struct compat_ext4_new_group_input __user *uinput;
+		struct ext4_new_group_input input;
+		mm_segment_t old_fs;
+		int err;
+
+		uinput = compat_ptr(arg);
+		err = get_user(input.group, &uinput->group);
+		err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+		err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+		err |= get_user(input.inode_table, &uinput->inode_table);
+		err |= get_user(input.blocks_count, &uinput->blocks_count);
+		err |= get_user(input.reserved_blocks,
+				&uinput->reserved_blocks);
+		if (err)
+			return -EFAULT;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+				 (unsigned long) &input);
+		set_fs(old_fs);
+		return err;
+	}
 	case EXT4_IOC_MOVE_EXT:
 		break;
 	default:
-- 
cgit v1.1


From 60e6679e28518ccd67169c4a539d8cc7490eb8a6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 17 May 2010 07:00:00 -0400
Subject: ext4: Drop whitespace at end of lines

This patch was generated using:

#!/usr/bin/perl -i
while (<>) {
    s/[ 	]+$//;
    print;
}

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/block_validity.c |  4 ++--
 fs/ext4/ext4.h           |  4 ++--
 fs/ext4/extents.c        |  8 ++++----
 fs/ext4/fsync.c          |  2 +-
 fs/ext4/inode.c          |  4 ++--
 fs/ext4/ioctl.c          |  2 +-
 fs/ext4/mballoc.c        |  8 ++++----
 fs/ext4/namei.c          | 10 +++++-----
 fs/ext4/super.c          |  4 ++--
 9 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c486..5b6973f 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
 		else if (start_blk >= (entry->start_blk + entry->count))
 			n = &(*n)->rb_right;
 		else {
-			if (start_blk + count > (entry->start_blk + 
+			if (start_blk + count > (entry->start_blk +
 						 entry->count))
-				entry->count = (start_blk + count - 
+				entry->count = (start_blk + count -
 						entry->start_blk);
 			new_node = *n;
 			new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2c1165f..124ef77 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -75,7 +75,7 @@ typedef __u32 ext4_lblk_t;
 typedef unsigned int ext4_group_t;
 
 /*
- * Flags used in mballoc's allocation_context flags field.  
+ * Flags used in mballoc's allocation_context flags field.
  *
  * Also used to show what's going on for debugging purposes when the
  * flag field is exported via the traceport interface
@@ -470,7 +470,7 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
 	/* Caller is from the delayed allocation writeout path,
-	   so set the magic i_delalloc_reserve_flag after taking the 
+	   so set the magic i_delalloc_reserve_flag after taking the
 	   inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* caller is from the direct IO path, request to creation of an
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0a47bec..377309c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -182,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
 		/*
 		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
-		 * block groups per flexgroup, reserve the first block 
-		 * group for directories and special files.  Regular 
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
 		 * files will start at the second block group.  This
-		 * tends to speed up directory access and improves 
+		 * tends to speed up directory access and improves
 		 * fsck times.
 		 */
 		block_group &= ~(flex_size-1);
@@ -2034,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 	struct ext4_ext_cache *cex;
 	int ret = EXT4_EXT_CACHE_NO;
 
-	/* 
+	/*
 	 * We borrow i_block_reservation_lock to protect i_cached_extent
 	 */
 	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 42bd94a..6ca7b6e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -66,7 +66,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	ret = flush_completed_IO(inode);
 	if (ret < 0)
 		return ret;
-	
+
 	if (!journal)
 		return simple_fsync(file, dentry, datasync);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8e45331..502b07d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -784,7 +784,7 @@ failed:
 	/* Allocation failed, free what we already allocated */
 	ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
 	for (i = 1; i <= n ; i++) {
-		/* 
+		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -874,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 
 err_out:
 	for (i = 1; i <= num; i++) {
-		/* 
+		/*
 		 * branch[i].bh is newly allocated, so there is no
 		 * need to revoke the block, which is why we don't
 		 * need to set EXT4_FREE_BLOCKS_METADATA.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6ddec84..bf5ae88 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
 		if (me.moved_len > 0)
 			file_remove_suid(donor_filp);
 
-		if (copy_to_user((struct move_extent __user *)arg, 
+		if (copy_to_user((struct move_extent __user *)arg,
 				 &me, sizeof(me)))
 			err = -EFAULT;
 mext_out:
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a93f554..12b3bc0 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2691,7 +2691,7 @@ int __init init_ext4_mballoc(void)
 
 void exit_ext4_mballoc(void)
 {
-	/* 
+	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
 	 */
@@ -3330,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	spin_unlock(&pa->pa_lock);
 
 	grp_blk = pa->pa_pstart;
-	/* 
+	/*
 	 * If doing group-based preallocation, pa_pstart may be in the
 	 * next group when pa is used up
 	 */
@@ -4534,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
-			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
 	}
 
-	/* 
+	/*
 	 * We need to make sure we don't reuse the freed block until
 	 * after the transaction is committed, which we can do by
 	 * treating the block as metadata, below.  We make an
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 7b4bf8f..83be743 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
 		return blocksize;
 	return (len & 65532) | ((len & 3) << 16);
 }
-  
+
 __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
 	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 	if (len == blocksize) {
 		if (blocksize == 65536)
 			return cpu_to_le16(EXT4_MAX_REC_LEN);
-		else 
+		else
 			return cpu_to_le16(0);
 	}
 	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		brelse(bh);
 	}
 	if (bcount)
-		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
 		       levels ? "" : "   ", names, space/bcount,
 		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
@@ -653,7 +653,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;
 
-	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
 		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
 	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
 						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9766b2a..49d88c0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2216,7 +2216,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 struct ext4_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
-	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
 	int offset;
 };
@@ -3383,7 +3383,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
-		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
-- 
cgit v1.1


From 14ece1028b3ed53ffec1b1213ffc6acaf79ad77c Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Mon, 17 May 2010 08:00:00 -0400
Subject: ext4: Make fsync sync new parent directories in no-journal mode

Add a new ext4 state to tell us when a file has been newly created; use
that state in ext4_sync_file in no-journal mode to tell us when we need
to sync the parent directory as well as the inode and data itself.  This
fixes a problem in which a panic or power failure may lose the entire
file even when using fsync, since the parent directory entry is lost.

Addresses-Google-Bug: #2480057

Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |  1 +
 fs/ext4/fsync.c | 31 +++++++++++++++++++++++++++++--
 fs/ext4/namei.c |  2 ++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 124ef77..60bd310 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1180,6 +1180,7 @@ enum {
 	EXT4_STATE_DA_ALLOC_CLOSE,	/* Alloc DA blks on close */
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
+	EXT4_STATE_NEWENTRY,		/* File just added to dir */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field)					\
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 6ca7b6e..e187f87 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
 #include <trace/events/ext4.h>
 
 /*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file.  This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+	struct dentry *dentry = NULL;
+
+	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+		dentry = list_entry(inode->i_dentry.next,
+				    struct dentry, d_alias);
+		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+			break;
+		inode = dentry->d_parent->d_inode;
+		sync_mapping_buffers(inode->i_mapping);
+	}
+}
+
+/*
  * akpm: A new design for ext4_sync_file().
  *
  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -67,8 +90,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	if (ret < 0)
 		return ret;
 
-	if (!journal)
-		return simple_fsync(file, dentry, datasync);
+	if (!journal) {
+		ret = simple_fsync(file, dentry, datasync);
+		if (!ret && !list_empty(&inode->i_dentry))
+			ext4_sync_parent(inode);
+		return ret;
+	}
 
 	/*
 	 * data=writeback,ordered:
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83be743..a43e661 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1517,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
+	if (retval == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
 	return retval;
 }
 
-- 
cgit v1.1