From 67638e4043083cdc6f10386a75fef87ba46eecb3 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sat, 1 Aug 2009 21:30:31 +0900
Subject: fat/nls: Fix handling of utf8 invalid char

With utf8 option, vfat allowed the duplicated filenames.

Normal nls returns -EINVAL for invalid char. But utf8s_to_utf16s()
skipped the invalid char historically.

So, this changes the utf8s_to_utf16s() directly to return -EINVAL for
invalid char, because vfat is only user of it.

mkdir /mnt/fatfs
FILENAME=`echo -ne "invalidutf8char_\\0341_endofchar"`
echo "Using filename: $FILENAME"
dd if=/dev/zero of=fatfs bs=512 count=128
mkdosfs -F 32 fatfs
mount -o loop,utf8 fatfs /mnt/fatfs
touch "/mnt/fatfs/$FILENAME"
umount /mnt/fatfs
mount -o loop,utf8 fatfs /mnt/fatfs
touch "/mnt/fatfs/$FILENAME"
ls -l /mnt/fatfs
umount /mnt/fatfs

----  And the output is:

Using filename: invalidutf8char_\0341_endofchar
128+0 records in
128+0 records out
65536 bytes (66 kB) copied, 0.000388118 s, 169 MB/s
mkdosfs 2.11 (12 Mar 2005)
total 0
-rwxr-xr-x 1 root root 0 Jun 28 19:46 invalidutf8char__endofchar
-rwxr-xr-x 1 root root 0 Jun 28 19:46 invalidutf8char__endofchar

Tested-by: Marton Balint <cus@fazekas.hu>
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/namei_vfat.c | 15 ++++-----------
 fs/nls/nls_base.c   |  8 ++++----
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index cb6e835..f565f24 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -499,17 +499,10 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
 	int charlen;
 
 	if (utf8) {
-		int name_len = strlen(name);
-
-		*outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
-
-		/*
-		 * We stripped '.'s before and set len appropriately,
-		 * but utf8s_to_utf16s doesn't care about len
-		 */
-		*outlen -= (name_len - len);
-
-		if (*outlen > 255)
+		*outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname);
+		if (*outlen < 0)
+			return *outlen;
+		else if (*outlen > 255)
 			return -ENAMETOOLONG;
 
 		op = &outname[*outlen * sizeof(wchar_t)];
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 477d37d..b25c218 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -124,10 +124,10 @@ int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
 	while (*s && len > 0) {
 		if (*s & 0x80) {
 			size = utf8_to_utf32(s, len, &u);
-			if (size < 0) {
-				/* Ignore character and move on */
-				size = 1;
-			} else if (u >= PLANE_SIZE) {
+			if (size < 0)
+				return -EINVAL;
+
+			if (u >= PLANE_SIZE) {
 				u -= PLANE_SIZE;
 				*op++ = (wchar_t) (SURROGATE_PAIR |
 						((u >> 10) & SURROGATE_BITS));
-- 
cgit v1.1


From 955234755ce4a2c33cfc558912aa8f2148cc1fc6 Mon Sep 17 00:00:00 2001
From: Paul Wise <pabs3@bonedaddy.net>
Date: Sat, 1 Aug 2009 21:30:31 +0900
Subject: vfat: change the default from shortname=lower to shortname=mixed

Because, with "shortname=lower", copying one FAT filesystem tree to
another FAT filesystem tree using Linux results in semantically
different filesystems. (E.g.: Filenames which were once "all
uppercase" are now "all lowercase").

So, this changes the default of "shortname=lower" to "shortname=mixed".

Signed-off-by: Paul Wise <pabs3@bonedaddy.net>
[change fat_show_options()]
Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 8970d8c..63a5c1a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -820,7 +820,7 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
 			seq_puts(m, ",shortname=mixed");
 			break;
 		case VFAT_SFN_DISPLAY_LOWER | VFAT_SFN_CREATE_WIN95:
-			/* seq_puts(m, ",shortname=lower"); */
+			seq_puts(m, ",shortname=lower");
 			break;
 		default:
 			seq_puts(m, ",shortname=unknown");
@@ -971,7 +971,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
 	opts->codepage = fat_default_codepage;
 	opts->iocharset = fat_default_iocharset;
 	if (is_vfat) {
-		opts->shortname = VFAT_SFN_DISPLAY_LOWER|VFAT_SFN_CREATE_WIN95;
+		opts->shortname = VFAT_SFN_DISPLAY_WINNT|VFAT_SFN_CREATE_WIN95;
 		opts->rodir = 0;
 	} else {
 		opts->shortname = 0;
-- 
cgit v1.1


From ed248b290da7297c9b9a3ff180f5eee4db016224 Mon Sep 17 00:00:00 2001
From: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Date: Sun, 20 Sep 2009 01:31:58 +0900
Subject: fat: Check s_dirt in fat_sync_fs()

If we didn't check sb->s_dirt, it will update the FSINFO
unconditionally. It will reduce the filetime of flash base device.

So, this checks sb->s_dirt. sb->s_dirt is racy, however FSINFO is just
hint. So even if there is race, and we hit it, it would not become big
problem.

And this also is as workaround of suspend problem.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
---
 fs/fat/fat.h   |  2 +-
 fs/fat/inode.c | 14 +++++++++-----
 fs/fat/misc.c  |  8 +++++---
 3 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index adb0e72..7db0979 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -323,7 +323,7 @@ extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
 /* fat/misc.c */
 extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3))) __cold;
-extern void fat_clusters_flush(struct super_block *sb);
+extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
 			      __le16 __time, __le16 __date, u8 time_cs);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 63a5c1a..a8a3afe 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,12 +451,16 @@ static void fat_write_super(struct super_block *sb)
 
 static int fat_sync_fs(struct super_block *sb, int wait)
 {
-	lock_super(sb);
-	fat_clusters_flush(sb);
-	sb->s_dirt = 0;
-	unlock_super(sb);
+	int err = 0;
 
-	return 0;
+	if (sb->s_dirt) {
+		lock_super(sb);
+		sb->s_dirt = 0;
+		err = fat_clusters_flush(sb);
+		unlock_super(sb);
+	}
+
+	return err;
 }
 
 static void fat_put_super(struct super_block *sb)
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c2047..63785a1 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -43,19 +43,19 @@ EXPORT_SYMBOL_GPL(fat_fs_error);
 
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
-void fat_clusters_flush(struct super_block *sb)
+int fat_clusters_flush(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	struct buffer_head *bh;
 	struct fat_boot_fsinfo *fsinfo;
 
 	if (sbi->fat_bits != 32)
-		return;
+		return 0;
 
 	bh = sb_bread(sb, sbi->fsinfo_sector);
 	if (bh == NULL) {
 		printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n");
-		return;
+		return -EIO;
 	}
 
 	fsinfo = (struct fat_boot_fsinfo *)bh->b_data;
@@ -74,6 +74,8 @@ void fat_clusters_flush(struct super_block *sb)
 		mark_buffer_dirty(bh);
 	}
 	brelse(bh);
+
+	return 0;
 }
 
 /*
-- 
cgit v1.1


From 1693918e0b6988cf5eb93b7da34f30e94360a379 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 26 Sep 2009 17:43:59 -0400
Subject: ext4: Use ext4_msg() for ext4_da_writepage() errors

This allows the user to see what filesystem was involved with a
particular ext4_da_writepage() error.  Also, use KERN_CRIT which is
more appropriate than KERN_EMERG.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 50 +++++++++++++++++++++++++-------------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 064746f..5fb72a9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2092,18 +2092,18 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
 static void ext4_print_free_blocks(struct inode *inode)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	printk(KERN_EMERG "Total free blocks count %lld\n",
-			ext4_count_free_blocks(inode->i_sb));
-	printk(KERN_EMERG "Free/Dirty block details\n");
-	printk(KERN_EMERG "free_blocks=%lld\n",
-			(long long)percpu_counter_sum(&sbi->s_freeblocks_counter));
-	printk(KERN_EMERG "dirty_blocks=%lld\n",
-			(long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter));
-	printk(KERN_EMERG "Block reservation details\n");
-	printk(KERN_EMERG "i_reserved_data_blocks=%u\n",
-			EXT4_I(inode)->i_reserved_data_blocks);
-	printk(KERN_EMERG "i_reserved_meta_blocks=%u\n",
-			EXT4_I(inode)->i_reserved_meta_blocks);
+	printk(KERN_CRIT "Total free blocks count %lld\n",
+	       ext4_count_free_blocks(inode->i_sb));
+	printk(KERN_CRIT "Free/Dirty block details\n");
+	printk(KERN_CRIT "free_blocks=%lld\n",
+	       (long long) percpu_counter_sum(&sbi->s_freeblocks_counter));
+	printk(KERN_CRIT "dirty_blocks=%lld\n",
+	       (long long) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+	printk(KERN_CRIT "Block reservation details\n");
+	printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
+	       EXT4_I(inode)->i_reserved_data_blocks);
+	printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+	       EXT4_I(inode)->i_reserved_meta_blocks);
 	return;
 }
 
@@ -2189,14 +2189,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		 * writepage and writepages will again try to write
 		 * the same.
 		 */
-		printk(KERN_EMERG "%s block allocation failed for inode %lu "
-				  "at logical offset %llu with max blocks "
-				  "%zd with error %d\n",
-				  __func__, mpd->inode->i_ino,
-				  (unsigned long long)next,
-				  mpd->b_size >> mpd->inode->i_blkbits, err);
-		printk(KERN_EMERG "This should not happen.!! "
-					"Data will be lost\n");
+		ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+			 "delayed block allocation failed for inode %lu at "
+			 "logical offset %llu with max blocks %zd with "
+			 "error %d\n", mpd->inode->i_ino,
+			 (unsigned long long) next,
+			 mpd->b_size >> mpd->inode->i_blkbits, err);
+		printk(KERN_CRIT "This should not happen!!  "
+		       "Data will be lost\n");
 		if (err == -ENOSPC) {
 			ext4_print_free_blocks(mpd->inode);
 		}
@@ -2822,10 +2822,9 @@ retry:
 		handle = ext4_journal_start(inode, needed_blocks);
 		if (IS_ERR(handle)) {
 			ret = PTR_ERR(handle);
-			printk(KERN_CRIT "%s: jbd2_start: "
+			ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
 			       "%ld pages, ino %lu; err %d\n", __func__,
 				wbc->nr_to_write, inode->i_ino, ret);
-			dump_stack();
 			goto out_writepages;
 		}
 
@@ -2897,9 +2896,10 @@ retry:
 		goto retry;
 	}
 	if (pages_skipped != wbc->pages_skipped)
-		printk(KERN_EMERG "This should not happen leaving %s "
-				"with nr_to_write = %ld ret = %d\n",
-				__func__, wbc->nr_to_write, ret);
+		ext4_msg(inode->i_sb, KERN_CRIT,
+			 "This should not happen leaving %s "
+			 "with nr_to_write = %ld ret = %d\n",
+			 __func__, wbc->nr_to_write, ret);
 
 	/* Update index */
 	index += pages_written;
-- 
cgit v1.1


From 71780577306fd1e76c7a92e3b308db624d03adb9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 28 Sep 2009 00:06:20 -0400
Subject: ext4: Fix hueristic which avoids group preallocation for closed files

The hueristic was designed to avoid using locality group preallocation
when writing the last segment of a closed file.  Fix it by move
setting size to the maximum of size and isize until after we check
whether size == isize.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e9c6189..c73d439 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4189,7 +4189,6 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
 	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
 		>> bsbits;
-	size = max(size, isize);
 
 	if ((size == isize) &&
 	    !ext4_fs_is_busy(sbi) &&
@@ -4199,6 +4198,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 	}
 
 	/* don't use group allocation for large files */
+	size = max(size, isize);
 	if (size >= sbi->s_mb_stream_request) {
 		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
 		return;
-- 
cgit v1.1


From 55138e0bc29c0751e2152df9ad35deea542f29b3 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 29 Sep 2009 13:31:31 -0400
Subject: ext4: Adjust ext4_da_writepages() to write out larger contiguous
 chunks

Work around problems in the writeback code to force out writebacks in
larger chunks than just 4mb, which is just too small.  This also works
around limitations in the ext4 block allocator, which can't allocate
more than 2048 blocks at a time.  So we need to defeat the round-robin
characteristics of the writeback code and try to write out as many
blocks in one inode before allowing the writeback code to move on to
another inode.  We add a a new per-filesystem tunable,
max_writeback_mb_bump, which caps this to a default of 128mb per
inode.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |   1 +
 fs/ext4/inode.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 fs/ext4/super.c |   3 ++
 3 files changed, 97 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e227eea..a58438e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -942,6 +942,7 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
+	unsigned int s_max_writeback_mb_bump;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5fb72a9..20e2d70 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1145,6 +1145,64 @@ static int check_block_validity(struct inode *inode, const char *msg,
 }
 
 /*
+ * Return the number of dirty pages in the given inode starting at
+ * page frame idx.
+ */
+static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+				    unsigned int max_pages)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t	index;
+	struct pagevec pvec;
+	pgoff_t num = 0;
+	int i, nr_pages, done = 0;
+
+	if (max_pages == 0)
+		return 0;
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		index = idx;
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					      PAGECACHE_TAG_DIRTY,
+					      (pgoff_t)PAGEVEC_SIZE);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+			struct buffer_head *bh, *head;
+
+			lock_page(page);
+			if (unlikely(page->mapping != mapping) ||
+			    !PageDirty(page) ||
+			    PageWriteback(page) ||
+			    page->index != idx) {
+				done = 1;
+				unlock_page(page);
+				break;
+			}
+			head = page_buffers(page);
+			bh = head;
+			do {
+				if (!buffer_delay(bh) &&
+				    !buffer_unwritten(bh)) {
+					done = 1;
+					break;
+				}
+			} while ((bh = bh->b_this_page) != head);
+			unlock_page(page);
+			if (done)
+				break;
+			idx++;
+			num++;
+			if (num >= max_pages)
+				break;
+		}
+		pagevec_release(&pvec);
+	}
+	return num;
+}
+
+/*
  * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
@@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
+	unsigned int max_pages;
 	int range_cyclic, cycled = 1, io_done = 0;
-	int needed_blocks, ret = 0, nr_to_writebump = 0;
+	int needed_blocks, ret = 0;
+	long desired_nr_to_write, nr_to_writebump = 0;
 	loff_t range_start = wbc->range_start;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
@@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return -EROFS;
 
-	/*
-	 * Make sure nr_to_write is >= sbi->s_mb_stream_request
-	 * This make sure small files blocks are allocated in
-	 * single attempt. This ensure that small files
-	 * get less fragmented.
-	 */
-	if (wbc->nr_to_write < sbi->s_mb_stream_request) {
-		nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
-		wbc->nr_to_write = sbi->s_mb_stream_request;
-	}
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
@@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping,
 	} else
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
+	/*
+	 * This works around two forms of stupidity.  The first is in
+	 * the writeback code, which caps the maximum number of pages
+	 * written to be 1024 pages.  This is wrong on multiple
+	 * levels; different architectues have a different page size,
+	 * which changes the maximum amount of data which gets
+	 * written.  Secondly, 4 megabytes is way too small.  XFS
+	 * forces this value to be 16 megabytes by multiplying
+	 * nr_to_write parameter by four, and then relies on its
+	 * allocator to allocate larger extents to make them
+	 * contiguous.  Unfortunately this brings us to the second
+	 * stupidity, which is that ext4's mballoc code only allocates
+	 * at most 2048 blocks.  So we force contiguous writes up to
+	 * the number of dirty blocks in the inode, or
+	 * sbi->max_writeback_mb_bump whichever is smaller.
+	 */
+	max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
+	if (!range_cyclic && range_whole)
+		desired_nr_to_write = wbc->nr_to_write * 8;
+	else
+		desired_nr_to_write = ext4_num_dirty_pages(inode, index,
+							   max_pages);
+	if (desired_nr_to_write > max_pages)
+		desired_nr_to_write = max_pages;
+
+	if (wbc->nr_to_write < desired_nr_to_write) {
+		nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
+		wbc->nr_to_write = desired_nr_to_write;
+	}
+
 	mpd.wbc = wbc;
 	mpd.inode = mapping->host;
 
@@ -2914,7 +2994,8 @@ retry:
 out_writepages:
 	if (!no_nrwrite_index_update)
 		wbc->no_nrwrite_index_update = 0;
-	wbc->nr_to_write -= nr_to_writebump;
+	if (wbc->nr_to_write > nr_to_writebump)
+		wbc->nr_to_write -= nr_to_writebump;
 	wbc->range_start = range_start;
 	trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
 	return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index df539ba..1681773 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(mb_order2_req),
 	ATTR_LIST(mb_stream_req),
 	ATTR_LIST(mb_group_prealloc),
+	ATTR_LIST(max_writeback_mb_bump),
 	NULL,
 };
 
@@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
+	sbi->s_max_writeback_mb_bump = 128;
 
 	/*
 	 * set up enough so that it can read an inode
-- 
cgit v1.1


From 9f0ccfd8e07d61b413e6536ffa02fbf60d2e20d8 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Sep 2009 15:49:52 -0400
Subject: ext4: release reserved quota when block reservation for delalloc
 retry

ext4_da_reserve_space() can reserve quota blocks multiple times if
ext4_claim_free_blocks() fail and we retry the allocation. We should
release the quota reservation before restarting.

Bug found by Jan Kara.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 20e2d70..219067c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1854,11 +1854,11 @@ repeat:
 
 	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		vfs_dq_release_reservation_block(inode, total);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
 		}
-		vfs_dq_release_reservation_block(inode, total);
 		return -ENOSPC;
 	}
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
-- 
cgit v1.1


From 0031462b5b392f90d17f1d75abb795883c44e969 Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Sep 2009 15:49:08 -0400
Subject: ext4: Split uninitialized extents for direct I/O

When writing into an unitialized extent via direct I/O, and the direct
I/O doesn't exactly cover the unitialized extent, split the extent
into uninitialized and initialized extents before submitting the I/O.
This avoids needing to deal with an ENOSPC error in the end_io
callback that gets used for direct I/O.

When the IO is complete, the written extent will be marked as initialized.

Singed-Off-By: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h         |  22 ++-
 fs/ext4/ext4_extents.h |   7 +-
 fs/ext4/extents.c      | 423 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/inode.c        |   3 +
 fs/ext4/migrate.c      |   2 +-
 fs/ext4/move_extent.c  |   4 +-
 6 files changed, 419 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a58438e..2b4293a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -128,6 +128,15 @@ struct mpage_da_data {
 	int retval;
 };
 
+typedef struct ext4_io_end {
+	struct inode		*inode;		/* file being written to */
+	unsigned int		flag;		/* sync IO or AIO */
+	int			error;		/* I/O error code */
+	ext4_lblk_t		offset;		/* offset in the file */
+	size_t			size;		/* size of the extent */
+	struct work_struct	work;		/* data work queue */
+} ext4_io_end_t;
+
 /*
  * Special inodes numbers
  */
@@ -347,7 +356,16 @@ struct ext4_new_group_data {
 	/* Call ext4_da_update_reserve_space() after successfully 
 	   allocating the blocks */
 #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008
-
+	/* caller is from the direct IO path, request to creation of an
+	unitialized extents if not allocated, split the uninitialized
+	extent if blocks has been preallocated already*/
+#define EXT4_GET_BLOCKS_DIO			0x0010
+#define EXT4_GET_BLOCKS_CONVERT			0x0020
+#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
+					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+	/* Convert extent to initialized after direct IO complete */
+#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
+					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
 
 /*
  * ioctl commands
@@ -1700,6 +1718,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
+extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+			  loff_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
 			   struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 61652f1..2ca6864 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
 
+static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+{
+	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+}
+
 extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
 				 struct ext4_ext_path *path,
 				 struct ext4_extent *);
 extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
 extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
 							ext_prepare_callback, void *);
 extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a38325..a38e651 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -723,7 +723,7 @@ err:
  * insert new index [@logical;@ptr] into the block at @curp;
  * check where to insert: before @curp or after @curp
  */
-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *curp,
 				int logical, ext4_fsblk_t ptr)
 {
@@ -1586,7 +1586,7 @@ out:
  */
 int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 				struct ext4_ext_path *path,
-				struct ext4_extent *newext)
+				struct ext4_extent *newext, int flag)
 {
 	struct ext4_extent_header *eh;
 	struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 	BUG_ON(path[depth].p_hdr == NULL);
 
 	/* try to insert block into found extent and return */
-	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
+	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+		&& ext4_can_extents_be_merged(inode, ex, newext)) {
 		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
@@ -1722,7 +1723,8 @@ has_space:
 
 merge:
 	/* try to merge extents to the right */
-	ext4_ext_try_to_merge(inode, path, nearex);
+	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+		ext4_ext_try_to_merge(inode, path, nearex);
 
 	/* try to merge extents to the left */
 
@@ -2490,7 +2492,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 }
 
 #define EXT4_EXT_ZERO_LEN 7
-
 /*
  * This function is called by ext4_ext_get_blocks() if someone tries to write
  * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2584,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 			ex3->ee_block = cpu_to_le32(iblock);
 			ext4_ext_store_pblock(ex3, newblock);
 			ex3->ee_len = cpu_to_le16(allocated);
-			err = ext4_ext_insert_extent(handle, inode, path, ex3);
+			err = ext4_ext_insert_extent(handle, inode, path,
+							ex3, 0);
 			if (err == -ENOSPC) {
 				err =  ext4_ext_zeroout(inode, &orig_ex);
 				if (err)
@@ -2639,7 +2641,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 		ext4_ext_store_pblock(ex3, newblock + max_blocks);
 		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
 		ext4_ext_mark_uninitialized(ex3);
-		err = ext4_ext_insert_extent(handle, inode, path, ex3);
+		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
 		if (err == -ENOSPC) {
 			err =  ext4_ext_zeroout(inode, &orig_ex);
 			if (err)
@@ -2757,7 +2759,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 	err = ext4_ext_dirty(handle, inode, path + depth);
 	goto out;
 insert:
-	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
 	if (err == -ENOSPC) {
 		err =  ext4_ext_zeroout(inode, &orig_ex);
 		if (err)
@@ -2785,6 +2787,320 @@ fix_extent_len:
 }
 
 /*
+ * This function is called by ext4_ext_get_blocks() from
+ * ext4_get_blocks_dio_write() when DIO to write
+ * to an uninitialized extent.
+ *
+ * Writing to an uninitized extent may result in splitting the uninitialized
+ * extent into multiple /intialized unintialized extents (up to three)
+ * There are three possibilities:
+ *   a> There is no split required: Entire extent should be uninitialized
+ *   b> Splits in two extents: Write is happening at either end of the extent
+ *   c> Splits in three extents: Somone is writing in middle of the extent
+ *
+ * One of more index blocks maybe needed if the extent tree grow after
+ * the unintialized extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the uninitialized extent before DIO submit
+ * the IO. The uninitilized extent called at this time will be split
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
+ */
+static int ext4_split_unwritten_extents(handle_t *handle,
+					struct inode *inode,
+					struct ext4_ext_path *path,
+					ext4_lblk_t iblock,
+					unsigned int max_blocks,
+					int flags)
+{
+	struct ext4_extent *ex, newex, orig_ex;
+	struct ext4_extent *ex1 = NULL;
+	struct ext4_extent *ex2 = NULL;
+	struct ext4_extent *ex3 = NULL;
+	struct ext4_extent_header *eh;
+	ext4_lblk_t ee_block;
+	unsigned int allocated, ee_len, depth;
+	ext4_fsblk_t newblock;
+	int err = 0;
+	int ret = 0;
+
+	ext_debug("ext4_split_unwritten_extents: inode %lu,"
+		  "iblock %llu, max_blocks %u\n", inode->i_ino,
+		  (unsigned long long)iblock, max_blocks);
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+	ee_block = le32_to_cpu(ex->ee_block);
+	ee_len = ext4_ext_get_actual_len(ex);
+	allocated = ee_len - (iblock - ee_block);
+	newblock = iblock - ee_block + ext_pblock(ex);
+	ex2 = ex;
+	orig_ex.ee_block = ex->ee_block;
+	orig_ex.ee_len   = cpu_to_le16(ee_len);
+	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+	/*
+ 	 * if the entire unintialized extent length less than
+ 	 * the size of extent to write, there is no need to split
+ 	 * uninitialized extent
+ 	 */
+ 	if (allocated <= max_blocks)
+		return ret;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* ex1: ee_block to iblock - 1 : uninitialized */
+	if (iblock > ee_block) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/*
+	 * for sanity, update the length of the ex2 extent before
+	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
+	 * overlap of blocks.
+	 */
+	if (!ex1 && allocated > max_blocks)
+		ex2->ee_len = cpu_to_le16(max_blocks);
+	/* ex3: to ee_block + ee_len : uninitialised */
+	if (allocated > max_blocks) {
+		unsigned int newdepth;
+		ex3 = &newex;
+		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+		ext4_ext_store_pblock(ex3, newblock + max_blocks);
+		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+		ext4_ext_mark_uninitialized(ex3);
+		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+		if (err == -ENOSPC) {
+			err =  ext4_ext_zeroout(inode, &orig_ex);
+			if (err)
+				goto fix_extent_len;
+			/* update the extent length and mark as initialized */
+			ex->ee_block = orig_ex.ee_block;
+			ex->ee_len   = orig_ex.ee_len;
+			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+			ext4_ext_dirty(handle, inode, path + depth);
+			/* zeroed the full extent */
+			/* blocks available from iblock */
+			return allocated;
+
+		} else if (err)
+			goto fix_extent_len;
+		/*
+		 * The depth, and hence eh & ex might change
+		 * as part of the insert above.
+		 */
+		newdepth = ext_depth(inode);
+		/*
+		 * update the extent length after successful insert of the
+		 * split extent
+		 */
+		orig_ex.ee_len = cpu_to_le16(ee_len -
+						ext4_ext_get_actual_len(ex3));
+		depth = newdepth;
+		ext4_ext_drop_refs(path);
+		path = ext4_ext_find_extent(inode, iblock, path);
+		if (IS_ERR(path)) {
+			err = PTR_ERR(path);
+			goto out;
+		}
+		eh = path[depth].p_hdr;
+		ex = path[depth].p_ext;
+		if (ex2 != &newex)
+			ex2 = ex;
+
+		err = ext4_ext_get_access(handle, inode, path + depth);
+		if (err)
+			goto out;
+
+		allocated = max_blocks;
+	}
+	/*
+	 * If there was a change of depth as part of the
+	 * insertion of ex3 above, we need to update the length
+	 * of the ex1 extent again here
+	 */
+	if (ex1 && ex1 != ex) {
+		ex1 = ex;
+		ex1->ee_len = cpu_to_le16(iblock - ee_block);
+		ext4_ext_mark_uninitialized(ex1);
+		ex2 = &newex;
+	}
+	/*
+	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+	 * uninitialised still.
+	 */
+	ex2->ee_block = cpu_to_le32(iblock);
+	ext4_ext_store_pblock(ex2, newblock);
+	ex2->ee_len = cpu_to_le16(allocated);
+	ext4_ext_mark_uninitialized(ex2);
+	if (ex2 != ex)
+		goto insert;
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+	ext_debug("out here\n");
+	goto out;
+insert:
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+	if (err == -ENOSPC) {
+		err =  ext4_ext_zeroout(inode, &orig_ex);
+		if (err)
+			goto fix_extent_len;
+		/* update the extent length and mark as initialized */
+		ex->ee_block = orig_ex.ee_block;
+		ex->ee_len   = orig_ex.ee_len;
+		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+		ext4_ext_dirty(handle, inode, path + depth);
+		/* zero out the first half */
+		return allocated;
+	} else if (err)
+		goto fix_extent_len;
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err ? err : allocated;
+
+fix_extent_len:
+	ex->ee_block = orig_ex.ee_block;
+	ex->ee_len   = orig_ex.ee_len;
+	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
+	ext4_ext_mark_uninitialized(ex);
+	ext4_ext_dirty(handle, inode, path + depth);
+	return err;
+}
+static int ext4_convert_unwritten_extents_dio(handle_t *handle,
+					      struct inode *inode,
+					      struct ext4_ext_path *path)
+{
+	struct ext4_extent *ex;
+	struct ext4_extent_header *eh;
+	int depth;
+	int err = 0;
+	int ret = 0;
+
+	depth = ext_depth(inode);
+	eh = path[depth].p_hdr;
+	ex = path[depth].p_ext;
+
+	err = ext4_ext_get_access(handle, inode, path + depth);
+	if (err)
+		goto out;
+	/* first mark the extent as initialized */
+	ext4_ext_mark_initialized(ex);
+
+	/*
+	 * We have to see if it can be merged with the extent
+	 * on the left.
+	 */
+	if (ex > EXT_FIRST_EXTENT(eh)) {
+		/*
+		 * To merge left, pass "ex - 1" to try_to_merge(),
+		 * since it merges towards right _only_.
+		 */
+		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
+		if (ret) {
+			err = ext4_ext_correct_indexes(handle, inode, path);
+			if (err)
+				goto out;
+			depth = ext_depth(inode);
+			ex--;
+		}
+	}
+	/*
+	 * Try to Merge towards right.
+	 */
+	ret = ext4_ext_try_to_merge(inode, path, ex);
+	if (ret) {
+		err = ext4_ext_correct_indexes(handle, inode, path);
+		if (err)
+			goto out;
+		depth = ext_depth(inode);
+	}
+	/* Mark modified extent as dirty */
+	err = ext4_ext_dirty(handle, inode, path + depth);
+out:
+	ext4_ext_show_leaf(inode, path);
+	return err;
+}
+
+static int
+ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, unsigned int max_blocks,
+			struct ext4_ext_path *path, int flags,
+			unsigned int allocated, struct buffer_head *bh_result,
+			ext4_fsblk_t newblock)
+{
+	int ret = 0;
+	int err = 0;
+
+	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+		  "block %llu, max_blocks %u, flags %d, allocated %u",
+		  inode->i_ino, (unsigned long long)iblock, max_blocks,
+		  flags, allocated);
+	ext4_ext_show_leaf(inode, path);
+
+	/* DIO get_block() before submit the IO, split the extent */
+	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
+		ret = ext4_split_unwritten_extents(handle,
+						inode, path, iblock,
+						max_blocks, flags);
+		goto out;
+	}
+	/* DIO end_io complete, convert the filled extent to written */
+	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+		ret = ext4_convert_unwritten_extents_dio(handle, inode,
+							path);
+		goto out2;
+	}
+	/* buffered IO case */
+	/*
+	 * repeat fallocate creation request
+	 * we already have an unwritten extent
+	 */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+		goto map_out;
+
+	/* buffered READ or buffered write_begin() lookup */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		/*
+		 * We have blocks reserved already.  We
+		 * return allocated blocks so that delalloc
+		 * won't do block reservation for us.  But
+		 * the buffer head will be unmapped so that
+		 * a read from the block returns 0s.
+		 */
+		set_buffer_unwritten(bh_result);
+		goto out1;
+	}
+
+	/* buffered write, writepage time, convert*/
+	ret = ext4_ext_convert_to_initialized(handle, inode,
+						path, iblock,
+						max_blocks);
+out:
+	if (ret <= 0) {
+		err = ret;
+		goto out2;
+	} else
+		allocated = ret;
+	set_buffer_new(bh_result);
+map_out:
+	set_buffer_mapped(bh_result);
+out1:
+	if (allocated > max_blocks)
+		allocated = max_blocks;
+	ext4_ext_show_leaf(inode, path);
+	bh_result->b_bdev = inode->i_sb->s_bdev;
+	bh_result->b_blocknr = newblock;
+out2:
+	if (path) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+	return err ? err : allocated;
+}
+/*
  * Block allocation/map/preallocation routine for extents based files
  *
  *
@@ -2889,33 +3205,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 							EXT4_EXT_CACHE_EXTENT);
 				goto out;
 			}
-			if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
-				goto out;
-			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
-				if (allocated > max_blocks)
-					allocated = max_blocks;
-				/*
-				 * We have blocks reserved already.  We
-				 * return allocated blocks so that delalloc
-				 * won't do block reservation for us.  But
-				 * the buffer head will be unmapped so that
-				 * a read from the block returns 0s.
-				 */
-				set_buffer_unwritten(bh_result);
-				bh_result->b_bdev = inode->i_sb->s_bdev;
-				bh_result->b_blocknr = newblock;
-				goto out2;
-			}
-
-			ret = ext4_ext_convert_to_initialized(handle, inode,
-								path, iblock,
-								max_blocks);
-			if (ret <= 0) {
-				err = ret;
-				goto out2;
-			} else
-				allocated = ret;
-			goto outnew;
+			ret = ext4_ext_handle_uninitialized_extents(handle,
+					inode, iblock, max_blocks, path,
+					flags, allocated, bh_result, newblock);
+			return ret;
 		}
 	}
 
@@ -2988,7 +3281,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newex.ee_len = cpu_to_le16(ar.len);
 	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
 		ext4_ext_mark_uninitialized(&newex);
-	err = ext4_ext_insert_extent(handle, inode, path, &newex);
+	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
@@ -3002,7 +3295,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* previous routine could use block we allocated */
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
-outnew:
 	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
@@ -3201,6 +3493,63 @@ retry:
 }
 
 /*
+ * This function convert a range of blocks to written extents
+ * The caller of this function will pass the start offset and the size.
+ * all unwritten extents within this range will be converted to
+ * written extents.
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
+ */
+int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+				    loff_t len)
+{
+	handle_t *handle;
+	ext4_lblk_t block;
+	unsigned int max_blocks;
+	int ret = 0;
+	int ret2 = 0;
+	struct buffer_head map_bh;
+	unsigned int credits, blkbits = inode->i_blkbits;
+
+	block = offset >> blkbits;
+	/*
+	 * We can't just convert len to max_blocks because
+	 * If blocksize = 4096 offset = 3072 and len = 2048
+	 */
+	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+							- block;
+	/*
+	 * credits to insert 1 extent into extent tree
+	 */
+	credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	while (ret >= 0 && ret < max_blocks) {
+		block = block + ret;
+		max_blocks = max_blocks - ret;
+		handle = ext4_journal_start(inode, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			break;
+		}
+		map_bh.b_state = 0;
+		ret = ext4_get_blocks(handle, inode, block,
+				      max_blocks, &map_bh,
+				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
+		if (ret <= 0) {
+			WARN_ON(ret <= 0);
+			printk(KERN_ERR "%s: ext4_ext_get_blocks "
+				    "returned error inode#%lu, block=%u, "
+				    "max_blocks=%u", __func__,
+				    inode->i_ino, block, max_blocks);
+		}
+		ext4_mark_inode_dirty(handle, inode);
+		ret2 = ext4_journal_stop(handle);
+		if (ret <= 0 || ret2 )
+			break;
+	}
+	return ret > 0 ? ret2 : ret;
+}
+/*
  * Callback function called for each extent to gather FIEMAP information.
  */
 static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 219067c..da4f2ec 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1233,6 +1233,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	clear_buffer_mapped(bh);
 	clear_buffer_unwritten(bh);
 
+	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+		  "logical block %lu\n", inode->i_ino, flags, max_blocks,
+		  (unsigned long)block);
 	/*
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index bf519f2..a93d5b8 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
 				goto err_out;
 		}
 	}
-	retval = ext4_ext_insert_extent(handle, inode, path, &newext);
+	retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
 err_out:
 	if (path) {
 		ext4_ext_drop_refs(path);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c07a291..5332fd4 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
-					orig_path, new_ext))
+					orig_path, new_ext, 0))
 			goto out;
 	}
 
@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
 			goto out;
 
 		if (ext4_ext_insert_extent(handle, orig_inode,
-					   orig_path, end_ext))
+					   orig_path, end_ext, 0))
 			goto out;
 	}
 out:
-- 
cgit v1.1


From 4c0425ff68b1b87b802ffeda7b6a46ff7da7241c Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Sep 2009 15:48:41 -0400
Subject: ext4: Use end_io callback to avoid direct I/O fallback to buffered
 I/O

Currently the DIO VFS code passes create = 0 when writing to the
middle of file.  It does this to avoid block allocation for holes, so
as not to expose stale data out when there is a parallel buffered read
(which does not hold the i_mutex lock).  Direct I/O writes into holes
falls back to buffered IO for this reason.

Since preallocated extents are treated as holes when doing a
get_block() look up (buffer is not mapped), direct IO over fallocate
also falls back to buffered IO.  Thus ext4 actually silently falls
back to buffered IO in above two cases, which is undesirable.

To fix this, this patch creates unitialized extents when a direct I/O
write into holes in sparse files, and registering an end_io callback which
converts the uninitialized extent to an initialized extent after the
I/O is completed.

Singed-Off-By: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |   3 +
 fs/ext4/inode.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/super.c |  11 ++++
 3 files changed, 210 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2b4293a..ccb4dbf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -999,6 +999,9 @@ struct ext4_sb_info {
 
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
+
+	/* workqueue for dio unwritten */
+	struct workqueue_struct *dio_unwritten_wq;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index da4f2ec..5633af6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,6 +37,7 @@
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/workqueue.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -3356,6 +3357,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 }
 
 /*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list.  So recovery will truncate it back to the original size
  * if the machine crashes during the write.
@@ -3364,7 +3367,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
  * crashes then stale disk data _may_ be exposed inside the file. But current
  * VFS code falls back into buffered path in that case so we are safe.
  */
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
 {
@@ -3438,6 +3441,198 @@ out:
 	return ret;
 }
 
+/* Maximum number of blocks we map for direct IO at once. */
+
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int dio_credits;
+
+	/*
+	 * DIO VFS code passes create = 0 flag for write to
+	 * the middle of file. It does this to avoid block
+	 * allocation for holes, to prevent expose stale data
+	 * out when there is parallel buffered read (which does
+	 * not hold the i_mutex lock) while direct IO write has
+	 * not completed. DIO request on holes finally falls back
+	 * to buffered IO for this reason.
+	 *
+	 * For ext4 extent based file, since we support fallocate,
+	 * new allocated extent as uninitialized, for holes, we
+	 * could fallocate blocks for holes, thus parallel
+	 * buffered IO read will zero out the page when read on
+	 * a hole while parallel DIO write to the hole has not completed.
+	 *
+	 * when we come here, we know it's a direct IO write to
+	 * to the middle of file (<i_size)
+	 * so it's safe to override the create flag from VFS.
+	 */
+	create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+	if (max_blocks > DIO_MAX_BLOCKS)
+		max_blocks = DIO_MAX_BLOCKS;
+	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	handle = ext4_journal_start(inode, dio_credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+			      create);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
+
+#define		DIO_AIO		0x1
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+	kfree(io);
+}
+
+/*
+ * IO write completion for unwritten extents.
+ *
+ * check a range of space and convert unwritten extents to written.
+ */
+static void ext4_end_dio_unwritten(struct work_struct *work)
+{
+	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	size_t size = io->size;
+	int ret = 0;
+	int aio = io->flag & DIO_AIO;
+
+	if (aio)
+		mutex_lock(&inode->i_mutex);
+	if (offset + size <= i_size_read(inode))
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+	if (ret < 0)
+		printk(KERN_EMERG "%s: failed to convert unwritten"
+			"extents to written extents, error is %d\n",
+                       __func__, ret);
+
+	ext4_free_io_end(io);
+	if (aio)
+		mutex_unlock(&inode->i_mutex);
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
+{
+	ext4_io_end_t *io = NULL;
+
+	io = kmalloc(sizeof(*io), GFP_NOFS);
+
+	if (io) {
+		io->inode = inode;
+		io->flag = flag;
+		io->offset = 0;
+		io->size = 0;
+		io->error = 0;
+		INIT_WORK(&io->work, ext4_end_dio_unwritten);
+	}
+
+	return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+			    ssize_t size, void *private)
+{
+        ext4_io_end_t *io_end = iocb->private;
+	struct workqueue_struct *wq;
+
+	/* if not hole or unwritten extents, just simple return */
+	if (!io_end || !size || !iocb->private)
+		return;
+	io_end->offset = offset;
+	io_end->size = size;
+	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+	/* We need to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+
+        if (is_sync_kiocb(iocb))
+		flush_workqueue(wq);
+
+	iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * When end_io call back function called at the last IO complete time,
+ * those extents will be converted to written extents.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+	size_t count = iov_length(iov, nr_segs);
+
+	loff_t final_size = offset + count;
+	if (rw == WRITE && final_size <= inode->i_size) {
+		/*
+ 		 * For DIO we fallocate blocks for holes, we fallocate blocks
+ 		 * The fallocated extent for hole is marked as uninitialized
+ 		 * to prevent paralel buffered read to expose the stale data
+ 		 * before DIO complete the data IO.
+ 		 * as for previously fallocated extents, ext4 get_block
+ 		 * will just simply mark the buffer mapped but still
+ 		 * keep the extents uninitialized.
+ 		 *
+ 		 * At the end of IO, the ext4 end_io callback function
+ 		 * will convert those unwritten extents to written,
+ 		 *
+ 		 */
+		iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
+		if (!iocb->private)
+			return -ENOMEM;
+		ret = blockdev_direct_IO(rw, iocb, inode,
+					 inode->i_sb->s_bdev, iov,
+					 offset, nr_segs,
+					 ext4_get_block_dio_write,
+					 ext4_end_io_dio);
+		return ret;
+	}
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1681773..1a03ea9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -580,6 +580,9 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	flush_workqueue(sbi->dio_unwritten_wq);
+	destroy_workqueue(sbi->dio_unwritten_wq);
+
 	lock_super(sb);
 	lock_kernel();
 	if (sb->s_dirt)
@@ -2801,6 +2804,12 @@ no_journal:
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
 	}
+	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+	if (!EXT4_SB(sb)->dio_unwritten_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+		goto failed_mount_wq;
+	}
+
 	/*
 	 * The jbd2_journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -2913,6 +2922,8 @@ cantfind_ext4:
 
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
+	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
 	ext4_release_system_zone(sb);
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
-- 
cgit v1.1


From 8d5d02e6b176565c77ff03604908b1453a22044d Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Mon, 28 Sep 2009 15:48:29 -0400
Subject: ext4: async direct IO for holes and fallocate support

For async direct IO that covers holes or fallocate, the end_io
callback function now queued the convertion work on workqueue but
don't flush the work rightaway as it might take too long to afford.

But when fsync is called after all the data is completed, user expects
the metadata also being updated before fsync returns.

Thus we need to flush the conversion work when fsync() is called.
This patch keep track of a listed of completed async direct io that
has a work queued on workqueue.  When fsync() is called, it will go
through the list and do the conversion.

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
---
 fs/ext4/ext4.h    |  12 ++-
 fs/ext4/extents.c |  19 ++++-
 fs/ext4/fsync.c   |   5 ++
 fs/ext4/inode.c   | 231 +++++++++++++++++++++++++++++++++++++++++++++---------
 fs/ext4/super.c   |   8 +-
 5 files changed, 234 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ccb4dbf..b491576 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -127,10 +127,11 @@ struct mpage_da_data {
 	int pages_written;
 	int retval;
 };
-
+#define	DIO_AIO_UNWRITTEN	0x1
 typedef struct ext4_io_end {
+	struct list_head	list;		/* per-file finished AIO list */
 	struct inode		*inode;		/* file being written to */
-	unsigned int		flag;		/* sync IO or AIO */
+	unsigned int		flag;		/* unwritten or not */
 	int			error;		/* I/O error code */
 	ext4_lblk_t		offset;		/* offset in the file */
 	size_t			size;		/* size of the extent */
@@ -690,6 +691,11 @@ struct ext4_inode_info {
 	__u16 i_extra_isize;
 
 	spinlock_t i_block_reservation_lock;
+
+	/* completed async DIOs that might need unwritten extents handling */
+	struct list_head i_aio_dio_complete_list;
+	/* current io_end structure for async DIO write*/
+	ext4_io_end_t *cur_aio_dio;
 };
 
 /*
@@ -1419,7 +1425,7 @@ extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t ext4_get_reserved_space(struct inode *inode);
-
+extern int flush_aio_dio_completed_IO(struct inode *inode);
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a38e651..10a6309 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3033,6 +3033,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 {
 	int ret = 0;
 	int err = 0;
+	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
 	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
 		  "block %llu, max_blocks %u, flags %d, allocated %u",
@@ -3045,6 +3046,9 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 		ret = ext4_split_unwritten_extents(handle,
 						inode, path, iblock,
 						max_blocks, flags);
+		/* flag the io_end struct that we need convert when IO done */
+		if (io)
+			io->flag = DIO_AIO_UNWRITTEN;
 		goto out;
 	}
 	/* DIO end_io complete, convert the filled extent to written */
@@ -3130,6 +3134,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
+	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -3279,8 +3284,20 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(ar.len);
-	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
+	/* Mark uninitialized */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
 		ext4_ext_mark_uninitialized(&newex);
+		/*
+		 * io_end structure was created for every async
+		 * direct IO write to the middle of the file.
+		 * To avoid unecessary convertion for every aio dio rewrite
+		 * to the mid of file, here we flag the IO that is really
+		 * need the convertion.
+		 *
+		 */
+		if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+			io->flag = DIO_AIO_UNWRITTEN;
+	}
 	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
 	if (err) {
 		/* free data blocks we just allocated */
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0747574..2b15312 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,6 +44,8 @@
  *
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
+ *
+ * i_mutex lock is held when entering and exiting this function
  */
 
 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
 
 	trace_ext4_sync_file(file, dentry, datasync);
 
+	ret = flush_aio_dio_completed_IO(inode);
+	if (ret < 0)
+		goto out;
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5633af6..118e16c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3451,6 +3451,8 @@ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 	int dio_credits;
 
+	ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
+		   inode->i_ino, create);
 	/*
 	 * DIO VFS code passes create = 0 flag for write to
 	 * the middle of file. It does this to avoid block
@@ -3491,55 +3493,152 @@ out:
 	return ret;
 }
 
-#define		DIO_AIO		0x1
-
 static void ext4_free_io_end(ext4_io_end_t *io)
 {
+	BUG_ON(!io);
+	iput(io->inode);
 	kfree(io);
 }
+static void dump_aio_dio_list(struct inode * inode)
+{
+#ifdef	EXT4_DEBUG
+	struct list_head *cur, *before, *after;
+	ext4_io_end_t *io, *io0, *io1;
+
+	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+		ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
+		return;
+	}
+
+	ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
+	list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
+		cur = &io->list;
+		before = cur->prev;
+		io0 = container_of(before, ext4_io_end_t, list);
+		after = cur->next;
+		io1 = container_of(after, ext4_io_end_t, list);
+
+		ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
+			    io, inode->i_ino, io0, io1);
+	}
+#endif
+}
 
 /*
- * IO write completion for unwritten extents.
- *
  * check a range of space and convert unwritten extents to written.
  */
-static void ext4_end_dio_unwritten(struct work_struct *work)
+static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
 {
-	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
 	struct inode *inode = io->inode;
 	loff_t offset = io->offset;
 	size_t size = io->size;
 	int ret = 0;
-	int aio = io->flag & DIO_AIO;
 
-	if (aio)
-		mutex_lock(&inode->i_mutex);
+	ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
+		   "list->prev 0x%p\n",
+	           io, inode->i_ino, io->list.next, io->list.prev);
+
+	if (list_empty(&io->list))
+		return ret;
+
+	if (io->flag != DIO_AIO_UNWRITTEN)
+		return ret;
+
 	if (offset + size <= i_size_read(inode))
 		ret = ext4_convert_unwritten_extents(inode, offset, size);
 
-	if (ret < 0)
+	if (ret < 0) {
 		printk(KERN_EMERG "%s: failed to convert unwritten"
-			"extents to written extents, error is %d\n",
-                       __func__, ret);
+			"extents to written extents, error is %d"
+			" io is still on inode %lu aio dio list\n",
+                       __func__, ret, inode->i_ino);
+		return ret;
+	}
+
+	/* clear the DIO AIO unwritten flag */
+	io->flag = 0;
+	return ret;
+}
+/*
+ * work on completed aio dio IO, to convert unwritten extents to extents
+ */
+static void ext4_end_aio_dio_work(struct work_struct *work)
+{
+	ext4_io_end_t *io  = container_of(work, ext4_io_end_t, work);
+	struct inode *inode = io->inode;
+	int ret = 0;
+
+	mutex_lock(&inode->i_mutex);
+	ret = ext4_end_aio_dio_nolock(io);
+	if (ret >= 0) {
+		if (!list_empty(&io->list))
+			list_del_init(&io->list);
+		ext4_free_io_end(io);
+	}
+	mutex_unlock(&inode->i_mutex);
+}
+/*
+ * This function is called from ext4_sync_file().
+ *
+ * When AIO DIO IO is completed, the work to convert unwritten
+ * extents to written is queued on workqueue but may not get immediately
+ * scheduled. When fsync is called, we need to ensure the
+ * conversion is complete before fsync returns.
+ * The inode keeps track of a list of completed AIO from DIO path
+ * that might needs to do the conversion. This function walks through
+ * the list and convert the related unwritten extents to written.
+ */
+int flush_aio_dio_completed_IO(struct inode *inode)
+{
+	ext4_io_end_t *io;
+	int ret = 0;
+	int ret2 = 0;
 
-	ext4_free_io_end(io);
-	if (aio)
-		mutex_unlock(&inode->i_mutex);
+	if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
+		return ret;
+
+	dump_aio_dio_list(inode);
+	while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
+		io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
+				ext4_io_end_t, list);
+		/*
+		 * Calling ext4_end_aio_dio_nolock() to convert completed
+		 * IO to written.
+		 *
+		 * When ext4_sync_file() is called, run_queue() may already
+		 * about to flush the work corresponding to this io structure.
+		 * It will be upset if it founds the io structure related
+		 * to the work-to-be schedule is freed.
+		 *
+		 * Thus we need to keep the io structure still valid here after
+		 * convertion finished. The io structure has a flag to
+		 * avoid double converting from both fsync and background work
+		 * queue work.
+		 */
+		ret = ext4_end_aio_dio_nolock(io);
+		if (ret < 0)
+			ret2 = ret;
+		else
+			list_del_init(&io->list);
+	}
+	return (ret2 < 0) ? ret2 : 0;
 }
 
-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
 {
 	ext4_io_end_t *io = NULL;
 
 	io = kmalloc(sizeof(*io), GFP_NOFS);
 
 	if (io) {
+		igrab(inode);
 		io->inode = inode;
-		io->flag = flag;
+		io->flag = 0;
 		io->offset = 0;
 		io->size = 0;
 		io->error = 0;
-		INIT_WORK(&io->work, ext4_end_dio_unwritten);
+		INIT_WORK(&io->work, ext4_end_aio_dio_work);
+		INIT_LIST_HEAD(&io->list);
 	}
 
 	return io;
@@ -3551,19 +3650,31 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
         ext4_io_end_t *io_end = iocb->private;
 	struct workqueue_struct *wq;
 
-	/* if not hole or unwritten extents, just simple return */
-	if (!io_end || !size || !iocb->private)
+	ext_debug("ext4_end_io_dio(): io_end 0x%p"
+		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ 		  iocb->private, io_end->inode->i_ino, iocb, offset,
+		  size);
+	/* if not async direct IO or dio with 0 bytes write, just return */
+	if (!io_end || !size)
 		return;
+
+	/* if not aio dio with unwritten extents, just free io and return */
+	if (io_end->flag != DIO_AIO_UNWRITTEN){
+		ext4_free_io_end(io_end);
+		iocb->private = NULL;
+		return;
+	}
+
 	io_end->offset = offset;
 	io_end->size = size;
 	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
 
-	/* We need to convert unwritten extents to written */
+	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
 
-        if (is_sync_kiocb(iocb))
-		flush_workqueue(wq);
-
+	/* Add the io_end to per-inode completed aio dio list*/
+	list_add_tail(&io_end->list,
+		 &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
 	iocb->private = NULL;
 }
 /*
@@ -3575,8 +3686,10 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * If those blocks were preallocated, we mark sure they are splited, but
  * still keep the range to write as unintialized.
  *
- * When end_io call back function called at the last IO complete time,
- * those extents will be converted to written extents.
+ * The unwrritten extents will be converted to written when DIO is completed.
+ * For async direct IO, since the IO may still pending when return, we
+ * set up an end_io call back function, which will do the convertion
+ * when async direct IO completed.
  *
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list.  So recovery will truncate it back to the original size
@@ -3595,28 +3708,76 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	loff_t final_size = offset + count;
 	if (rw == WRITE && final_size <= inode->i_size) {
 		/*
- 		 * For DIO we fallocate blocks for holes, we fallocate blocks
- 		 * The fallocated extent for hole is marked as uninitialized
+ 		 * We could direct write to holes and fallocate.
+		 *
+ 		 * Allocated blocks to fill the hole are marked as uninitialized
  		 * to prevent paralel buffered read to expose the stale data
  		 * before DIO complete the data IO.
- 		 * as for previously fallocated extents, ext4 get_block
+		 *
+ 		 * As to previously fallocated extents, ext4 get_block
  		 * will just simply mark the buffer mapped but still
  		 * keep the extents uninitialized.
  		 *
- 		 * At the end of IO, the ext4 end_io callback function
- 		 * will convert those unwritten extents to written,
- 		 *
+		 * for non AIO case, we will convert those unwritten extents
+		 * to written after return back from blockdev_direct_IO.
+		 *
+		 * for async DIO, the conversion needs to be defered when
+		 * the IO is completed. The ext4 end_io callback function
+		 * will be called to take care of the conversion work.
+		 * Here for async case, we allocate an io_end structure to
+		 * hook to the iocb.
  		 */
-		iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
-		if (!iocb->private)
-			return -ENOMEM;
+		iocb->private = NULL;
+		EXT4_I(inode)->cur_aio_dio = NULL;
+		if (!is_sync_kiocb(iocb)) {
+			iocb->private = ext4_init_io_end(inode);
+			if (!iocb->private)
+				return -ENOMEM;
+			/*
+			 * we save the io structure for current async
+			 * direct IO, so that later ext4_get_blocks()
+			 * could flag the io structure whether there
+			 * is a unwritten extents needs to be converted
+			 * when IO is completed.
+			 */
+			EXT4_I(inode)->cur_aio_dio = iocb->private;
+		}
+
 		ret = blockdev_direct_IO(rw, iocb, inode,
 					 inode->i_sb->s_bdev, iov,
 					 offset, nr_segs,
 					 ext4_get_block_dio_write,
 					 ext4_end_io_dio);
+		if (iocb->private)
+			EXT4_I(inode)->cur_aio_dio = NULL;
+		/*
+		 * The io_end structure takes a reference to the inode,
+		 * that structure needs to be destroyed and the
+		 * reference to the inode need to be dropped, when IO is
+		 * complete, even with 0 byte write, or failed.
+		 *
+		 * In the successful AIO DIO case, the io_end structure will be
+		 * desctroyed and the reference to the inode will be dropped
+		 * after the end_io call back function is called.
+		 *
+		 * In the case there is 0 byte write, or error case, since
+		 * VFS direct IO won't invoke the end_io call back function,
+		 * we need to free the end_io structure here.
+		 */
+		if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+			ext4_free_io_end(iocb->private);
+			iocb->private = NULL;
+		} else if (ret > 0)
+			/*
+			 * for non AIO case, since the IO is already
+			 * completed, we could do the convertion right here
+			 */
+			ret = ext4_convert_unwritten_extents(inode,
+								offset, ret);
 		return ret;
 	}
+
+	/* for write the the end of file case, we fall back to old way */
 	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1a03ea9..f095c60 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_allocated_meta_blocks = 0;
 	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
+	INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+	ei->cur_aio_dio = NULL;
 
 	return &ei->vfs_inode;
 }
@@ -3375,11 +3377,13 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
 	tid_t target;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	trace_ext4_sync_fs(sb, wait);
-	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+	flush_workqueue(sbi->dio_unwritten_wq);
+	if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
 		if (wait)
-			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
+			jbd2_log_wait_commit(sbi->s_journal, target);
 	}
 	return ret;
 }
-- 
cgit v1.1


From f3ce8064b388ccf420012c5a4907aae4f13fe9d0 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 28 Sep 2009 15:58:29 -0400
Subject: ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor
 inodes first

Move the check to make sure the original and donor inodes are
different earlier, to avoid a potential deadlock by trying to lock the
same inode twice.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/move_extent.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 5332fd4..25b6b14 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}
 
-	/* orig and donor should be different file */
-	if (orig_inode->i_ino == donor_inode->i_ino) {
-		ext4_debug("ext4 move extent: The argument files should not "
-			"be same file [ino:orig %lu, donor %lu]\n",
-			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
-	}
-
 	/* Ext4 move extent supports only extent based file */
 	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
 		ext4_debug("ext4 move extent: orig file is not extents "
@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	int block_len_in_page;
 	int uninit;
 
+	/* orig and donor should be different file */
+	if (orig_inode->i_ino == donor_inode->i_ino) {
+		ext4_debug("ext4 move extent: The argument files should not "
+			"be same file [ino:orig %lu, donor %lu]\n",
+			orig_inode->i_ino, donor_inode->i_ino);
+		return -EINVAL;
+	}
+
 	/* protect orig and donor against a truncate */
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
-- 
cgit v1.1


From 9ed74f2dba6ebf9f30b80554290bfc73cc3ef083 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 11 Sep 2009 16:12:44 -0400
Subject: Btrfs: proper -ENOSPC handling

At the start of a transaction we do a btrfs_reserve_metadata_space() and
specify how many items we plan on modifying.  Then once we've done our
modifications and such, just call btrfs_unreserve_metadata_space() for
the same number of items we reserved.

For keeping track of metadata needed for data I've had to add an extent_io op
for when we merge extents.  This lets us track space properly when we are doing
sequential writes, so we don't end up reserving way more metadata space than
what we need.

The only place where the metadata space accounting is not done is in the
relocation code.  This is because Yan is going to be reworking that code in the
near future, so running btrfs-vol -b could still possibly result in a ENOSPC
related panic.  This patch also turns off the metadata_ratio stuff in order to
allow users to more efficiently use their disk space.

This patch makes it so we track how much metadata we need for an inode's
delayed allocation extents by tracking how many extents are currently
waiting for allocation.  It introduces two new callbacks for the
extent_io tree's, merge_extent_hook and split_extent_hook.  These help
us keep track of when we merge delalloc extents together and split them
up.  Reservations are handled prior to any actually dirty'ing occurs,
and then we unreserve after we dirty.

btrfs_unreserve_metadata_for_delalloc() will make the appropriate
unreservations as needed based on the number of reservations we
currently have and the number of extents we currently have.  Doing the
reservation outside of doing any of the actual dirty'ing lets us do
things like filemap_flush() the inode to try and force delalloc to
happen, or as a last resort actually start allocation on all delalloc
inodes in the fs.  This has survived dbench, fs_mark and an fsx torture
test.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   8 +
 fs/btrfs/ctree.h       |  23 ++-
 fs/btrfs/disk-io.c     |   2 +-
 fs/btrfs/extent-tree.c | 389 +++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/extent_io.c   |  92 +++++++++---
 fs/btrfs/extent_io.h   |  13 +-
 fs/btrfs/file.c        |  11 +-
 fs/btrfs/inode.c       | 224 ++++++++++++++++++++++++----
 fs/btrfs/ioctl.c       |  21 ++-
 fs/btrfs/transaction.c |  10 ++
 10 files changed, 678 insertions(+), 115 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 82ee56b..a54d354 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,14 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * These two counters are for delalloc metadata reservations.  We keep
+	 * track of how many extents we've accounted for vs how many extents we
+	 * have.
+	 */
+	int delalloc_reserved_extents;
+	int delalloc_extents;
+
+	/*
 	 * ordered_data_close is set by truncate when a file that used
 	 * to have good data has been truncated to zero.  When it is set
 	 * the btrfs file release call will add this inode to the
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4..b3959a1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -675,18 +675,19 @@ struct btrfs_space_info {
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
 	u64 bytes_super;	/* total bytes reserved for the super blocks */
-
-	/* delalloc accounting */
-	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
-				   this space is not necessarily reserved yet
-				   by the allocator */
+	u64 bytes_root;		/* the number of bytes needed to commit a
+				   transaction */
 	u64 bytes_may_use;	/* number of bytes that may be used for
-				   delalloc */
+				   delalloc/allocations */
+	u64 bytes_delalloc;	/* number of bytes currently reserved for
+				   delayed allocation */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
+	int force_delalloc;	/* make people start doing filemap_flush until
+				   we're under a threshold */
 
 	struct list_head list;
 
@@ -695,6 +696,9 @@ struct btrfs_space_info {
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
+
+	int allocating_chunk;
+	wait_queue_head_t wait;
 };
 
 /*
@@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+					  struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+					struct inode *inode, int num_items);
 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 				u64 bytes);
 void btrfs_free_reserved_data_space(struct btrfs_root *root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f1e905f..ece8d1e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1629,7 +1629,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
-	fs_info->metadata_ratio = 8;
+	fs_info->metadata_ratio = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 90d314e..a4b2b03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 			  struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+			    int dump_block_groups);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2764,67 +2766,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
 						       alloc_target);
 }
 
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+	u64 num_bytes;
+	int level;
+
+	level = BTRFS_MAX_LEVEL - 2;
+	/*
+	 * NOTE: these calculations are absolutely the worst possible case.
+	 * This assumes that _every_ item we insert will require a new leaf, and
+	 * that the tree has grown to its maximum level size.
+	 */
+
+	/*
+	 * for every item we insert we could insert both an extent item and a
+	 * extent ref item.  Then for ever item we insert, we will need to cow
+	 * both the original leaf, plus the leaf to the left and right of it.
+	 *
+	 * Unless we are talking about the extent root, then we just want the
+	 * number of items * 2, since we just need the extent item plus its ref.
+	 */
+	if (root == root->fs_info->extent_root)
+		num_bytes = num_items * 2;
+	else
+		num_bytes = (num_items + (2 * num_items)) * 3;
+
+	/*
+	 * num_bytes is total number of leaves we could need times the leaf
+	 * size, and then for every leaf we could end up cow'ing 2 nodes per
+	 * level, down to the leaf level.
+	 */
+	num_bytes = (num_bytes * root->leafsize) +
+		(num_bytes * (level * 2)) * root->nodesize;
+
+	return num_bytes;
+}
+
 /*
- * for now this just makes sure we have at least 5% of our metadata space free
- * for use.
+ * Unreserve metadata space for delalloc.  If we have less reserved credits than
+ * we have extents, this function does nothing.
  */
-int btrfs_check_metadata_free_space(struct btrfs_root *root)
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+					  struct inode *inode, int num_items)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_space_info *meta_sinfo;
-	u64 alloc_target, thresh;
-	int committed = 0, ret;
+	u64 num_bytes;
+	u64 alloc_target;
+	bool bug = false;
 
 	/* get the space info for where the metadata will live */
 	alloc_target = btrfs_get_alloc_profile(root, 0);
 	meta_sinfo = __find_space_info(info, alloc_target);
-	if (!meta_sinfo)
-		goto alloc;
 
-again:
+	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+					   num_items);
+
 	spin_lock(&meta_sinfo->lock);
-	if (!meta_sinfo->full)
-		thresh = meta_sinfo->total_bytes * 80;
-	else
-		thresh = meta_sinfo->total_bytes * 95;
+	if (BTRFS_I(inode)->delalloc_reserved_extents <=
+	    BTRFS_I(inode)->delalloc_extents) {
+		spin_unlock(&meta_sinfo->lock);
+		return 0;
+	}
+
+	BTRFS_I(inode)->delalloc_reserved_extents--;
+	BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+
+	if (meta_sinfo->bytes_delalloc < num_bytes) {
+		bug = true;
+		meta_sinfo->bytes_delalloc = 0;
+	} else {
+		meta_sinfo->bytes_delalloc -= num_bytes;
+	}
+	spin_unlock(&meta_sinfo->lock);
 
+	BUG_ON(bug);
+
+	return 0;
+}
+
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+	u64 thresh;
+
+	thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+		meta_sinfo->bytes_may_use;
+
+	thresh = meta_sinfo->total_bytes - thresh;
+	thresh *= 80;
 	do_div(thresh, 100);
+	if (thresh <= meta_sinfo->bytes_delalloc)
+		meta_sinfo->force_delalloc = 1;
+	else
+		meta_sinfo->force_delalloc = 0;
+}
 
-	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-	    meta_sinfo->bytes_super > thresh) {
-		struct btrfs_trans_handle *trans;
-		if (!meta_sinfo->full) {
-			meta_sinfo->force_alloc = 1;
+static int maybe_allocate_chunk(struct btrfs_root *root,
+				 struct btrfs_space_info *info)
+{
+	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_trans_handle *trans;
+	bool wait = false;
+	int ret = 0;
+	u64 min_metadata;
+	u64 free_space;
+
+	free_space = btrfs_super_total_bytes(disk_super);
+	/*
+	 * we allow the metadata to grow to a max of either 5gb or 5% of the
+	 * space in the volume.
+	 */
+	min_metadata = min((u64)5 * 1024 * 1024 * 1024,
+			     div64_u64(free_space * 5, 100));
+	if (info->total_bytes >= min_metadata) {
+		spin_unlock(&info->lock);
+		return 0;
+	}
+
+	if (info->full) {
+		spin_unlock(&info->lock);
+		return 0;
+	}
+
+	if (!info->allocating_chunk) {
+		info->force_alloc = 1;
+		info->allocating_chunk = 1;
+		init_waitqueue_head(&info->wait);
+	} else {
+		wait = true;
+	}
+
+	spin_unlock(&info->lock);
+
+	if (wait) {
+		wait_event(info->wait,
+			   !info->allocating_chunk);
+		return 1;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+	if (!trans) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+			     4096 + 2 * 1024 * 1024,
+			     info->flags, 0);
+	btrfs_end_transaction(trans, root);
+	if (ret)
+		goto out;
+out:
+	spin_lock(&info->lock);
+	info->allocating_chunk = 0;
+	spin_unlock(&info->lock);
+	wake_up(&info->wait);
+
+	if (ret)
+		return 0;
+	return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+					struct inode *inode, int num_items)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 num_bytes;
+	u64 used;
+	u64 alloc_target;
+	int flushed = 0;
+	int force_delalloc;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+	num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+					   num_items);
+again:
+	spin_lock(&meta_sinfo->lock);
+
+	force_delalloc = meta_sinfo->force_delalloc;
+
+	if (unlikely(!meta_sinfo->bytes_root))
+		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+	if (!flushed)
+		meta_sinfo->bytes_delalloc += num_bytes;
+
+	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+	if (used > meta_sinfo->total_bytes) {
+		flushed++;
+
+		if (flushed == 1) {
+			if (maybe_allocate_chunk(root, meta_sinfo))
+				goto again;
+			flushed++;
+		} else {
 			spin_unlock(&meta_sinfo->lock);
-alloc:
-			trans = btrfs_start_transaction(root, 1);
-			if (!trans)
-				return -ENOMEM;
+		}
 
-			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     2 * 1024 * 1024, alloc_target, 0);
-			btrfs_end_transaction(trans, root);
-			if (!meta_sinfo) {
-				meta_sinfo = __find_space_info(info,
-							       alloc_target);
-			}
+		if (flushed == 2) {
+			filemap_flush(inode->i_mapping);
+			goto again;
+		} else if (flushed == 3) {
+			btrfs_start_delalloc_inodes(root);
+			btrfs_wait_ordered_extents(root, 0);
 			goto again;
 		}
+		spin_lock(&meta_sinfo->lock);
+		meta_sinfo->bytes_delalloc -= num_bytes;
 		spin_unlock(&meta_sinfo->lock);
+		printk(KERN_ERR "enospc, has %d, reserved %d\n",
+		       BTRFS_I(inode)->delalloc_extents,
+		       BTRFS_I(inode)->delalloc_reserved_extents);
+		dump_space_info(meta_sinfo, 0, 0);
+		return -ENOSPC;
+	}
 
-		if (!committed) {
-			committed = 1;
-			trans = btrfs_join_transaction(root, 1);
-			if (!trans)
-				return -ENOMEM;
-			ret = btrfs_commit_transaction(trans, root);
-			if (ret)
-				return ret;
+	BTRFS_I(inode)->delalloc_reserved_extents++;
+	check_force_delalloc(meta_sinfo);
+	spin_unlock(&meta_sinfo->lock);
+
+	if (!flushed && force_delalloc)
+		filemap_flush(inode->i_mapping);
+
+	return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space.  This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 num_bytes;
+	u64 alloc_target;
+	bool bug = false;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+	num_bytes = calculate_bytes_needed(root, num_items);
+
+	spin_lock(&meta_sinfo->lock);
+	if (meta_sinfo->bytes_may_use < num_bytes) {
+		bug = true;
+		meta_sinfo->bytes_may_use = 0;
+	} else {
+		meta_sinfo->bytes_may_use -= num_bytes;
+	}
+	spin_unlock(&meta_sinfo->lock);
+
+	BUG_ON(bug);
+
+	return 0;
+}
+
+/*
+ * Reserve some metadata space for use.  We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items.  If we
+ * have space, fantastic, if not, you get -ENOSPC.  Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space.  THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 num_bytes;
+	u64 used;
+	u64 alloc_target;
+	int retries = 0;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+	num_bytes = calculate_bytes_needed(root, num_items);
+again:
+	spin_lock(&meta_sinfo->lock);
+
+	if (unlikely(!meta_sinfo->bytes_root))
+		meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+	if (!retries)
+		meta_sinfo->bytes_may_use += num_bytes;
+
+	used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+		meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+	if (used > meta_sinfo->total_bytes) {
+		retries++;
+		if (retries == 1) {
+			if (maybe_allocate_chunk(root, meta_sinfo))
+				goto again;
+			retries++;
+		} else {
+			spin_unlock(&meta_sinfo->lock);
+		}
+
+		if (retries == 2) {
+			btrfs_start_delalloc_inodes(root);
+			btrfs_wait_ordered_extents(root, 0);
 			goto again;
 		}
+		spin_lock(&meta_sinfo->lock);
+		meta_sinfo->bytes_may_use -= num_bytes;
+		spin_unlock(&meta_sinfo->lock);
+
+		dump_space_info(meta_sinfo, 0, 0);
 		return -ENOSPC;
 	}
+
+	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
 	return 0;
@@ -2915,7 +3196,7 @@ alloc:
 	BTRFS_I(inode)->reserved_bytes += bytes;
 	spin_unlock(&data_sinfo->lock);
 
-	return btrfs_check_metadata_free_space(root);
+	return 0;
 }
 
 /*
@@ -3014,17 +3295,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	BUG_ON(!space_info);
 
 	spin_lock(&space_info->lock);
-	if (space_info->force_alloc) {
+	if (space_info->force_alloc)
 		force = 1;
-		space_info->force_alloc = 0;
-	}
 	if (space_info->full) {
 		spin_unlock(&space_info->lock);
 		goto out;
 	}
 
 	thresh = space_info->total_bytes - space_info->bytes_readonly;
-	thresh = div_factor(thresh, 6);
+	thresh = div_factor(thresh, 8);
 	if (!force &&
 	   (space_info->bytes_used + space_info->bytes_pinned +
 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -3038,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	 * we keep a reasonable number of metadata chunks allocated in the
 	 * FS as well.
 	 */
-	if (flags & BTRFS_BLOCK_GROUP_DATA) {
+	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
 		fs_info->data_chunk_allocations++;
 		if (!(fs_info->data_chunk_allocations %
 		      fs_info->metadata_ratio))
@@ -3046,8 +3325,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
+	spin_lock(&space_info->lock);
 	if (ret)
 		space_info->full = 1;
+	space_info->force_alloc = 0;
+	spin_unlock(&space_info->lock);
 out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
@@ -4062,21 +4344,32 @@ loop:
 	return ret;
 }
 
-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+			    int dump_block_groups)
 {
 	struct btrfs_block_group_cache *cache;
 
+	spin_lock(&info->lock);
 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
-				    info->bytes_pinned - info->bytes_reserved),
+				    info->bytes_pinned - info->bytes_reserved -
+				    info->bytes_super),
 	       (info->full) ? "" : "not ");
 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
-	       " may_use=%llu, used=%llu\n",
+	       " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+	       "\n",
 	       (unsigned long long)info->total_bytes,
 	       (unsigned long long)info->bytes_pinned,
 	       (unsigned long long)info->bytes_delalloc,
 	       (unsigned long long)info->bytes_may_use,
-	       (unsigned long long)info->bytes_used);
+	       (unsigned long long)info->bytes_used,
+	       (unsigned long long)info->bytes_root,
+	       (unsigned long long)info->bytes_super,
+	       (unsigned long long)info->bytes_reserved);
+	spin_unlock(&info->lock);
+
+	if (!dump_block_groups)
+		return;
 
 	down_read(&info->groups_sem);
 	list_for_each_entry(cache, &info->block_groups, list) {
@@ -4144,7 +4437,7 @@ again:
 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
 		       "wanted %llu\n", (unsigned long long)data,
 		       (unsigned long long)num_bytes);
-		dump_space_info(sinfo, num_bytes);
+		dump_space_info(sinfo, num_bytes, 1);
 	}
 
 	return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0cb88f8..de1793b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
 	return NULL;
 }
 
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+		     struct extent_state *other)
+{
+	if (tree->ops && tree->ops->merge_extent_hook)
+		tree->ops->merge_extent_hook(tree->mapping->host, new,
+					     other);
+}
+
 /*
  * utility function to look for merge candidates inside a given range.
  * Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
 		other = rb_entry(other_node, struct extent_state, rb_node);
 		if (other->end == state->start - 1 &&
 		    other->state == state->state) {
+			merge_cb(tree, state, other);
 			state->start = other->start;
 			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
 		other = rb_entry(other_node, struct extent_state, rb_node);
 		if (other->start == state->end + 1 &&
 		    other->state == state->state) {
+			merge_cb(tree, state, other);
 			other->start = state->start;
 			state->tree = NULL;
 			rb_erase(&state->rb_node, &tree->state);
 			free_extent_state(state);
+			state = NULL;
 		}
 	}
+
 	return 0;
 }
 
-static void set_state_cb(struct extent_io_tree *tree,
+static int set_state_cb(struct extent_io_tree *tree,
 			 struct extent_state *state,
 			 unsigned long bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook) {
-		tree->ops->set_bit_hook(tree->mapping->host, state->start,
-					state->end, state->state, bits);
+		return tree->ops->set_bit_hook(tree->mapping->host,
+					       state->start, state->end,
+					       state->state, bits);
 	}
+
+	return 0;
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
 			   struct extent_state *state,
 			   unsigned long bits)
 {
-	if (tree->ops && tree->ops->clear_bit_hook) {
-		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
-					  state->end, state->state, bits);
-	}
+	if (tree->ops && tree->ops->clear_bit_hook)
+		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
 /*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
 			int bits)
 {
 	struct rb_node *node;
+	int ret;
 
 	if (end < start) {
 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
 		       (unsigned long long)start);
 		WARN_ON(1);
 	}
-	if (bits & EXTENT_DIRTY)
-		tree->dirty_bytes += end - start + 1;
 	state->start = start;
 	state->end = end;
-	set_state_cb(tree, state, bits);
+	ret = set_state_cb(tree, state, bits);
+	if (ret)
+		return ret;
+
+	if (bits & EXTENT_DIRTY)
+		tree->dirty_bytes += end - start + 1;
 	state->state |= bits;
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
 	return 0;
 }
 
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+		     u64 split)
+{
+	if (tree->ops && tree->ops->split_extent_hook)
+		return tree->ops->split_extent_hook(tree->mapping->host,
+						    orig, split);
+	return 0;
+}
+
 /*
  * split a given extent state struct in two, inserting the preallocated
  * struct 'prealloc' as the newly created second half.  'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 		       struct extent_state *prealloc, u64 split)
 {
 	struct rb_node *node;
+
+	split_cb(tree, orig, split);
+
 	prealloc->start = orig->start;
 	prealloc->end = split - 1;
 	prealloc->state = orig->state;
@@ -542,8 +571,8 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set |= clear_state_bit(tree, state, bits,
-					wake, delete);
+			set |= clear_state_bit(tree, state, bits, wake,
+					       delete);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
@@ -561,12 +590,11 @@ hit_next:
 			prealloc = alloc_extent_state(GFP_ATOMIC);
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
-
 		if (wake)
 			wake_up(&state->wq);
 
-		set |= clear_state_bit(tree, prealloc, bits,
-				       wake, delete);
+		set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
 		prealloc = NULL;
 		goto out;
 	}
@@ -667,16 +695,23 @@ out:
 	return 0;
 }
 
-static void set_state_bits(struct extent_io_tree *tree,
+static int set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
 			   int bits)
 {
+	int ret;
+
+	ret = set_state_cb(tree, state, bits);
+	if (ret)
+		return ret;
+
 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
-	set_state_cb(tree, state, bits);
 	state->state |= bits;
+
+	return 0;
 }
 
 static void cache_state(struct extent_state *state,
@@ -758,7 +793,10 @@ hit_next:
 			goto out;
 		}
 
-		set_state_bits(tree, state, bits);
+		err = set_state_bits(tree, state, bits);
+		if (err)
+			goto out;
+
 		cache_state(state, cached_state);
 		merge_state(tree, state);
 		if (last_end == (u64)-1)
@@ -805,7 +843,9 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set_state_bits(tree, state, bits);
+			err = set_state_bits(tree, state, bits);
+			if (err)
+				goto out;
 			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
@@ -829,11 +869,13 @@ hit_next:
 			this_end = last_start - 1;
 		err = insert_state(tree, prealloc, start, this_end,
 				   bits);
-		cache_state(prealloc, cached_state);
-		prealloc = NULL;
 		BUG_ON(err == -EEXIST);
-		if (err)
+		if (err) {
+			prealloc = NULL;
 			goto out;
+		}
+		cache_state(prealloc, cached_state);
+		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
 	}
@@ -852,7 +894,11 @@ hit_next:
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
-		set_state_bits(tree, prealloc, bits);
+		err = set_state_bits(tree, prealloc, bits);
+		if (err) {
+			prealloc = NULL;
+			goto out;
+		}
 		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 14ed16f..4794ec8 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -60,8 +60,13 @@ struct extent_io_ops {
 				      struct extent_state *state, int uptodate);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
-	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
-			    unsigned long old, unsigned long bits);
+	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+			      unsigned long bits);
+	int (*merge_extent_hook)(struct inode *inode,
+				 struct extent_state *new,
+				 struct extent_state *other);
+	int (*split_extent_hook)(struct inode *inode,
+				 struct extent_state *orig, u64 split);
 	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
@@ -79,10 +84,14 @@ struct extent_state {
 	u64 start;
 	u64 end; /* inclusive */
 	struct rb_node rb_node;
+
+	/* ADD NEW ELEMENTS AFTER THIS */
 	struct extent_io_tree *tree;
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
+	u64 split_start;
+	u64 split_end;
 
 	/* for use by the FS */
 	u64 private;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 571ad3c..1be96ba 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	if (err)
+		return err;
+
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
 		SetPageUptodate(p);
@@ -927,6 +930,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	err = file_remove_suid(file);
 	if (err)
 		goto out_nolock;
+
+	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	if (err)
+		goto out_nolock;
+
 	file_update_time(file);
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1028,6 +1036,7 @@ out:
 	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		err = ret;
+	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 
 out_nolock:
 	kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 01c5f8b..3cc5677 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1159,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	return ret;
 }
 
+static int btrfs_split_extent_hook(struct inode *inode,
+				    struct extent_state *orig, u64 split)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 size;
+
+	if (!(orig->state & EXTENT_DELALLOC))
+		return 0;
+
+	size = orig->end - orig->start + 1;
+	if (size > root->fs_info->max_extent) {
+		u64 num_extents;
+		u64 new_size;
+
+		new_size = orig->end - split + 1;
+		num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+					root->fs_info->max_extent);
+
+		/*
+		 * if we break a large extent up then leave delalloc_extents be,
+		 * since we've already accounted for the large extent.
+		 */
+		if (div64_u64(new_size + root->fs_info->max_extent - 1,
+			      root->fs_info->max_extent) < num_extents)
+			return 0;
+	}
+
+	BTRFS_I(inode)->delalloc_extents++;
+
+	return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+				   struct extent_state *new,
+				   struct extent_state *other)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 new_size, old_size;
+	u64 num_extents;
+
+	/* not delalloc, ignore it */
+	if (!(other->state & EXTENT_DELALLOC))
+		return 0;
+
+	old_size = other->end - other->start + 1;
+	if (new->start < other->start)
+		new_size = other->end - new->start + 1;
+	else
+		new_size = new->end - other->start + 1;
+
+	/* we're not bigger than the max, unreserve the space and go */
+	if (new_size <= root->fs_info->max_extent) {
+		BTRFS_I(inode)->delalloc_extents--;
+		return 0;
+	}
+
+	/*
+	 * If we grew by another max_extent, just return, we want to keep that
+	 * reserved amount.
+	 */
+	num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+				root->fs_info->max_extent);
+	if (div64_u64(new_size + root->fs_info->max_extent - 1,
+		      root->fs_info->max_extent) > num_extents)
+		return 0;
+
+	BTRFS_I(inode)->delalloc_extents--;
+
+	return 0;
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
@@ -1167,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 		       unsigned long old, unsigned long bits)
 {
+
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testeing for the DELALLOC
@@ -1174,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 */
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		BTRFS_I(inode)->delalloc_extents++;
 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1190,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 /*
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
-			 unsigned long old, unsigned long bits)
+static int btrfs_clear_bit_hook(struct inode *inode,
+				struct extent_state *state, unsigned long bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 
+		BTRFS_I(inode)->delalloc_extents--;
+		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+
 		spin_lock(&root->fs_info->delalloc_lock);
-		if (end - start + 1 > root->fs_info->delalloc_bytes) {
+		if (state->end - state->start + 1 >
+		    root->fs_info->delalloc_bytes) {
 			printk(KERN_INFO "btrfs warning: delalloc account "
 			       "%llu %llu\n",
-			       (unsigned long long)end - start + 1,
+			       (unsigned long long)
+			       state->end - state->start + 1,
 			       (unsigned long long)
 			       root->fs_info->delalloc_bytes);
 			btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1213,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
 			btrfs_delalloc_free_space(root, inode,
-						  end - start + 1);
-			root->fs_info->delalloc_bytes -= end - start + 1;
-			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+						  state->end -
+						  state->start + 1);
+			root->fs_info->delalloc_bytes -= state->end -
+				state->start + 1;
+			BTRFS_I(inode)->delalloc_bytes -= state->end -
+				state->start + 1;
 		}
 		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -2950,7 +3038,12 @@ again:
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	if (ret) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		goto out_unlock;
+	}
+
 	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
 		kaddr = kmap(page);
@@ -2981,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	u64 last_byte;
 	u64 cur_offset;
 	u64 hole_size;
-	int err;
+	int err = 0;
 
 	if (size <= hole_start)
 		return 0;
 
-	err = btrfs_check_metadata_free_space(root);
-	if (err)
-		return err;
-
 	btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
 	while (1) {
@@ -3024,12 +3113,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 						 cur_offset, &hint_byte, 1);
 			if (err)
 				break;
+
+			err = btrfs_reserve_metadata_space(root, 1);
+			if (err)
+				break;
+
 			err = btrfs_insert_file_extent(trans, root,
 					inode->i_ino, cur_offset, 0,
 					0, hole_size, 0, hole_size,
 					0, 0, 0);
 			btrfs_drop_extent_cache(inode, hole_start,
 					last_byte - 1, 0);
+			btrfs_unreserve_metadata_space(root, 1);
 		}
 		free_extent_map(em);
 		cur_offset = last_byte;
@@ -3990,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	err = btrfs_check_metadata_free_space(root);
+	/*
+	 * 2 for inode item and ref
+	 * 2 for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	err = btrfs_reserve_metadata_space(root, 5);
 	if (err)
-		goto fail;
+		return err;
 
 	trans = btrfs_start_transaction(root, 1);
+	if (!trans)
+		goto fail;
 	btrfs_set_trans_block_group(trans, dir);
 
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4032,6 +4134,7 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
+	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4052,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	u64 index = 0;
 
-	err = btrfs_check_metadata_free_space(root);
+	/*
+	 * 2 for inode item and ref
+	 * 2 for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	err = btrfs_reserve_metadata_space(root, 5);
 	if (err)
-		goto fail;
+		return err;
+
 	trans = btrfs_start_transaction(root, 1);
+	if (!trans)
+		goto fail;
 	btrfs_set_trans_block_group(trans, dir);
 
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -4096,6 +4207,7 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
+	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4118,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (inode->i_nlink == 0)
 		return -ENOENT;
 
-	btrfs_inc_nlink(inode);
-	err = btrfs_check_metadata_free_space(root);
+	/*
+	 * 1 item for inode ref
+	 * 2 items for dir items
+	 */
+	err = btrfs_reserve_metadata_space(root, 3);
 	if (err)
-		goto fail;
+		return err;
+
+	btrfs_inc_nlink(inode);
+
 	err = btrfs_set_inode_index(dir, &index);
 	if (err)
 		goto fail;
@@ -4145,6 +4263,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 fail:
+	btrfs_unreserve_metadata_space(root, 3);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4164,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	err = btrfs_check_metadata_free_space(root);
+	/*
+	 * 2 items for inode and ref
+	 * 2 items for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	err = btrfs_reserve_metadata_space(root, 5);
 	if (err)
-		goto out_unlock;
+		return err;
 
 	trans = btrfs_start_transaction(root, 1);
-	btrfs_set_trans_block_group(trans, dir);
-
-	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
+	if (!trans) {
+		err = -ENOMEM;
 		goto out_unlock;
 	}
+	btrfs_set_trans_block_group(trans, dir);
 
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
 	if (err) {
@@ -4223,6 +4346,7 @@ out_fail:
 	btrfs_end_transaction_throttle(trans, root);
 
 out_unlock:
+	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_on_err)
 		iput(inode);
 	btrfs_btree_balance_dirty(root, nr);
@@ -4747,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
+	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	if (ret) {
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+		ret = VM_FAULT_SIGBUS;
+		goto out;
+	}
+
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
 	lock_page(page);
@@ -4778,7 +4909,12 @@ again:
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	if (ret) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		ret = VM_FAULT_SIGBUS;
+		goto out_unlock;
+	}
 	ret = 0;
 
 	/* page is wholly or partially inside EOF */
@@ -4801,6 +4937,7 @@ again:
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
+	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	if (!ret)
 		return VM_FAULT_LOCKED;
 	unlock_page(page);
@@ -4917,6 +5054,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	ei->last_trans = 0;
 	ei->logged_trans = 0;
+	ei->delalloc_extents = 0;
+	ei->delalloc_reserved_extents = 0;
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
 	INIT_LIST_HEAD(&ei->ordered_operations);
@@ -5070,7 +5209,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
 
-	ret = btrfs_check_metadata_free_space(root);
+	/*
+	 * 2 items for dir items
+	 * 1 item for orphan entry
+	 * 1 item for ref
+	 */
+	ret = btrfs_reserve_metadata_space(root, 4);
 	if (ret)
 		return ret;
 
@@ -5185,6 +5329,8 @@ out_fail:
 
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
+
+	btrfs_unreserve_metadata_space(root, 4);
 	return ret;
 }
 
@@ -5256,11 +5402,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	err = btrfs_check_metadata_free_space(root);
+	/*
+	 * 2 items for inode item and ref
+	 * 2 items for dir items
+	 * 1 item for xattr if selinux is on
+	 */
+	err = btrfs_reserve_metadata_space(root, 5);
 	if (err)
-		goto out_fail;
+		return err;
 
 	trans = btrfs_start_transaction(root, 1);
+	if (!trans)
+		goto out_fail;
 	btrfs_set_trans_block_group(trans, dir);
 
 	err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5341,6 +5494,7 @@ out_unlock:
 	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 out_fail:
+	btrfs_unreserve_metadata_space(root, 5);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -5362,6 +5516,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 
 	while (num_bytes > 0) {
 		alloc_size = min(num_bytes, root->fs_info->max_extent);
+
+		ret = btrfs_reserve_metadata_space(root, 1);
+		if (ret)
+			goto out;
+
 		ret = btrfs_reserve_extent(trans, root, alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   (u64)-1, &ins, 1);
@@ -5381,6 +5540,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		alloc_hint = ins.objectid + ins.offset;
+		btrfs_unreserve_metadata_space(root, 1);
 	}
 out:
 	if (cur_offset > start) {
@@ -5566,6 +5726,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
+	.merge_extent_hook = btrfs_merge_extent_hook,
+	.split_extent_hook = btrfs_split_extent_hook,
 };
 
 /*
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a8577a7..4de7ef6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	ret = btrfs_check_metadata_free_space(root);
+	/*
+	 * 1 - inode item
+	 * 2 - refs
+	 * 1 - root item
+	 * 2 - dir items
+	 */
+	ret = btrfs_reserve_metadata_space(root, 6);
 	if (ret)
 		return ret;
 
@@ -340,6 +346,9 @@ fail:
 	err = btrfs_commit_transaction(trans, root);
 	if (err && !ret)
 		ret = err;
+
+	btrfs_unreserve_metadata_space(root, 6);
+	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
 
@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	ret = btrfs_check_metadata_free_space(root);
+	/*
+	 * 1 - inode item
+	 * 2 - refs
+	 * 1 - root item
+	 * 2 - dir items
+	 */
+	ret = btrfs_reserve_metadata_space(root, 6);
 	if (ret)
 		goto fail_unlock;
 
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot) {
 		ret = -ENOMEM;
+		btrfs_unreserve_metadata_space(root, 6);
 		goto fail_unlock;
 	}
 	pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
 	if (!pending_snapshot->name) {
 		ret = -ENOMEM;
 		kfree(pending_snapshot);
+		btrfs_unreserve_metadata_space(root, 6);
 		goto fail_unlock;
 	}
 	memcpy(pending_snapshot->name, name, namelen);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88f866f..0b8f36d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->alloc_exclude_start = 0;
 	h->delayed_ref_updates = 0;
 
+	if (!current->journal_info)
+		current->journal_info = h;
+
 	root->fs_info->running_transaction->use_count++;
 	record_root_in_trans(h, root);
 	mutex_unlock(&root->fs_info->trans_mutex);
@@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		wake_up(&cur_trans->writer_wait);
 	put_transaction(cur_trans);
 	mutex_unlock(&info->trans_mutex);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	memcpy(&pending->root_key, &key, sizeof(key));
 fail:
 	kfree(new_root_item);
+	btrfs_unreserve_metadata_space(root, 6);
 	return ret;
 }
 
@@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 	return ret;
 }
-- 
cgit v1.1


From 1f28fcd925b2b3157411bbd08f0024b55b70d8dd Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 28 Sep 2009 01:46:11 +0900
Subject: nilfs2: fix missing zero-fill initialization of btree node cache

This will fix file system corruption which infrequently happens after
mount.  The problem was reported from users with the title "[NILFS
users] Fail to mount NILFS." (Message-ID:
<200908211918.34720.yuri@itinteg.net>), and so forth.  I've also
experienced the corruption multiple times on kernel 2.6.30 and 2.6.31.

The problem turned out to be caused due to discordance between
mapping->nrpages of a btree node cache and the actual number of pages
hung on the cache; if the mapping->nrpages becomes zero even as it has
pages, truncate_inode_pages() returns without doing anything.  Usually
this is harmless except it may cause page leak, but garbage collection
fairly infrequently sees a stale page remained in the btree node cache
of DAT (i.e. disk address translation file of nilfs), and induces the
corruption.

I identified a missing initialization in btree node caches was the
root cause.  This corrects the bug.

I've tested this for kernel 2.6.30 and 2.6.31.

Reported-by: Yuri Chislov <yuri@itinteg.net>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: stable <stable@kernel.org>
---
 fs/nilfs2/btnode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 6a2711f..5941958 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -36,6 +36,7 @@
 
 void nilfs_btnode_cache_init_once(struct address_space *btnc)
 {
+	memset(btnc, 0, sizeof(*btnc));
 	INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
 	spin_lock_init(&btnc->tree_lock);
 	INIT_LIST_HEAD(&btnc->private_list);
-- 
cgit v1.1


From 3cc811bffdf35ebaf1467fbec71a49b57800fc74 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 28 Sep 2009 13:02:46 +0900
Subject: nilfs2: fix missing initialization of i_dir_start_lookup member

The i_dir_start_lookup field in nilfs_inode_info objects should be
cleared when the objects are allocated, but the the initialization was
missing in case of reading from disk.  This adds the initialization.

Since the variable just gives a start page on directory lookups, the
bug was nonfatal until now.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2d2c501..5040220 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -400,6 +400,7 @@ int nilfs_read_inode_common(struct inode *inode,
 	ii->i_dir_acl = S_ISREG(inode->i_mode) ?
 		0 : le32_to_cpu(raw_inode->i_dir_acl);
 #endif
+	ii->i_dir_start_lookup = 0;
 	ii->i_cno = 0;
 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
 
-- 
cgit v1.1


From 830156c79b0a99ddf0f62496bcf4de640f9f52cd Mon Sep 17 00:00:00 2001
From: Frank Mayhar <fmayhar@google.com>
Date: Tue, 29 Sep 2009 10:07:47 -0400
Subject: ext4: Avoid updating the inode table bh twice in no journal mode

This is a cleanup of commit 91ac6f4.  Since ext4_mark_inode_dirty()
has already called ext4_mark_iloc_dirty(), which in turn calls
ext4_do_update_inode(), it's not necessary to have ext4_write_inode()
call ext4_do_update_inode() in no journal mode.  Indeed, it would be
duplicated work.

Reviewed-by: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 118e16c..22fb1a3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4991,8 +4991,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
  */
 static int ext4_do_update_inode(handle_t *handle,
 				struct inode *inode,
-				struct ext4_iloc *iloc,
-				int do_sync)
+				struct ext4_iloc *iloc)
 {
 	struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -5093,22 +5092,10 @@ static int ext4_do_update_inode(handle_t *handle,
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
 	}
 
-	/*
-	 * If we're not using a journal and we were called from
-	 * ext4_write_inode() to sync the inode (making do_sync true),
-	 * we can just use sync_dirty_buffer() directly to do our dirty
-	 * work.  Testing s_journal here is a bit redundant but it's
-	 * worth it to avoid potential future trouble.
-	 */
-	if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
-		BUFFER_TRACE(bh, "call sync_dirty_buffer");
-		sync_dirty_buffer(bh);
-	} else {
-		BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-		rc = ext4_handle_dirty_metadata(handle, inode, bh);
-		if (!err)
-			err = rc;
-	}
+	BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+	rc = ext4_handle_dirty_metadata(handle, inode, bh);
+	if (!err)
+		err = rc;
 	ei->i_state &= ~EXT4_STATE_NEW;
 
 out_brelse:
@@ -5176,8 +5163,16 @@ int ext4_write_inode(struct inode *inode, int wait)
 		err = ext4_get_inode_loc(inode, &iloc);
 		if (err)
 			return err;
-		err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
-					   inode, &iloc, wait);
+		if (wait)
+			sync_dirty_buffer(iloc.bh);
+		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
+			ext4_error(inode->i_sb, __func__,
+				   "IO error syncing inode, "
+				   "inode=%lu, block=%llu",
+				   inode->i_ino,
+				   (unsigned long long)iloc.bh->b_blocknr);
+			err = -EIO;
+		}
 	}
 	return err;
 }
@@ -5473,7 +5468,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 	get_bh(iloc->bh);
 
 	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
-	err = ext4_do_update_inode(handle, inode, iloc, 0);
+	err = ext4_do_update_inode(handle, inode, iloc);
 	put_bh(iloc->bh);
 	return err;
 }
-- 
cgit v1.1


From f3dc272fd5e2ae08244796bb39e7e1ce4b25d3b3 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Tue, 29 Sep 2009 16:06:01 -0400
Subject: ext4: Make sure ext4_dirty_inode() updates the inode in no journal
 mode

This patch a problem that ext4_dirty_inode() was not calling
ext4_mark_inode_dirty() if the current_handle is not valid, which it
is the case in no journal mode.

It also removes a test for non-matching transaction which can never
happen.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22fb1a3..ec367bc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5615,24 +5615,13 @@ void ext4_dirty_inode(struct inode *inode)
 	handle_t *current_handle = ext4_journal_current_handle();
 	handle_t *handle;
 
-	if (!ext4_handle_valid(current_handle)) {
-		ext4_mark_inode_dirty(current_handle, inode);
-		return;
-	}
-
 	handle = ext4_journal_start(inode, 2);
 	if (IS_ERR(handle))
 		goto out;
-	if (current_handle &&
-		current_handle->h_transaction != handle->h_transaction) {
-		/* This task has a transaction open against a different fs */
-		printk(KERN_EMERG "%s: transactions do not match!\n",
-		       __func__);
-	} else {
-		jbd_debug(5, "marking dirty.  outer handle=%p\n",
-				current_handle);
-		ext4_mark_inode_dirty(handle, inode);
-	}
+
+	jbd_debug(5, "marking dirty.  outer handle=%p\n", current_handle);
+	ext4_mark_inode_dirty(handle, inode);
+
 	ext4_journal_stop(handle);
 out:
 	return;
-- 
cgit v1.1


From d3d1faf6a74496ea4435fd057c6a2cad49f3e523 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Tue, 29 Sep 2009 11:01:03 -0400
Subject: ext4: Handle nested ext4_journal_start/stop calls without a journal

This patch fixes a problem with handling nested calls to
ext4_journal_start/ext4_journal_stop, when there is no journal present.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_jbd2.h |  6 ++++--
 fs/ext4/namei.c     |  3 ++-
 fs/ext4/super.c     | 42 ++++++++++++++++++++++++++++++++----------
 3 files changed, 38 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 139fb8c..a286598 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
 
-#define EXT4_NOJOURNAL_HANDLE	((handle_t *) 0x1)
+#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
 
+/* Note:  Do not use this for NULL handles.  This is only to determine if
+ * a properly allocated handle is using a journal or not. */
 static inline int ext4_handle_valid(handle_t *handle)
 {
-	if (handle == EXT4_NOJOURNAL_HANDLE)
+	if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
 		return 0;
 	return 1;
 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 42f81d2..7c8fe80 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2076,7 +2076,8 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	struct ext4_iloc iloc;
 	int err = 0;
 
-	if (!ext4_handle_valid(handle))
+	/* ext4_handle_valid() assumes a valid handle_t pointer */
+	if (handle && !ext4_handle_valid(handle))
 		return 0;
 
 	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f095c60..3f7e701 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -189,6 +189,36 @@ void ext4_itable_unused_set(struct super_block *sb,
 		bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
 }
 
+
+/* Just increment the non-pointer handle value */
+static handle_t *ext4_get_nojournal(void)
+{
+	handle_t *handle = current->journal_info;
+	unsigned long ref_cnt = (unsigned long)handle;
+
+	BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
+
+	ref_cnt++;
+	handle = (handle_t *)ref_cnt;
+
+	current->journal_info = handle;
+	return handle;
+}
+
+
+/* Decrement the non-pointer handle value */
+static void ext4_put_nojournal(handle_t *handle)
+{
+	unsigned long ref_cnt = (unsigned long)handle;
+
+	BUG_ON(ref_cnt == 0);
+
+	ref_cnt--;
+	handle = (handle_t *)ref_cnt;
+
+	current->journal_info = handle;
+}
+
 /*
  * Wrappers for jbd2_journal_start/end.
  *
@@ -215,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 		}
 		return jbd2_journal_start(journal, nblocks);
 	}
-	/*
-	 * We're not journaling, return the appropriate indication.
-	 */
-	current->journal_info = EXT4_NOJOURNAL_HANDLE;
-	return current->journal_info;
+	return ext4_get_nojournal();
 }
 
 /*
@@ -235,11 +261,7 @@ int __ext4_journal_stop(const char *where, handle_t *handle)
 	int rc;
 
 	if (!ext4_handle_valid(handle)) {
-		/*
-		 * Do this here since we don't call jbd2_journal_stop() in
-		 * no-journal mode.
-		 */
-		current->journal_info = NULL;
+		ext4_put_nojournal(handle);
 		return 0;
 	}
 	sb = handle->h_transaction->t_journal->j_private;
-- 
cgit v1.1


From 49cf6f4529b7945ef51b8e39f0bac630726f8c96 Mon Sep 17 00:00:00 2001
From: Chris Ball <cjb@laptop.org>
Date: Tue, 29 Sep 2009 13:51:04 -0400
Subject: Btrfs: Fix setting umask when POSIX ACLs are not enabled

We currently set sb->s_flags |= MS_POSIXACL unconditionally, which is
incorrect -- it tells the VFS that it shouldn't set umask because we
will, yet we don't set it ourselves if we aren't using POSIX ACLs, so
the umask ends up ignored.

Signed-off-by: Chris Ball <cjb@laptop.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0242c8b..e0a6432 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -344,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_export_op = &btrfs_export_ops;
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
+#endif
 
 	tree_root = open_ctree(sb, fs_devices, (char *)data);
 
-- 
cgit v1.1


From fd2696f399e45347c07f1f7f340e8515cace5657 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Tue, 29 Sep 2009 13:51:04 -0400
Subject: Btrfs: introduce missing kfree

Error handling code following a kzalloc should free the allocated data.

The semantic match that finds the problem is as follows:
(http://www.emn.fr/x-info/coccinelle/)

// <smpl>
@r exists@
local idexpression x;
statement S;
expression E;
identifier f,f1,l;
position p1,p2;
expression *ptr != NULL;
@@

x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...);
...
if (x == NULL) S
<... when != x
     when != if (...) { <+...x...+> }
(
x->f1 = E
|
 (x->f1 == NULL || ...)
|
 f(...,x->f1,...)
)
...>
(
 return \(0\|<+...x...+>\|ptr\);
|
 return@p2 ...;
)

@script:python@
p1 << r.p1;
p2 << r.p2;
@@

print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line)
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index be953af..20cbd2e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 			goto error;
 
 		device->name = kstrdup(orig_dev->name, GFP_NOFS);
-		if (!device->name)
+		if (!device->name) {
+			kfree(device);
 			goto error;
+		}
 
 		device->devid = orig_dev->devid;
 		device->work.func = pending_bios_fn;
-- 
cgit v1.1


From 3baf0bed0a5adab95c7599d2f27124c74692ef28 Mon Sep 17 00:00:00 2001
From: Chris Ball <cjb@laptop.org>
Date: Tue, 29 Sep 2009 13:51:05 -0400
Subject: Btrfs: Use CONFIG_BTRFS_POSIX_ACL to enable ACL code

We've already defined CONFIG_BTRFS_POSIX_ACL in Kconfig, but we're
currently not using it and are testing CONFIG_FS_POSIX_ACL instead.
CONFIG_FS_POSIX_ACL states "Never use this symbol for ifdefs".

Signed-off-by: Chris Ball <cjb@laptop.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   | 6 +++---
 fs/btrfs/ctree.h | 2 +-
 fs/btrfs/xattr.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427..69b355a 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
 
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.set	= btrfs_xattr_acl_access_set,
 };
 
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_POSIX_ACL */
 
 int btrfs_acl_chmod(struct inode *inode)
 {
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 	return 0;
 }
 
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_POSIX_ACL */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b3959a1..8184f2f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2366,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4..b0fc93f 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
  * attributes are handled directly.
  */
 struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
 #endif
-- 
cgit v1.1


From 90576c0b9a0b5323fc4bd7f23f49be0d234f36d1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 29 Sep 2009 15:51:30 -0400
Subject: ext4, jbd2: Drop unneeded printks at mount and unmount time

There are a number of kernel printk's which are printed when an ext4
filesystem is mounted and unmounted.  Disable them to economize space
in the system logs.  In addition, disabling the mballoc stats by
default saves a number of unneeded atomic operations for every block
allocation or deallocation.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c |  2 ++
 fs/ext4/mballoc.c |  2 --
 fs/ext4/mballoc.h |  2 +-
 fs/ext4/super.c   | 17 ++++-------------
 fs/jbd2/journal.c |  9 +++------
 5 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 10a6309..10539e3 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2380,6 +2380,7 @@ void ext4_ext_init(struct super_block *sb)
 	 */
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
 		printk(KERN_INFO "EXT4-fs: file extents enabled");
 #ifdef AGGRESSIVE_TEST
 		printk(", aggressive tests");
@@ -2391,6 +2392,7 @@ void ext4_ext_init(struct super_block *sb)
 		printk(", stats");
 #endif
 		printk("\n");
+#endif
 #ifdef EXTENTS_STATS
 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
 		EXT4_SB(sb)->s_ext_min = 1 << 30;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c73d439..3e2320e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2712,8 +2712,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	if (sbi->s_journal)
 		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
-	printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
 	return 0;
 }
 
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 188d3d7..14f25f2 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -84,7 +84,7 @@ extern u8 mb_enable_debug;
  * with 'ext4_mb_stats' allocator will collect stats that will be
  * shown at umount. The collecting costs though!
  */
-#define MB_DEFAULT_STATS		1
+#define MB_DEFAULT_STATS		0
 
 /*
  * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3f7e701..e5b206a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1673,13 +1673,6 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			EXT4_INODES_PER_GROUP(sb),
 			sbi->s_mount_opt);
 
-	if (EXT4_SB(sb)->s_journal) {
-		ext4_msg(sb, KERN_INFO, "%s journal on %s",
-		       EXT4_SB(sb)->s_journal->j_inode ? "internal" :
-		       "external", EXT4_SB(sb)->s_journal->j_devname);
-	} else {
-		ext4_msg(sb, KERN_INFO, "no journal");
-	}
 	return res;
 }
 
@@ -2885,12 +2878,12 @@ no_journal:
 			 "available");
 	}
 
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+	if (test_opt(sb, DELALLOC) &&
+	    (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
 		ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
 			 "requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
-	} else if (test_opt(sb, DELALLOC))
-		ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+	}
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
@@ -3202,9 +3195,7 @@ static int ext4_load_journal(struct super_block *sb,
 			return -EINVAL;
 	}
 
-	if (journal->j_flags & JBD2_BARRIER)
-		ext4_msg(sb, KERN_INFO, "barriers enabled");
-	else
+	if (!(journal->j_flags & JBD2_BARRIER))
 		ext4_msg(sb, KERN_INFO, "barriers disabled");
 
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 53b86e1..977a8da 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -136,10 +136,6 @@ static int kjournald2(void *arg)
 	journal->j_task = current;
 	wake_up(&journal->j_wait_done_commit);
 
-	printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, "
-	       "commit interval %ld seconds\n", current->pid,
-	       journal->j_devname, journal->j_commit_interval / HZ);
-
 	/*
 	 * And now, wait forever for commit wakeup events.
 	 */
@@ -223,7 +219,8 @@ static int jbd2_journal_start_thread(journal_t *journal)
 {
 	struct task_struct *t;
 
-	t = kthread_run(kjournald2, journal, "kjournald2");
+	t = kthread_run(kjournald2, journal, "jbd2/%s",
+			journal->j_devname);
 	if (IS_ERR(t))
 		return PTR_ERR(t);
 
@@ -1115,7 +1112,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
 	while ((p = strchr(p, '/')))
 		*p = '!';
 	p = journal->j_devname + strlen(journal->j_devname);
-	sprintf(p, ":%lu", journal->j_inode->i_ino);
+	sprintf(p, "-%lu", journal->j_inode->i_ino);
 	jbd_debug(1,
 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
 		  journal, inode->i_sb->s_id, inode->i_ino,
-- 
cgit v1.1


From 1ab86aedbc7845a946b4ba4edf37762629970708 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 29 Sep 2009 18:38:44 -0400
Subject: Btrfs: fix error cases for ioctl transactions

Fix leak of vfsmount write reference and open_ioctl_trans reference on
ENOMEM.  Clean up the error paths while we're at it.

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4de7ef6..9a780c8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1232,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	int ret = 0;
+	int ret;
 
+	ret = -EPERM;
 	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+		goto out;
 
-	if (file->private_data) {
-		ret = -EINPROGRESS;
+	ret = -EINPROGRESS;
+	if (file->private_data)
 		goto out;
-	}
 
 	ret = mnt_want_write(file->f_path.mnt);
 	if (ret)
@@ -1250,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
 	root->fs_info->open_ioctl_trans++;
 	mutex_unlock(&root->fs_info->trans_mutex);
 
+	ret = -ENOMEM;
 	trans = btrfs_start_ioctl_transaction(root, 0);
-	if (trans)
-		file->private_data = trans;
-	else
-		ret = -ENOMEM;
-	/*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+	if (!trans)
+		goto out_drop;
+
+	file->private_data = trans;
+	return 0;
+
+out_drop:
+	mutex_lock(&root->fs_info->trans_mutex);
+	root->fs_info->open_ioctl_trans--;
+	mutex_unlock(&root->fs_info->trans_mutex);
+	mnt_drop_write(file->f_path.mnt);
 out:
 	return ret;
 }
@@ -1271,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file)
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
-	int ret = 0;
 
 	trans = file->private_data;
-	if (!trans) {
-		ret = -EINVAL;
-		goto out;
-	}
-	btrfs_end_transaction(trans, root);
+	if (!trans)
+		return -EINVAL;
 	file->private_data = NULL;
 
+	btrfs_end_transaction(trans, root);
+
 	mutex_lock(&root->fs_info->trans_mutex);
 	root->fs_info->open_ioctl_trans--;
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	mnt_drop_write(file->f_path.mnt);
-
-out:
-	return ret;
+	return 0;
 }
 
 long btrfs_ioctl(struct file *file, unsigned int
-- 
cgit v1.1


From dd7e0b7b02ccff73b87032e20fc5b4f2c1cfcc14 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 29 Sep 2009 18:38:44 -0400
Subject: Btrfs: fix deadlock with free space handling and user transactions

If an ioctl-initiated transaction is open, we can't force a commit during
the free space checks in order to free up pinned extents or else we
deadlock.  Just ENOSPC instead.

A more satisfying solution that reserves space for the entire user
transaction up front is forthcoming...

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a4b2b03..d119c03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3168,7 +3168,7 @@ alloc:
 		spin_unlock(&data_sinfo->lock);
 
 		/* commit the current transaction and try again */
-		if (!committed) {
+		if (!committed && !root->fs_info->open_ioctl_trans) {
 			committed = 1;
 			trans = btrfs_join_transaction(root, 1);
 			if (!trans)
-- 
cgit v1.1


From 296c355cd6443d89fa251885a8d78778fe111dc4 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 30 Sep 2009 00:32:42 -0400
Subject: ext4: Use tracepoints for mb_history trace file

The /proc/fs/ext4/<dev>/mb_history was maintained manually, and had a
number of problems: it required a largish amount of memory to be
allocated for each ext4 filesystem, and the s_mb_history_lock
introduced a CPU contention problem.

By ripping out the mb_history code and replacing it with ftrace
tracepoints, and we get more functionality: timestamps, event
filtering, the ability to correlate mballoc history with other ext4
tracepoints, etc.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  14 ++-
 fs/ext4/mballoc.c | 301 +++---------------------------------------------------
 fs/ext4/mballoc.h |  33 ------
 fs/ext4/super.c   |  18 +---
 4 files changed, 19 insertions(+), 347 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b491576..c508cf7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
 
+/*
+ * Flags used in mballoc's allocation_context flags field.  
+ *
+ * Also used to show what's going on for debugging purposes when the
+ * flag field is exported via the traceport interface
+ */
 
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		0x0001
@@ -971,14 +977,6 @@ struct ext4_sb_info {
 	unsigned long s_mb_last_group;
 	unsigned long s_mb_last_start;
 
-	/* history to debug policy */
-	struct ext4_mb_history *s_mb_history;
-	int s_mb_history_cur;
-	int s_mb_history_max;
-	int s_mb_history_num;
-	spinlock_t s_mb_history_lock;
-	int s_mb_history_filter;
-
 	/* stats for buddy allocator */
 	spinlock_t s_mb_pa_lock;
 	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3e2320e..bba1282 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2096,207 +2096,6 @@ out:
 	return err;
 }
 
-#ifdef EXT4_MB_HISTORY
-struct ext4_mb_proc_session {
-	struct ext4_mb_history *history;
-	struct super_block *sb;
-	int start;
-	int max;
-};
-
-static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s,
-					struct ext4_mb_history *hs,
-					int first)
-{
-	if (hs == s->history + s->max)
-		hs = s->history;
-	if (!first && hs == s->history + s->start)
-		return NULL;
-	while (hs->orig.fe_len == 0) {
-		hs++;
-		if (hs == s->history + s->max)
-			hs = s->history;
-		if (hs == s->history + s->start)
-			return NULL;
-	}
-	return hs;
-}
-
-static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
-	struct ext4_mb_proc_session *s = seq->private;
-	struct ext4_mb_history *hs;
-	int l = *pos;
-
-	if (l == 0)
-		return SEQ_START_TOKEN;
-	hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1);
-	if (!hs)
-		return NULL;
-	while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL);
-	return hs;
-}
-
-static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v,
-				      loff_t *pos)
-{
-	struct ext4_mb_proc_session *s = seq->private;
-	struct ext4_mb_history *hs = v;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return ext4_mb_history_skip_empty(s, s->history + s->start, 1);
-	else
-		return ext4_mb_history_skip_empty(s, ++hs, 0);
-}
-
-static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
-{
-	char buf[25], buf2[25], buf3[25], *fmt;
-	struct ext4_mb_history *hs = v;
-
-	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
-				"%-5s %-2s %-6s %-5s %-5s %-6s\n",
-			  "pid", "inode", "original", "goal", "result", "found",
-			   "grps", "cr", "flags", "merge", "tail", "broken");
-		return 0;
-	}
-
-	if (hs->op == EXT4_MB_HISTORY_ALLOC) {
-		fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
-			"0x%04x %-5s %-5u %-6u\n";
-		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
-			hs->result.fe_start, hs->result.fe_len,
-			hs->result.fe_logical);
-		sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
-			hs->orig.fe_start, hs->orig.fe_len,
-			hs->orig.fe_logical);
-		sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group,
-			hs->goal.fe_start, hs->goal.fe_len,
-			hs->goal.fe_logical);
-		seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2,
-				hs->found, hs->groups, hs->cr, hs->flags,
-				hs->merged ? "M" : "", hs->tail,
-				hs->buddy ? 1 << hs->buddy : 0);
-	} else if (hs->op == EXT4_MB_HISTORY_PREALLOC) {
-		fmt = "%-5u %-8u %-23s %-23s %-23s\n";
-		sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
-			hs->result.fe_start, hs->result.fe_len,
-			hs->result.fe_logical);
-		sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group,
-			hs->orig.fe_start, hs->orig.fe_len,
-			hs->orig.fe_logical);
-		seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2);
-	} else if (hs->op == EXT4_MB_HISTORY_DISCARD) {
-		sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
-			hs->result.fe_start, hs->result.fe_len);
-		seq_printf(seq, "%-5u %-8u %-23s discard\n",
-				hs->pid, hs->ino, buf2);
-	} else if (hs->op == EXT4_MB_HISTORY_FREE) {
-		sprintf(buf2, "%u/%d/%u", hs->result.fe_group,
-			hs->result.fe_start, hs->result.fe_len);
-		seq_printf(seq, "%-5u %-8u %-23s free\n",
-				hs->pid, hs->ino, buf2);
-	}
-	return 0;
-}
-
-static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_mb_seq_history_ops = {
-	.start  = ext4_mb_seq_history_start,
-	.next   = ext4_mb_seq_history_next,
-	.stop   = ext4_mb_seq_history_stop,
-	.show   = ext4_mb_seq_history_show,
-};
-
-static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
-{
-	struct super_block *sb = PDE(inode)->data;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_mb_proc_session *s;
-	int rc;
-	int size;
-
-	if (unlikely(sbi->s_mb_history == NULL))
-		return -ENOMEM;
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (s == NULL)
-		return -ENOMEM;
-	s->sb = sb;
-	size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max;
-	s->history = kmalloc(size, GFP_KERNEL);
-	if (s->history == NULL) {
-		kfree(s);
-		return -ENOMEM;
-	}
-
-	spin_lock(&sbi->s_mb_history_lock);
-	memcpy(s->history, sbi->s_mb_history, size);
-	s->max = sbi->s_mb_history_max;
-	s->start = sbi->s_mb_history_cur % s->max;
-	spin_unlock(&sbi->s_mb_history_lock);
-
-	rc = seq_open(file, &ext4_mb_seq_history_ops);
-	if (rc == 0) {
-		struct seq_file *m = (struct seq_file *)file->private_data;
-		m->private = s;
-	} else {
-		kfree(s->history);
-		kfree(s);
-	}
-	return rc;
-
-}
-
-static int ext4_mb_seq_history_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq = (struct seq_file *)file->private_data;
-	struct ext4_mb_proc_session *s = seq->private;
-	kfree(s->history);
-	kfree(s);
-	return seq_release(inode, file);
-}
-
-static ssize_t ext4_mb_seq_history_write(struct file *file,
-				const char __user *buffer,
-				size_t count, loff_t *ppos)
-{
-	struct seq_file *seq = (struct seq_file *)file->private_data;
-	struct ext4_mb_proc_session *s = seq->private;
-	struct super_block *sb = s->sb;
-	char str[32];
-	int value;
-
-	if (count >= sizeof(str)) {
-		printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n",
-				"mb_history", (int)sizeof(str));
-		return -EOVERFLOW;
-	}
-
-	if (copy_from_user(str, buffer, count))
-		return -EFAULT;
-
-	value = simple_strtol(str, NULL, 0);
-	if (value < 0)
-		return -ERANGE;
-	EXT4_SB(sb)->s_mb_history_filter = value;
-
-	return count;
-}
-
-static const struct file_operations ext4_mb_seq_history_fops = {
-	.owner		= THIS_MODULE,
-	.open		= ext4_mb_seq_history_open,
-	.read		= seq_read,
-	.write		= ext4_mb_seq_history_write,
-	.llseek		= seq_lseek,
-	.release	= ext4_mb_seq_history_release,
-};
-
 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 {
 	struct super_block *sb = seq->private;
@@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
 	.release	= seq_release,
 };
 
-static void ext4_mb_history_release(struct super_block *sb)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (sbi->s_proc != NULL) {
-		remove_proc_entry("mb_groups", sbi->s_proc);
-		if (sbi->s_mb_history_max)
-			remove_proc_entry("mb_history", sbi->s_proc);
-	}
-	kfree(sbi->s_mb_history);
-}
-
-static void ext4_mb_history_init(struct super_block *sb)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	int i;
-
-	if (sbi->s_proc != NULL) {
-		if (sbi->s_mb_history_max)
-			proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
-					 &ext4_mb_seq_history_fops, sb);
-		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
-				 &ext4_mb_seq_groups_fops, sb);
-	}
-
-	sbi->s_mb_history_cur = 0;
-	spin_lock_init(&sbi->s_mb_history_lock);
-	i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-	sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
-	/* if we can't allocate history, then we simple won't use it */
-}
-
-static noinline_for_stack void
-ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
-	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
-	struct ext4_mb_history h;
-
-	if (sbi->s_mb_history == NULL)
-		return;
-
-	if (!(ac->ac_op & sbi->s_mb_history_filter))
-		return;
-
-	h.op = ac->ac_op;
-	h.pid = current->pid;
-	h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0;
-	h.orig = ac->ac_o_ex;
-	h.result = ac->ac_b_ex;
-	h.flags = ac->ac_flags;
-	h.found = ac->ac_found;
-	h.groups = ac->ac_groups_scanned;
-	h.cr = ac->ac_criteria;
-	h.tail = ac->ac_tail;
-	h.buddy = ac->ac_buddy;
-	h.merged = 0;
-	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) {
-		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
-				ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
-			h.merged = 1;
-		h.goal = ac->ac_g_ex;
-		h.result = ac->ac_f_ex;
-	}
-
-	spin_lock(&sbi->s_mb_history_lock);
-	memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h));
-	if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max)
-		sbi->s_mb_history_cur = 0;
-	spin_unlock(&sbi->s_mb_history_lock);
-}
-
-#else
-#define ext4_mb_history_release(sb)
-#define ext4_mb_history_init(sb)
-#endif
-
 
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
@@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	sbi->s_mb_stats = MB_DEFAULT_STATS;
 	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
 	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
-	sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
 	sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
 
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -2708,7 +2430,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 		spin_lock_init(&lg->lg_prealloc_lock);
 	}
 
-	ext4_mb_history_init(sb);
+	if (sbi->s_proc)
+		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
+				 &ext4_mb_seq_groups_fops, sb);
 
 	if (sbi->s_journal)
 		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
@@ -2788,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb)
 	}
 
 	free_percpu(sbi->s_locality_groups);
-	ext4_mb_history_release(sb);
+	if (sbi->s_proc)
+		remove_proc_entry("mb_groups", sbi->s_proc);
 
 	return 0;
 }
@@ -3274,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 			atomic_inc(&sbi->s_bal_breaks);
 	}
 
-	ext4_mb_store_history(ac);
+	if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
+		trace_ext4_mballoc_alloc(ac);
+	else
+		trace_ext4_mballoc_prealloc(ac);
 }
 
 /*
@@ -3774,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	if (ac) {
 		ac->ac_sb = sb;
 		ac->ac_inode = pa->pa_inode;
-		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
 	}
 
 	while (bit < end) {
@@ -3794,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			ac->ac_b_ex.fe_start = bit;
 			ac->ac_b_ex.fe_len = next - bit;
 			ac->ac_b_ex.fe_logical = 0;
-			ext4_mb_store_history(ac);
+			trace_ext4_mballoc_discard(ac);
 		}
 
 		trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit,
@@ -3829,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	if (ac)
-		ac->ac_op = EXT4_MB_HISTORY_DISCARD;
-
 	trace_ext4_mb_release_group_pa(ac, pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
@@ -3846,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
 		ac->ac_b_ex.fe_start = bit;
 		ac->ac_b_ex.fe_len = pa->pa_len;
 		ac->ac_b_ex.fe_logical = 0;
-		ext4_mb_store_history(ac);
+		trace_ext4_mballoc_discard(ac);
 	}
 
 	return 0;
@@ -4737,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (ac) {
-		ac->ac_op = EXT4_MB_HISTORY_FREE;
 		ac->ac_inode = inode;
 		ac->ac_sb = sb;
 	}
@@ -4804,7 +4527,7 @@ do_more:
 		ac->ac_b_ex.fe_group = block_group;
 		ac->ac_b_ex.fe_start = bit;
 		ac->ac_b_ex.fe_len = count;
-		ext4_mb_store_history(ac);
+		trace_ext4_mballoc_free(ac);
 	}
 
 	err = ext4_mb_load_buddy(sb, block_group, &e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 14f25f2..0ca8110 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -52,18 +52,8 @@ extern u8 mb_enable_debug;
 #define mb_debug(n, fmt, a...)
 #endif
 
-/*
- * with EXT4_MB_HISTORY mballoc stores last N allocations in memory
- * and you can monitor it in /proc/fs/ext4/<dev>/mb_history
- */
-#define EXT4_MB_HISTORY
 #define EXT4_MB_HISTORY_ALLOC		1	/* allocation */
 #define EXT4_MB_HISTORY_PREALLOC	2	/* preallocated blocks used */
-#define EXT4_MB_HISTORY_DISCARD		4	/* preallocation discarded */
-#define EXT4_MB_HISTORY_FREE		8	/* free */
-
-#define EXT4_MB_HISTORY_DEFAULT		(EXT4_MB_HISTORY_ALLOC | \
-					 EXT4_MB_HISTORY_PREALLOC)
 
 /*
  * How long mballoc can look for a best extent (in found extents)
@@ -217,22 +207,6 @@ struct ext4_allocation_context {
 #define AC_STATUS_FOUND		2
 #define AC_STATUS_BREAK		3
 
-struct ext4_mb_history {
-	struct ext4_free_extent orig;	/* orig allocation */
-	struct ext4_free_extent goal;	/* goal allocation */
-	struct ext4_free_extent result;	/* result allocation */
-	unsigned pid;
-	unsigned ino;
-	__u16 found;	/* how many extents have been found */
-	__u16 groups;	/* how many groups have been scanned */
-	__u16 tail;	/* what tail broke some buddy */
-	__u16 buddy;	/* buddy the tail ^^^ broke */
-	__u16 flags;
-	__u8 cr:3;	/* which phase the result extent was found at */
-	__u8 op:4;
-	__u8 merged:1;
-};
-
 struct ext4_buddy {
 	struct page *bd_buddy_page;
 	void *bd_buddy;
@@ -247,13 +221,6 @@ struct ext4_buddy {
 #define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
 
-#ifndef EXT4_MB_HISTORY
-static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
-{
-	return;
-}
-#endif
-
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5b206a..12e726a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -50,13 +50,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/ext4.h>
 
-static int default_mb_history_length = 1000;
-
-module_param_named(default_mb_history_length, default_mb_history_length,
-		   int, 0644);
-MODULE_PARM_DESC(default_mb_history_length,
-		 "Default number of entries saved for mb_history");
-
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 
@@ -1079,7 +1072,7 @@ enum {
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
+	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1126,7 +1119,6 @@ static const match_table_t tokens = {
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_err_abort, "data_err=abort"},
 	{Opt_data_err_ignore, "data_err=ignore"},
-	{Opt_mb_history_length, "mb_history_length=%u"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1367,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_data_err_ignore:
 			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
 			break;
-		case Opt_mb_history_length:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			sbi->s_mb_history_max = option;
-			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -2435,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-	sbi->s_mb_history_max = default_mb_history_length;
 
 	set_opt(sbi->s_mount_opt, BARRIER);
 
-- 
cgit v1.1


From bf6993276f74d46776f35c45ddef29b981b1d1c6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 30 Sep 2009 00:32:06 -0400
Subject: jbd2: Use tracepoints for history file

The /proc/fs/jbd2/<dev>/history was maintained manually; by using
tracepoints, we can get all of the existing functionality of the /proc
file plus extra capabilities thanks to the ftrace infrastructure.  We
save memory as a bonus.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c |   7 ++
 fs/jbd2/commit.c     |  59 ++++++++--------
 fs/jbd2/journal.c    | 187 +++------------------------------------------------
 3 files changed, 45 insertions(+), 208 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5d70b3e..ca0f5eb 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -643,6 +643,7 @@ out:
 
 int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 {
+	struct transaction_chp_stats_s *stats;
 	transaction_t *transaction;
 	journal_t *journal;
 	int ret = 0;
@@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 
 	/* OK, that was the last buffer for the transaction: we can now
 	   safely remove this transaction from the log */
+	stats = &transaction->t_chp_stats;
+	if (stats->cs_chp_time)
+		stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time,
+						    jiffies);
+	trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev,
+				    transaction->t_tid, stats);
 
 	__jbd2_journal_drop_transaction(journal, transaction);
 	kfree(transaction);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 26d991d..d4cfd6d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	if (commit_transaction->t_synchronous_commit)
 		write_op = WRITE_SYNC_PLUG;
 	trace_jbd2_commit_locking(journal, commit_transaction);
-	stats.u.run.rs_wait = commit_transaction->t_max_wait;
-	stats.u.run.rs_locked = jiffies;
-	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
-						stats.u.run.rs_locked);
+	stats.run.rs_wait = commit_transaction->t_max_wait;
+	stats.run.rs_locked = jiffies;
+	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
+					      stats.run.rs_locked);
 
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
@@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd2_journal_switch_revoke_table(journal);
 
 	trace_jbd2_commit_flushing(journal, commit_transaction);
-	stats.u.run.rs_flushing = jiffies;
-	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
-					       stats.u.run.rs_flushing);
+	stats.run.rs_flushing = jiffies;
+	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
+					     stats.run.rs_flushing);
 
 	commit_transaction->t_state = T_FLUSH;
 	journal->j_committing_transaction = commit_transaction;
@@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_unlock(&journal->j_state_lock);
 
 	trace_jbd2_commit_logging(journal, commit_transaction);
-	stats.u.run.rs_logging = jiffies;
-	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
-						 stats.u.run.rs_logging);
-	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
-	stats.u.run.rs_blocks_logged = 0;
+	stats.run.rs_logging = jiffies;
+	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
+					       stats.run.rs_logging);
+	stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
+	stats.run.rs_blocks_logged = 0;
 
 	J_ASSERT(commit_transaction->t_nr_buffers <=
 		 commit_transaction->t_outstanding_credits);
@@ -695,7 +695,7 @@ start_journal_io:
 				submit_bh(write_op, bh);
 			}
 			cond_resched();
-			stats.u.run.rs_blocks_logged += bufs;
+			stats.run.rs_blocks_logged += bufs;
 
 			/* Force a new descriptor to be generated next
                            time round the loop. */
@@ -988,33 +988,30 @@ restart_loop:
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 
 	commit_transaction->t_start = jiffies;
-	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
-						commit_transaction->t_start);
+	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
+					      commit_transaction->t_start);
 
 	/*
-	 * File the transaction for history
+	 * File the transaction statistics
 	 */
-	stats.ts_type = JBD2_STATS_RUN;
 	stats.ts_tid = commit_transaction->t_tid;
-	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
-	spin_lock(&journal->j_history_lock);
-	memcpy(journal->j_history + journal->j_history_cur, &stats,
-			sizeof(stats));
-	if (++journal->j_history_cur == journal->j_history_max)
-		journal->j_history_cur = 0;
+	stats.run.rs_handle_count = commit_transaction->t_handle_count;
+	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
+			     commit_transaction->t_tid, &stats.run);
 
 	/*
 	 * Calculate overall stats
 	 */
+	spin_lock(&journal->j_history_lock);
 	journal->j_stats.ts_tid++;
-	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
-	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
-	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
-	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
-	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
-	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
-	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
-	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
+	journal->j_stats.run.rs_wait += stats.run.rs_wait;
+	journal->j_stats.run.rs_running += stats.run.rs_running;
+	journal->j_stats.run.rs_locked += stats.run.rs_locked;
+	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+	journal->j_stats.run.rs_logging += stats.run.rs_logging;
+	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
 	spin_unlock(&journal->j_history_lock);
 
 	commit_transaction->t_state = T_FINISHED;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 977a8da..761af77 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -676,153 +676,6 @@ struct jbd2_stats_proc_session {
 	int max;
 };
 
-static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s,
-					struct transaction_stats_s *ts,
-					int first)
-{
-	if (ts == s->stats + s->max)
-		ts = s->stats;
-	if (!first && ts == s->stats + s->start)
-		return NULL;
-	while (ts->ts_type == 0) {
-		ts++;
-		if (ts == s->stats + s->max)
-			ts = s->stats;
-		if (ts == s->stats + s->start)
-			return NULL;
-	}
-	return ts;
-
-}
-
-static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos)
-{
-	struct jbd2_stats_proc_session *s = seq->private;
-	struct transaction_stats_s *ts;
-	int l = *pos;
-
-	if (l == 0)
-		return SEQ_START_TOKEN;
-	ts = jbd2_history_skip_empty(s, s->stats + s->start, 1);
-	if (!ts)
-		return NULL;
-	l--;
-	while (l) {
-		ts = jbd2_history_skip_empty(s, ++ts, 0);
-		if (!ts)
-			break;
-		l--;
-	}
-	return ts;
-}
-
-static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct jbd2_stats_proc_session *s = seq->private;
-	struct transaction_stats_s *ts = v;
-
-	++*pos;
-	if (v == SEQ_START_TOKEN)
-		return jbd2_history_skip_empty(s, s->stats + s->start, 1);
-	else
-		return jbd2_history_skip_empty(s, ++ts, 0);
-}
-
-static int jbd2_seq_history_show(struct seq_file *seq, void *v)
-{
-	struct transaction_stats_s *ts = v;
-	if (v == SEQ_START_TOKEN) {
-		seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s "
-				"%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid",
-				"wait", "run", "lock", "flush", "log", "hndls",
-				"block", "inlog", "ctime", "write", "drop",
-				"close");
-		return 0;
-	}
-	if (ts->ts_type == JBD2_STATS_RUN)
-		seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u "
-				"%-6lu %-5lu %-5lu\n", "R", ts->ts_tid,
-				jiffies_to_msecs(ts->u.run.rs_wait),
-				jiffies_to_msecs(ts->u.run.rs_running),
-				jiffies_to_msecs(ts->u.run.rs_locked),
-				jiffies_to_msecs(ts->u.run.rs_flushing),
-				jiffies_to_msecs(ts->u.run.rs_logging),
-				ts->u.run.rs_handle_count,
-				ts->u.run.rs_blocks,
-				ts->u.run.rs_blocks_logged);
-	else if (ts->ts_type == JBD2_STATS_CHECKPOINT)
-		seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n",
-				"C", ts->ts_tid, " ",
-				jiffies_to_msecs(ts->u.chp.cs_chp_time),
-				ts->u.chp.cs_written, ts->u.chp.cs_dropped,
-				ts->u.chp.cs_forced_to_close);
-	else
-		J_ASSERT(0);
-	return 0;
-}
-
-static void jbd2_seq_history_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations jbd2_seq_history_ops = {
-	.start  = jbd2_seq_history_start,
-	.next   = jbd2_seq_history_next,
-	.stop   = jbd2_seq_history_stop,
-	.show   = jbd2_seq_history_show,
-};
-
-static int jbd2_seq_history_open(struct inode *inode, struct file *file)
-{
-	journal_t *journal = PDE(inode)->data;
-	struct jbd2_stats_proc_session *s;
-	int rc, size;
-
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
-	if (s == NULL)
-		return -ENOMEM;
-	size = sizeof(struct transaction_stats_s) * journal->j_history_max;
-	s->stats = kmalloc(size, GFP_KERNEL);
-	if (s->stats == NULL) {
-		kfree(s);
-		return -ENOMEM;
-	}
-	spin_lock(&journal->j_history_lock);
-	memcpy(s->stats, journal->j_history, size);
-	s->max = journal->j_history_max;
-	s->start = journal->j_history_cur % s->max;
-	spin_unlock(&journal->j_history_lock);
-
-	rc = seq_open(file, &jbd2_seq_history_ops);
-	if (rc == 0) {
-		struct seq_file *m = file->private_data;
-		m->private = s;
-	} else {
-		kfree(s->stats);
-		kfree(s);
-	}
-	return rc;
-
-}
-
-static int jbd2_seq_history_release(struct inode *inode, struct file *file)
-{
-	struct seq_file *seq = file->private_data;
-	struct jbd2_stats_proc_session *s = seq->private;
-
-	kfree(s->stats);
-	kfree(s);
-	return seq_release(inode, file);
-}
-
-static struct file_operations jbd2_seq_history_fops = {
-	.owner		= THIS_MODULE,
-	.open           = jbd2_seq_history_open,
-	.read           = seq_read,
-	.llseek         = seq_lseek,
-	.release        = jbd2_seq_history_release,
-};
-
 static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
 {
 	return *pos ? NULL : SEQ_START_TOKEN;
@@ -839,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 
 	if (v != SEQ_START_TOKEN)
 		return 0;
-	seq_printf(seq, "%lu transaction, each upto %u blocks\n",
+	seq_printf(seq, "%lu transaction, each up to %u blocks\n",
 			s->stats->ts_tid,
 			s->journal->j_max_transaction_buffers);
 	if (s->stats->ts_tid == 0)
 		return 0;
 	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
-	    jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid));
+	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
 	seq_printf(seq, "  %ums running transaction\n",
-	    jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid));
+	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
 	seq_printf(seq, "  %ums transaction was being locked\n",
-	    jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid));
+	    jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
 	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
-	    jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid));
+	    jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
 	seq_printf(seq, "  %ums logging transaction\n",
-	    jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid));
+	    jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
 	seq_printf(seq, "  %lluus average transaction commit time\n",
 		   div_u64(s->journal->j_average_commit_time, 1000));
 	seq_printf(seq, "  %lu handles per transaction\n",
-	    s->stats->u.run.rs_handle_count / s->stats->ts_tid);
+	    s->stats->run.rs_handle_count / s->stats->ts_tid);
 	seq_printf(seq, "  %lu blocks per transaction\n",
-	    s->stats->u.run.rs_blocks / s->stats->ts_tid);
+	    s->stats->run.rs_blocks / s->stats->ts_tid);
 	seq_printf(seq, "  %lu logged blocks per transaction\n",
-	    s->stats->u.run.rs_blocks_logged / s->stats->ts_tid);
+	    s->stats->run.rs_blocks_logged / s->stats->ts_tid);
 	return 0;
 }
 
@@ -931,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal)
 {
 	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
 	if (journal->j_proc_entry) {
-		proc_create_data("history", S_IRUGO, journal->j_proc_entry,
-				 &jbd2_seq_history_fops, journal);
 		proc_create_data("info", S_IRUGO, journal->j_proc_entry,
 				 &jbd2_seq_info_fops, journal);
 	}
@@ -941,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
 static void jbd2_stats_proc_exit(journal_t *journal)
 {
 	remove_proc_entry("info", journal->j_proc_entry);
-	remove_proc_entry("history", journal->j_proc_entry);
 	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
 }
 
-static void journal_init_stats(journal_t *journal)
-{
-	int size;
-
-	if (!proc_jbd2_stats)
-		return;
-
-	journal->j_history_max = 100;
-	size = sizeof(struct transaction_stats_s) * journal->j_history_max;
-	journal->j_history = kzalloc(size, GFP_KERNEL);
-	if (!journal->j_history) {
-		journal->j_history_max = 0;
-		return;
-	}
-	spin_lock_init(&journal->j_history_lock);
-}
-
 /*
  * Management for journal control blocks: functions to create and
  * destroy journal_t structures, and to initialise and read existing
@@ -1006,7 +839,7 @@ static journal_t * journal_init_common (void)
 		goto fail;
 	}
 
-	journal_init_stats(journal);
+	spin_lock_init(&journal->j_history_lock);
 
 	return journal;
 fail:
-- 
cgit v1.1


From c1fccc0696bcaff6008c11865091f5ec4b0937ab Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 30 Sep 2009 01:13:55 -0400
Subject: ext4: Fix time encoding with extra epoch bits

"Looking at ext4.h, I think the setting of extra time fields forgets to
mask the epoch bits so the epoch part overwrites nsec part. The second
change is only for coherency (2 -> EXT4_EPOCH_BITS)."

Thanks to Damien Guibouret for pointing out this problem.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c508cf7..984ca0c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -525,8 +525,8 @@ struct move_extent {
 static inline __le32 ext4_encode_extra_time(struct timespec *time)
 {
        return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
-			   time->tv_sec >> 32 : 0) |
-			   ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
+			   (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
+                          ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
 }
 
 static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
@@ -534,7 +534,7 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
        if (sizeof(time->tv_sec) > 4)
 	       time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
 			       << 32;
-       time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
+       time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
 }
 
 #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode)			       \
-- 
cgit v1.1


From 04bedd79a7037ee7af816b06c60c738144475c4a Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Fri, 18 Sep 2009 14:31:47 -0500
Subject: dlm: fix lowcomms_connect_node for sctp

The recently added dlm_lowcomms_connect_node() from
391fbdc5d527149578490db2f1619951d91f3561 does not work
when using SCTP instead of TCP.  The sctp connection code
has nothing to do without data to send.  Check for no data
in the sctp connection code and do nothing instead of
triggering a BUG.  Also have connect_node() do nothing
when the protocol is sctp.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 240cef1..a3350e4 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -316,6 +316,10 @@ int dlm_lowcomms_connect_node(int nodeid)
 {
 	struct connection *con;
 
+	/* with sctp there's no connecting without sending */
+	if (dlm_config.ci_protocol != 0)
+		return 0;
+
 	if (nodeid == dlm_our_nodeid())
 		return 0;
 
@@ -855,11 +859,14 @@ static void sctp_init_assoc(struct connection *con)
 	outmessage.msg_flags = MSG_EOR;
 
 	spin_lock(&con->writequeue_lock);
-	e = list_entry(con->writequeue.next, struct writequeue_entry,
-		       list);
 
-	BUG_ON((struct list_head *) e == &con->writequeue);
+	if (list_empty(&con->writequeue)) {
+		spin_unlock(&con->writequeue_lock);
+		log_print("writequeue empty for nodeid %d", con->nodeid);
+		return;
+	}
 
+	e = list_first_entry(&con->writequeue, struct writequeue_entry, list);
 	len = e->len;
 	offset = e->offset;
 	spin_unlock(&con->writequeue_lock);
-- 
cgit v1.1


From 6861f350785bf476c2d4e3b9cb69ee36b78df2fc Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Thu, 24 Sep 2009 15:58:23 -0500
Subject: dlm: fix socket fd translation

The code to set up sctp sockets was not using the sockfd_lookup()
and sockfd_put() routines to translate an fd to a socket.  The
direct fget and fput calls were resulting in error messages from
alloc_fd().

Also clean up two log messages and remove a third, related to
setting up sctp associations.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lowcomms.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index a3350e4..70736eb 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -459,9 +459,9 @@ static void process_sctp_notification(struct connection *con,
 			int prim_len, ret;
 			int addr_len;
 			struct connection *new_con;
-			struct file *file;
 			sctp_peeloff_arg_t parg;
 			int parglen = sizeof(parg);
+			int err;
 
 			/*
 			 * We get this before any data for an association.
@@ -516,19 +516,22 @@ static void process_sctp_notification(struct connection *con,
 			ret = kernel_getsockopt(con->sock, IPPROTO_SCTP,
 						SCTP_SOCKOPT_PEELOFF,
 						(void *)&parg, &parglen);
-			if (ret) {
+			if (ret < 0) {
 				log_print("Can't peel off a socket for "
-					  "connection %d to node %d: err=%d\n",
+					  "connection %d to node %d: err=%d",
 					  parg.associd, nodeid, ret);
+				return;
+			}
+			new_con->sock = sockfd_lookup(parg.sd, &err);
+			if (!new_con->sock) {
+				log_print("sockfd_lookup error %d", err);
+				return;
 			}
-			file = fget(parg.sd);
-			new_con->sock = SOCKET_I(file->f_dentry->d_inode);
 			add_sock(new_con->sock, new_con);
-			fput(file);
-			put_unused_fd(parg.sd);
+			sockfd_put(new_con->sock);
 
-			log_print("got new/restarted association %d nodeid %d",
-				 (int)sn->sn_assoc_change.sac_assoc_id, nodeid);
+			log_print("connecting to %d sctp association %d",
+				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
 			/* Send any pending writes */
 			clear_bit(CF_CONNECT_PENDING, &new_con->flags);
@@ -841,8 +844,6 @@ static void sctp_init_assoc(struct connection *con)
 	if (con->retries++ > MAX_CONNECT_RETRIES)
 		return;
 
-	log_print("Initiating association with node %d", con->nodeid);
-
 	if (nodeid_to_addr(con->nodeid, (struct sockaddr *)&rem_addr)) {
 		log_print("no address for nodeid %d", con->nodeid);
 		return;
-- 
cgit v1.1


From 1f94533d9cd75f6d2826018d54a971b9cc085992 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 30 Sep 2009 22:57:41 -0400
Subject: ext4: fix a BUG_ON crash by checking that page has buffers attached
 to it

In ext4_num_dirty_pages() we were calling page_buffers() before
checking to see if the page actually had pages attached to it; this
would cause a BUG check crash in the inline function page_buffers().

Thanks to Markus Trippelsdorf for reporting this bug.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ec367bc..6e65d0e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1146,8 +1146,8 @@ static int check_block_validity(struct inode *inode, const char *msg,
 }
 
 /*
- * Return the number of dirty pages in the given inode starting at
- * page frame idx.
+ * Return the number of contiguous dirty pages in a given inode
+ * starting at page frame idx.
  */
 static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 				    unsigned int max_pages)
@@ -1181,15 +1181,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 				unlock_page(page);
 				break;
 			}
-			head = page_buffers(page);
-			bh = head;
-			do {
-				if (!buffer_delay(bh) &&
-				    !buffer_unwritten(bh)) {
-					done = 1;
-					break;
-				}
-			} while ((bh = bh->b_this_page) != head);
+			if (page_has_buffers(page)) {
+				bh = head = page_buffers(page);
+				do {
+					if (!buffer_delay(bh) &&
+					    !buffer_unwritten(bh))
+						done = 1;
+					bh = bh->b_this_page;
+				} while (!done && (bh != head));
+			}
 			unlock_page(page);
 			if (done)
 				break;
-- 
cgit v1.1


From f0e2dfa7f3e1361ca8fc91c25e67fc4e92613cc9 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 1 Oct 2009 02:21:07 -0400
Subject: ext4: drop ext4dev compat

Kconfig & super.c promised it'd be gone by 2.6.31, so it's
about time to drop it.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Kconfig | 14 --------------
 fs/ext4/super.c | 31 -------------------------------
 2 files changed, 45 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index d5c0ea2..9f2d45d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,20 +26,6 @@ config EXT4_FS
 
 	  If unsure, say N.
 
-config EXT4DEV_COMPAT
-	bool "Enable ext4dev compatibility"
-	depends on EXT4_FS
-	help
-	  Starting with 2.6.28, the name of the ext4 filesystem was
-	  renamed from ext4dev to ext4.  Unfortunately there are some
-	  legacy userspace programs (such as klibc's fstype) have
-	  "ext4dev" hardcoded.
-
-	  To enable backwards compatibility so that systems that are
-	  still expecting to mount ext4 filesystems using ext4dev,
-	  choose Y here.   This feature will go away by 2.6.31, so
-	  please arrange to get your userspace programs fixed!
-
 config EXT4_FS_XATTR
 	bool "Ext4 extended attributes"
 	depends on EXT4_FS
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 12e726a..312211e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3966,27 +3966,6 @@ static struct file_system_type ext4_fs_type = {
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
-#ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
-			  const char *dev_name, void *data,struct vfsmount *mnt)
-{
-	printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
-	       "to mount using ext4\n", dev_name);
-	printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
-	       "will go away by 2.6.31\n", dev_name);
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
-}
-
-static struct file_system_type ext4dev_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ext4dev",
-	.get_sb		= ext4dev_get_sb,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-MODULE_ALIAS("ext4dev");
-#endif
-
 static int __init init_ext4_fs(void)
 {
 	int err;
@@ -4011,13 +3990,6 @@ static int __init init_ext4_fs(void)
 	err = register_filesystem(&ext4_fs_type);
 	if (err)
 		goto out;
-#ifdef CONFIG_EXT4DEV_COMPAT
-	err = register_filesystem(&ext4dev_fs_type);
-	if (err) {
-		unregister_filesystem(&ext4_fs_type);
-		goto out;
-	}
-#endif
 	return 0;
 out:
 	destroy_inodecache();
@@ -4036,9 +4008,6 @@ out4:
 static void __exit exit_ext4_fs(void)
 {
 	unregister_filesystem(&ext4_fs_type);
-#ifdef CONFIG_EXT4DEV_COMPAT
-	unregister_filesystem(&ext4dev_fs_type);
-#endif
 	destroy_inodecache();
 	exit_ext4_xattr();
 	exit_ext4_mballoc();
-- 
cgit v1.1


From 35d62a942db5ae03104929fe7397835b572c4bc4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 30 Sep 2009 16:47:08 -0400
Subject: Btrfs: fix arguments to btrfs_wait_on_page_writeback_range

wait_on_page_writeback_range/btrfs_wait_on_page_writeback_range takes
a pagecache offset, not a byte offset into the file.  Shift the arguments
around to wait for the correct range

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ece8d1e..69dce50 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -829,7 +829,9 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
 	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-				  buf->start, buf->start + buf->len - 1);
+				  buf->start >> PAGE_CACHE_SHIFT,
+				  (buf->start + buf->len - 1) >>
+				   PAGE_CACHE_SHIFT);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-- 
cgit v1.1


From ab93dbecfba72bbc04b7036343d180aaff1b61a3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 1 Oct 2009 12:29:10 -0400
Subject: Btrfs: take i_mutex before generic_write_checks

btrfs_file_write was incorrectly calling generic_write_checks without
taking i_mutex.  This lead to problems with racing around i_size when
doing O_APPEND writes.

The fix here is to move i_mutex higher.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1be96ba..f155179 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -920,26 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	start_pos = pos;
 
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	/* do the reserve before the mutex lock in case we have to do some
+	 * flushing.  We wouldn't deadlock, but this is more polite.
+	 */
+	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	if (err)
+		goto out_nolock;
+
+	mutex_lock(&inode->i_mutex);
+
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
-		goto out_nolock;
+		goto out;
+
 	if (count == 0)
-		goto out_nolock;
+		goto out;
 
 	err = file_remove_suid(file);
 	if (err)
-		goto out_nolock;
-
-	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (err)
-		goto out_nolock;
+		goto out;
 
 	file_update_time(file);
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
-	mutex_lock(&inode->i_mutex);
+	/* generic_write_checks can change our pos */
+	start_pos = pos;
+
 	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
-- 
cgit v1.1


From 8aa38c31b7659e338fee4d9af4c3805acbd9806f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Oct 2009 12:58:30 -0400
Subject: Btrfs: remove duplicates of filemap_ helpers

Use filemap_fdatawrite_range and filemap_fdatawait_range instead of
local copies of the functions.  For filemap_fdatawait_range that
also means replacing the awkward old wait_on_page_writeback_range
calling convention with the regular filemap byte offsets.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c      | 10 +++---
 fs/btrfs/file.c         |  5 ++-
 fs/btrfs/ordered-data.c | 93 +++----------------------------------------------
 fs/btrfs/ordered-data.h |  4 ---
 4 files changed, 10 insertions(+), 102 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d20dc05..af0435f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -822,16 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-	return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
-				      buf->start + buf->len - 1, WB_SYNC_ALL);
+	return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+					buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
-				  buf->start >> PAGE_CACHE_SHIFT,
-				  (buf->start + buf->len - 1) >>
-				   PAGE_CACHE_SHIFT);
+	return filemap_fdatawait_range(buf->first_page->mapping,
+				       buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7351bdb..ca784a7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1022,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 
 		if (will_write) {
-			btrfs_fdatawrite_range(inode->i_mapping, pos,
-					       pos + write_bytes - 1,
-					       WB_SYNC_ALL);
+			filemap_fdatawrite_range(inode->i_mapping, pos,
+						 pos + write_bytes - 1);
 		} else {
 			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
 							   num_pages);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b5d6d24..897fba8 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -458,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
 	 * start IO on any dirty ones so the wait doesn't stall waiting
 	 * for pdflush to find them
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
+	filemap_fdatawrite_range(inode->i_mapping, start, end);
 	if (wait) {
 		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
 						 &entry->flags));
@@ -488,17 +488,15 @@ again:
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
 
 	/* The compression code will leave pages locked but return from
 	 * writepage without setting the page writeback.  Starting again
 	 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
 	 */
-	btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
 
-	btrfs_wait_on_page_writeback_range(inode->i_mapping,
-					   start >> PAGE_CACHE_SHIFT,
-					   orig_end >> PAGE_CACHE_SHIFT);
+	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
 
 	end = orig_end;
 	found = 0;
@@ -716,89 +714,6 @@ out:
 }
 
 
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
- * @mapping:	address space structure to write
- * @start:	offset in bytes where the range starts
- * @end:	offset in bytes where the range ends (inclusive)
- * @sync_mode:	enable synchronous operation
- *
- * Start writeback against all of a mapping's dirty pages that lie
- * within the byte offsets <start, end> inclusive.
- *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback.  The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
- */
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
-			   loff_t end, int sync_mode)
-{
-	struct writeback_control wbc = {
-		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
-		.range_start = start,
-		.range_end = end,
-	};
-	return btrfs_writepages(mapping, &wbc);
-}
-
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping:	target address_space
- * @start:	beginning page index
- * @end:	ending page index
- *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
- */
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
-				       pgoff_t start, pgoff_t end)
-{
-	struct pagevec pvec;
-	int nr_pages;
-	int ret = 0;
-	pgoff_t index;
-
-	if (end < start)
-		return 0;
-
-	pagevec_init(&pvec, 0);
-	index = start;
-	while ((index <= end) &&
-			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
-			PAGECACHE_TAG_WRITEBACK,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
-		unsigned i;
-
-		for (i = 0; i < nr_pages; i++) {
-			struct page *page = pvec.pages[i];
-
-			/* until radix tree lookup accepts end_index */
-			if (page->index > end)
-				continue;
-
-			wait_on_page_writeback(page);
-			if (PageError(page))
-				ret = -EIO;
-		}
-		pagevec_release(&pvec);
-		cond_resched();
-	}
-
-	/* Check for outstanding write errors */
-	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
-		ret = -ENOSPC;
-	if (test_and_clear_bit(AS_EIO, &mapping->flags))
-		ret = -EIO;
-
-	return ret;
-}
-
 /*
  * add a given inode to the list of inodes that must be fully on
  * disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 993a7ea..f82e874 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -153,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 int btrfs_ordered_update_i_size(struct inode *inode,
 				struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
-				       pgoff_t start, pgoff_t end);
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
-			   loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
 int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-- 
cgit v1.1


From a112a71d45b5e40c3cf07371d20a4a5079a72610 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hsweeten@visionengravers.com>
Date: Sat, 26 Sep 2009 16:19:21 +0200
Subject: fs/bio.c: move EXPORT* macros to line after function

As mentioned in Documentation/CodingStyle, move EXPORT* macro's
to the line immediately after the closing function brace line.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/bio.c b/fs/bio.c
index 7673800..402cb84 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -249,6 +249,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 
 	mempool_free(p, bs->bio_pool);
 }
+EXPORT_SYMBOL(bio_free);
 
 void bio_init(struct bio *bio)
 {
@@ -257,6 +258,7 @@ void bio_init(struct bio *bio)
 	bio->bi_comp_cpu = -1;
 	atomic_set(&bio->bi_cnt, 1);
 }
+EXPORT_SYMBOL(bio_init);
 
 /**
  * bio_alloc_bioset - allocate a bio for I/O
@@ -311,6 +313,7 @@ err_free:
 	mempool_free(p, bs->bio_pool);
 	return NULL;
 }
+EXPORT_SYMBOL(bio_alloc_bioset);
 
 static void bio_fs_destructor(struct bio *bio)
 {
@@ -337,6 +340,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_alloc);
 
 static void bio_kmalloc_destructor(struct bio *bio)
 {
@@ -380,6 +384,7 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_kmalloc);
 
 void zero_fill_bio(struct bio *bio)
 {
@@ -416,6 +421,7 @@ void bio_put(struct bio *bio)
 		bio->bi_destructor(bio);
 	}
 }
+EXPORT_SYMBOL(bio_put);
 
 inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 {
@@ -424,6 +430,7 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
 
 	return bio->bi_phys_segments;
 }
+EXPORT_SYMBOL(bio_phys_segments);
 
 /**
  * 	__bio_clone	-	clone a bio
@@ -451,6 +458,7 @@ void __bio_clone(struct bio *bio, struct bio *bio_src)
 	bio->bi_size = bio_src->bi_size;
 	bio->bi_idx = bio_src->bi_idx;
 }
+EXPORT_SYMBOL(__bio_clone);
 
 /**
  *	bio_clone	-	clone a bio
@@ -482,6 +490,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 
 	return b;
 }
+EXPORT_SYMBOL(bio_clone);
 
 /**
  *	bio_get_nr_vecs		- return approx number of vecs
@@ -505,6 +514,7 @@ int bio_get_nr_vecs(struct block_device *bdev)
 
 	return nr_pages;
 }
+EXPORT_SYMBOL(bio_get_nr_vecs);
 
 static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 			  *page, unsigned int len, unsigned int offset,
@@ -635,6 +645,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 	return __bio_add_page(q, bio, page, len, offset,
 			      queue_max_hw_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_pc_page);
 
 /**
  *	bio_add_page	-	attempt to add page to bio
@@ -655,6 +666,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
+EXPORT_SYMBOL(bio_add_page);
 
 struct bio_map_data {
 	struct bio_vec *iovecs;
@@ -776,6 +788,7 @@ int bio_uncopy_user(struct bio *bio)
 	bio_put(bio);
 	return ret;
 }
+EXPORT_SYMBOL(bio_uncopy_user);
 
 /**
  *	bio_copy_user_iov	-	copy user data to bio
@@ -920,6 +933,7 @@ struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
 
 	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_copy_user);
 
 static struct bio *__bio_map_user_iov(struct request_queue *q,
 				      struct block_device *bdev,
@@ -1050,6 +1064,7 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
 
 	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
 }
+EXPORT_SYMBOL(bio_map_user);
 
 /**
  *	bio_map_user_iov - map user sg_iovec table into bio
@@ -1117,13 +1132,13 @@ void bio_unmap_user(struct bio *bio)
 	__bio_unmap_user(bio);
 	bio_put(bio);
 }
+EXPORT_SYMBOL(bio_unmap_user);
 
 static void bio_map_kern_endio(struct bio *bio, int err)
 {
 	bio_put(bio);
 }
 
-
 static struct bio *__bio_map_kern(struct request_queue *q, void *data,
 				  unsigned int len, gfp_t gfp_mask)
 {
@@ -1189,6 +1204,7 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
 	bio_put(bio);
 	return ERR_PTR(-EINVAL);
 }
+EXPORT_SYMBOL(bio_map_kern);
 
 static void bio_copy_kern_endio(struct bio *bio, int err)
 {
@@ -1250,6 +1266,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
 
 	return bio;
 }
+EXPORT_SYMBOL(bio_copy_kern);
 
 /*
  * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
@@ -1400,6 +1417,7 @@ void bio_endio(struct bio *bio, int error)
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
+EXPORT_SYMBOL(bio_endio);
 
 void bio_pair_release(struct bio_pair *bp)
 {
@@ -1410,6 +1428,7 @@ void bio_pair_release(struct bio_pair *bp)
 		mempool_free(bp, bp->bio2.bi_private);
 	}
 }
+EXPORT_SYMBOL(bio_pair_release);
 
 static void bio_pair_end_1(struct bio *bi, int err)
 {
@@ -1477,6 +1496,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 
 	return bp;
 }
+EXPORT_SYMBOL(bio_split);
 
 /**
  *      bio_sector_offset - Find hardware sector offset in bio
@@ -1547,6 +1567,7 @@ void bioset_free(struct bio_set *bs)
 
 	kfree(bs);
 }
+EXPORT_SYMBOL(bioset_free);
 
 /**
  * bioset_create  - Create a bio_set
@@ -1592,6 +1613,7 @@ bad:
 	bioset_free(bs);
 	return NULL;
 }
+EXPORT_SYMBOL(bioset_create);
 
 static void __init biovec_init_slabs(void)
 {
@@ -1636,29 +1658,4 @@ static int __init init_bio(void)
 
 	return 0;
 }
-
 subsys_initcall(init_bio);
-
-EXPORT_SYMBOL(bio_alloc);
-EXPORT_SYMBOL(bio_kmalloc);
-EXPORT_SYMBOL(bio_put);
-EXPORT_SYMBOL(bio_free);
-EXPORT_SYMBOL(bio_endio);
-EXPORT_SYMBOL(bio_init);
-EXPORT_SYMBOL(__bio_clone);
-EXPORT_SYMBOL(bio_clone);
-EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_add_page);
-EXPORT_SYMBOL(bio_add_pc_page);
-EXPORT_SYMBOL(bio_get_nr_vecs);
-EXPORT_SYMBOL(bio_map_user);
-EXPORT_SYMBOL(bio_unmap_user);
-EXPORT_SYMBOL(bio_map_kern);
-EXPORT_SYMBOL(bio_copy_kern);
-EXPORT_SYMBOL(bio_pair_release);
-EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_copy_user);
-EXPORT_SYMBOL(bio_uncopy_user);
-EXPORT_SYMBOL(bioset_create);
-EXPORT_SYMBOL(bioset_free);
-EXPORT_SYMBOL(bio_alloc_bioset);
-- 
cgit v1.1


From fbf190874407f23d2891b53ffdf7d3c6be8d47ff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 1 Oct 2009 17:10:23 -0400
Subject: Btrfs: fix data space leak fix

There is a problem where page_mkwrite can be called on a dirtied page that
already has a delalloc range associated with it.  The fix is to clear any
delalloc bits for the range we are dirtying so the space accounting gets
handled properly.  This is the same thing we do in the normal write case, so we
are consistent across the board.  With this patch we no longer leak reserved
space.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3cc5677..3a6f953 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4909,10 +4909,21 @@ again:
 		goto again;
 	}
 
+	/*
+	 * XXX - page_mkwrite gets called every time the page is dirtied, even
+	 * if it was already dirty, so for space accounting reasons we need to
+	 * clear any delalloc bits for the range we are fixing to save.  There
+	 * is probably a better way to do this, but for now keep consistent with
+	 * prepare_pages in the normal write path.
+	 */
+	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			  EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
+
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
 	if (ret) {
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 		ret = VM_FAULT_SIGBUS;
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		goto out_unlock;
 	}
 	ret = 0;
-- 
cgit v1.1


From 828c09509b9695271bcbdc53e9fc9a6a737148d2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 1 Oct 2009 15:43:56 -0700
Subject: const: constify remaining file_operations

[akpm@linux-foundation.org: fix KVM]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/btrfs/ctree.h             | 2 +-
 fs/btrfs/file.c              | 2 +-
 fs/btrfs/inode.c             | 4 ++--
 fs/jbd2/journal.c            | 2 +-
 fs/nfsd/nfsctl.c             | 2 +-
 fs/nilfs2/dir.c              | 2 +-
 fs/nilfs2/file.c             | 2 +-
 fs/nilfs2/mdt.c              | 2 +-
 fs/nilfs2/nilfs.h            | 4 ++--
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 fs/ocfs2/cluster/netdebug.c  | 4 ++--
 fs/ocfs2/dlm/dlmdebug.c      | 8 ++++----
 fs/ocfs2/super.c             | 2 +-
 fs/omfs/dir.c                | 2 +-
 fs/omfs/file.c               | 2 +-
 fs/omfs/omfs.h               | 4 ++--
 16 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4..4484eb3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2326,7 +2326,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3492a3..9ed17dbe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1196,7 +1196,7 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 	return 0;
 }
 
-struct file_operations btrfs_file_operations = {
+const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.aio_read       = generic_file_aio_read,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e9b76bc..b9fe06d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
 static const struct inode_operations btrfs_file_inode_operations;
 static const struct address_space_operations btrfs_aops;
 static const struct address_space_operations btrfs_symlink_aops;
-static struct file_operations btrfs_dir_file_operations;
+static const struct file_operations btrfs_dir_file_operations;
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
@@ -5544,7 +5544,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.permission	= btrfs_permission,
 };
 
-static struct file_operations btrfs_dir_file_operations = {
+static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= btrfs_real_readdir,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 761af77..b0ab521 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -770,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
 	return seq_release(inode, file);
 }
 
-static struct file_operations jbd2_seq_info_fops = {
+static const struct file_operations jbd2_seq_info_fops = {
 	.owner		= THIS_MODULE,
 	.open           = jbd2_seq_info_open,
 	.read           = seq_read,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 00388d2..5c01fc1 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -176,7 +176,7 @@ static const struct file_operations exports_operations = {
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
 
-static struct file_operations pool_stats_operations = {
+static const struct file_operations pool_stats_operations = {
 	.open		= nfsd_pool_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04..e097099 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
 	return 0;
 }
 
-struct file_operations nilfs_dir_operations = {
+const struct file_operations nilfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7d7b498..30292df 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
  * We have mostly NULL's here: the current defaults are ok for
  * the nilfs filesystem.
  */
-struct file_operations nilfs_file_operations = {
+const struct file_operations nilfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index b18c499..f632611 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = {
 };
 
 static const struct inode_operations def_mdt_iops;
-static struct file_operations def_mdt_fops;
+static const struct file_operations def_mdt_fops;
 
 /*
  * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index bad7368..4da6f67 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
 /*
  * Inodes and files operations
  */
-extern struct file_operations nilfs_dir_operations;
+extern const struct file_operations nilfs_dir_operations;
 extern const struct inode_operations nilfs_file_inode_operations;
-extern struct file_operations nilfs_file_operations;
+extern const struct file_operations nilfs_file_operations;
 extern const struct address_space_operations nilfs_aops;
 extern const struct inode_operations nilfs_dir_inode_operations;
 extern const struct inode_operations nilfs_special_inode_operations;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d..c452d11 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
 }
 #endif  /* CONFIG_DEBUG_FS */
 
-static struct file_operations o2hb_debug_fops = {
+static const struct file_operations o2hb_debug_fops = {
 	.open =		o2hb_debug_open,
 	.release =	o2hb_debug_release,
 	.read =		o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index cfb2be7..da794bc 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static struct file_operations nst_seq_fops = {
+static const struct file_operations nst_seq_fops = {
 	.open = nst_fop_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static struct file_operations sc_seq_fops = {
+static const struct file_operations sc_seq_fops = {
 	.open = sc_fop_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index ca46002..42b0bad 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -478,7 +478,7 @@ bail:
 	return -ENOMEM;
 }
 
-static struct file_operations debug_purgelist_fops = {
+static const struct file_operations debug_purgelist_fops = {
 	.open =		debug_purgelist_open,
 	.release =	debug_buffer_release,
 	.read =		debug_buffer_read,
@@ -538,7 +538,7 @@ bail:
 	return -ENOMEM;
 }
 
-static struct file_operations debug_mle_fops = {
+static const struct file_operations debug_mle_fops = {
 	.open =		debug_mle_open,
 	.release =	debug_buffer_release,
 	.read =		debug_buffer_read,
@@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static struct file_operations debug_lockres_fops = {
+static const struct file_operations debug_lockres_fops = {
 	.open =		debug_lockres_open,
 	.release =	debug_lockres_release,
 	.read =		seq_read,
@@ -925,7 +925,7 @@ bail:
 	return -ENOMEM;
 }
 
-static struct file_operations debug_state_fops = {
+static const struct file_operations debug_state_fops = {
 	.open =		debug_state_open,
 	.release =	debug_buffer_release,
 	.read =		debug_buffer_read,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4cc3c89..c0e48ae 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
 }
 #endif	/* CONFIG_DEBUG_FS */
 
-static struct file_operations ocfs2_osb_debug_fops = {
+static const struct file_operations ocfs2_osb_debug_fops = {
 	.open =		ocfs2_osb_debug_open,
 	.release =	ocfs2_debug_release,
 	.read =		ocfs2_debug_read,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3680bae..b42d624 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = {
 	.rmdir = omfs_rmdir,
 };
 
-struct file_operations omfs_dir_operations = {
+const struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = omfs_readdir,
 	.llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 4845fbb..399487c 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, omfs_get_block);
 }
 
-struct file_operations omfs_file_operations = {
+const struct file_operations omfs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
 	.write = do_sync_write,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index df71039..ebe2fdb 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
 extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
 
 /* dir.c */
-extern struct file_operations omfs_dir_operations;
+extern const struct file_operations omfs_dir_operations;
 extern const struct inode_operations omfs_dir_inops;
 extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
 extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
 			u64 fsblock);
 
 /* file.c */
-extern struct file_operations omfs_file_operations;
+extern const struct file_operations omfs_file_operations;
 extern const struct inode_operations omfs_file_inops;
 extern const struct address_space_operations omfs_aops;
 extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
-- 
cgit v1.1


From 80e50be4220e1244fcf6d5f75b997f8586ae1300 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 1 Oct 2009 15:44:27 -0700
Subject: afs: remove cache.h

It's just a wrapper for <linux/fscache.h>, so remove it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/cache.h    | 12 ------------
 fs/afs/internal.h |  2 +-
 2 files changed, 1 insertion(+), 13 deletions(-)
 delete mode 100644 fs/afs/cache.h

(limited to 'fs')

diff --git a/fs/afs/cache.h b/fs/afs/cache.h
deleted file mode 100644
index 5c4f6b4..0000000
--- a/fs/afs/cache.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* AFS local cache management interface
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/fscache.h>
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 106be66..6ece2a1 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -18,10 +18,10 @@
 #include <linux/key.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/fscache.h>
 
 #include "afs.h"
 #include "afs_vl.h"
-#include "cache.h"
 
 #define AFS_CELL_MAX_ADDRS 15
 
-- 
cgit v1.1


From 74072d0a63553720dd3c70a8b8e9407eb2027dbe Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Fri, 2 Oct 2009 21:08:32 -0400
Subject: ext4: Fix build warning in ext4_dirty_inode()

This fixes the following warning:

fs/ext4/inode.c: In function 'ext4_dirty_inode':
fs/ext4/inode.c:5615: warning: unused variable 'current_handle'

We remove the jbd_debug() statement which does use current_handle, as
it's not terribly important in the grand scheme of things.

Thanks to Stephen Rothwell for pointing this out.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 6e65d0e..635f8ec 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5612,14 +5612,12 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  */
 void ext4_dirty_inode(struct inode *inode)
 {
-	handle_t *current_handle = ext4_journal_current_handle();
 	handle_t *handle;
 
 	handle = ext4_journal_start(inode, 2);
 	if (IS_ERR(handle))
 		goto out;
 
-	jbd_debug(5, "marking dirty.  outer handle=%p\n", current_handle);
 	ext4_mark_inode_dirty(handle, inode);
 
 	ext4_journal_stop(handle);
-- 
cgit v1.1


From fbbf69456619de5d251cb9f1df609069178c62d5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 2 Oct 2009 21:20:55 -0400
Subject: [PATCH] ext4: retry failed direct IO allocations

On a 256M filesystem, doing this in a loop:

        xfs_io -F -f -d -c 'pwrite 0 64m' test
        rm -f test

eventually leads to ENOSPC.  (the xfs_io command does a
64m direct IO write to the file "test")

As with other block allocation callers, it looks like we need to
potentially retry the allocations on the initial ENOSPC.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 635f8ec..5c5bc5d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3378,6 +3378,7 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 	ssize_t ret;
 	int orphan = 0;
 	size_t count = iov_length(iov, nr_segs);
+	int retries = 0;
 
 	if (rw == WRITE) {
 		loff_t final_size = offset + count;
@@ -3400,9 +3401,12 @@ static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 		}
 	}
 
+retry:
 	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 
 	if (orphan) {
 		int err;
-- 
cgit v1.1


From 0f78ab9899e9d6acb09d5465def618704255963b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Sun, 4 Oct 2009 21:04:38 +0200
Subject: Revert "Seperate read and write statistics of in_flight requests"

This reverts commit a9327cac440be4d8333bba975cbbf76045096275.

Corrado Zoccolo <czoccolo@gmail.com> reports:

"with 2.6.32-rc1 I started getting the following strange output from
"iostat -kx 2":
Linux 2.6.31bisect (et2) 	04/10/2009 	_i686_	(2 CPU)

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
          10,70    0,00    3,16   15,75    0,00   70,38

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda              18,22     0,00    0,67    0,01    14,77     0,02
43,94     0,01   10,53 39043915,03 2629219,87
sdb              60,89     9,68   50,79    3,04  1724,43    50,52
65,95     0,70   13,06 488437,47 2629219,87

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           2,72    0,00    0,74    0,00    0,00   96,53

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00
sdb               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           6,68    0,00    0,99    0,00    0,00   92,33

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00
sdb               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           4,40    0,00    0,73    1,47    0,00   93,40

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s
avgrq-sz avgqu-sz   await  svctm  %util
sda               0,00     0,00    0,00    0,00     0,00     0,00
0,00     0,00    0,00   0,00 100,00
sdb               0,00     4,00    0,00    3,00     0,00    28,00
18,67     0,06   19,50 333,33 100,00

Global values for service time and utilization are garbage. For
interval values, utilization is always 100%, and service time is
higher than normal.

I bisected it down to:
[a9327cac440be4d8333bba975cbbf76045096275] Seperate read and write
statistics of in_flight requests
and verified that reverting just that commit indeed solves the issue
on 2.6.32-rc1."

So until this is debugged, revert the bad commit.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 7b685e1..f38fee0 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,19 +248,11 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		part_in_flight(p),
+		p->in_flight,
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
-ssize_t part_inflight_show(struct device *dev,
-			struct device_attribute *attr, char *buf)
-{
-	struct hd_struct *p = dev_to_part(dev);
-
-	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
-}
-
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -289,7 +281,6 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
-static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -301,7 +292,6 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
-	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
-- 
cgit v1.1


From a99bbaf5ee6bad1aca0c88ea65ec6e5373e86184 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sun, 4 Oct 2009 16:11:37 +0400
Subject: headers: remove sched.h from poll.h

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/anon_inodes.c | 2 ++
 fs/coda/psdev.c  | 1 +
 fs/select.c      | 1 +
 3 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d11c51f..2ca7a7c 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -8,8 +8,10 @@
  *
  */
 
+#include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 0376ac6..be4392c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/major.h>
 #include <linux/time.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/fcntl.h>
diff --git a/fs/select.c b/fs/select.c
index a201fc3..fd38ce2 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-- 
cgit v1.1


From 61d92c328c16419fc96dc50dd16f8b8c695409ec Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 2 Oct 2009 19:11:56 -0400
Subject: Btrfs: fix deadlock on async thread startup

The btrfs async worker threads are used for a wide variety of things,
including processing bio end_io functions.  This means that when
the endio threads aren't running, the rest of the FS isn't
able to do the final processing required to clear PageWriteback.

The endio threads also try to exit as they become idle and
start more as the work piles up.  The problem is that starting more
threads means kthreadd may need to allocate ram, and that allocation
may wait until the global number of writeback pages on the system is
below a certain limit.

The result of that throttling is that end IO threads wait on
kthreadd, who is waiting on IO to end, which will never happen.

This commit fixes the deadlock by handing off thread startup to a
dedicated thread.  It also fixes a bug where the on-demand thread
creation was creating far too many threads because it didn't take into
account threads being started by other procs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 81 ++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/async-thread.h | 10 ++++--
 fs/btrfs/ctree.h        |  1 +
 fs/btrfs/disk-io.c      | 42 +++++++++++++------------
 fs/btrfs/relocation.c   |  4 +--
 5 files changed, 106 insertions(+), 32 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 282ca08..c0861e7 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -64,6 +64,51 @@ struct btrfs_worker_thread {
 };
 
 /*
+ * btrfs_start_workers uses kthread_run, which can block waiting for memory
+ * for a very long time.  It will actually throttle on page writeback,
+ * and so it may not make progress until after our btrfs worker threads
+ * process all of the pending work structs in their queue
+ *
+ * This means we can't use btrfs_start_workers from inside a btrfs worker
+ * thread that is used as part of cleaning dirty memory, which pretty much
+ * involves all of the worker threads.
+ *
+ * Instead we have a helper queue who never has more than one thread
+ * where we scheduler thread start operations.  This worker_start struct
+ * is used to contain the work and hold a pointer to the queue that needs
+ * another worker.
+ */
+struct worker_start {
+	struct btrfs_work work;
+	struct btrfs_workers *queue;
+};
+
+static void start_new_worker_func(struct btrfs_work *work)
+{
+	struct worker_start *start;
+	start = container_of(work, struct worker_start, work);
+	btrfs_start_workers(start->queue, 1);
+	kfree(start);
+}
+
+static int start_new_worker(struct btrfs_workers *queue)
+{
+	struct worker_start *start;
+	int ret;
+
+	start = kzalloc(sizeof(*start), GFP_NOFS);
+	if (!start)
+		return -ENOMEM;
+
+	start->work.func = start_new_worker_func;
+	start->queue = queue;
+	ret = btrfs_queue_worker(queue->atomic_worker_start, &start->work);
+	if (ret)
+		kfree(start);
+	return ret;
+}
+
+/*
  * helper function to move a thread onto the idle list after it
  * has finished some requests.
  */
@@ -118,11 +163,13 @@ static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
 		goto out;
 
 	workers->atomic_start_pending = 0;
-	if (workers->num_workers >= workers->max_workers)
+	if (workers->num_workers + workers->num_workers_starting >=
+	    workers->max_workers)
 		goto out;
 
+	workers->num_workers_starting += 1;
 	spin_unlock_irqrestore(&workers->lock, flags);
-	btrfs_start_workers(workers, 1);
+	start_new_worker(workers);
 	return;
 
 out:
@@ -390,9 +437,11 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
 /*
  * simple init on struct btrfs_workers
  */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+			struct btrfs_workers *async_helper)
 {
 	workers->num_workers = 0;
+	workers->num_workers_starting = 0;
 	INIT_LIST_HEAD(&workers->worker_list);
 	INIT_LIST_HEAD(&workers->idle_list);
 	INIT_LIST_HEAD(&workers->order_list);
@@ -404,14 +453,15 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
 	workers->name = name;
 	workers->ordered = 0;
 	workers->atomic_start_pending = 0;
-	workers->atomic_worker_start = 0;
+	workers->atomic_worker_start = async_helper;
 }
 
 /*
  * starts new worker threads.  This does not enforce the max worker
  * count in case you need to temporarily go past it.
  */
-int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+static int __btrfs_start_workers(struct btrfs_workers *workers,
+				 int num_workers)
 {
 	struct btrfs_worker_thread *worker;
 	int ret = 0;
@@ -444,6 +494,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		list_add_tail(&worker->worker_list, &workers->idle_list);
 		worker->idle = 1;
 		workers->num_workers++;
+		workers->num_workers_starting--;
+		WARN_ON(workers->num_workers_starting < 0);
 		spin_unlock_irq(&workers->lock);
 	}
 	return 0;
@@ -452,6 +504,14 @@ fail:
 	return ret;
 }
 
+int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
+{
+	spin_lock_irq(&workers->lock);
+	workers->num_workers_starting += num_workers;
+	spin_unlock_irq(&workers->lock);
+	return __btrfs_start_workers(workers, num_workers);
+}
+
 /*
  * run through the list and find a worker thread that doesn't have a lot
  * to do right now.  This can return null if we aren't yet at the thread
@@ -461,7 +521,10 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
 {
 	struct btrfs_worker_thread *worker;
 	struct list_head *next;
-	int enforce_min = workers->num_workers < workers->max_workers;
+	int enforce_min;
+
+	enforce_min = (workers->num_workers + workers->num_workers_starting) <
+		workers->max_workers;
 
 	/*
 	 * if we find an idle thread, don't move it to the end of the
@@ -509,15 +572,17 @@ again:
 	worker = next_worker(workers);
 
 	if (!worker) {
-		if (workers->num_workers >= workers->max_workers) {
+		if (workers->num_workers + workers->num_workers_starting >=
+		    workers->max_workers) {
 			goto fallback;
 		} else if (workers->atomic_worker_start) {
 			workers->atomic_start_pending = 1;
 			goto fallback;
 		} else {
+			workers->num_workers_starting++;
 			spin_unlock_irqrestore(&workers->lock, flags);
 			/* we're below the limit, start another worker */
-			btrfs_start_workers(workers, 1);
+			__btrfs_start_workers(workers, 1);
 			goto again;
 		}
 	}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index fc089b9..5077746 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -64,6 +64,8 @@ struct btrfs_workers {
 	/* current number of running workers */
 	int num_workers;
 
+	int num_workers_starting;
+
 	/* max number of workers allowed.  changed by btrfs_start_workers */
 	int max_workers;
 
@@ -78,9 +80,10 @@ struct btrfs_workers {
 
 	/*
 	 * are we allowed to sleep while starting workers or are we required
-	 * to start them at a later time?
+	 * to start them at a later time?  If we can't sleep, this indicates
+	 * which queue we need to use to schedule thread creation.
 	 */
-	int atomic_worker_start;
+	struct btrfs_workers *atomic_worker_start;
 
 	/* list with all the work threads.  The workers on the idle thread
 	 * may be actively servicing jobs, but they haven't yet hit the
@@ -109,7 +112,8 @@ struct btrfs_workers {
 int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
 int btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
+			struct btrfs_workers *async_starter);
 int btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8184f2f..1b920ff 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -907,6 +907,7 @@ struct btrfs_fs_info {
 	 * A third pool does submit_bio to avoid deadlocking with the other
 	 * two
 	 */
+	struct btrfs_workers generic_worker;
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
 	struct btrfs_workers endio_workers;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 69dce50..9903f04 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1746,21 +1746,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		err = -EINVAL;
 		goto fail_iput;
 	}
-printk("thread pool is %d\n", fs_info->thread_pool_size);
-	/*
-	 * we need to start all the end_io workers up front because the
-	 * queue work function gets called at interrupt time, and so it
-	 * cannot dynamically grow.
-	 */
+
+	btrfs_init_workers(&fs_info->generic_worker,
+			   "genwork", 1, NULL);
+
 	btrfs_init_workers(&fs_info->workers, "worker",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
-			   fs_info->thread_pool_size));
+			   fs_info->thread_pool_size),
+			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -1774,15 +1775,20 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
 	fs_info->delalloc_workers.idle_thresh = 2;
 	fs_info->delalloc_workers.ordered = 1;
 
-	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
+	btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_workers, "endio",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
-			   "endio-meta-write", fs_info->thread_pool_size);
+			   "endio-meta-write", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1794,12 +1800,8 @@ printk("thread pool is %d\n", fs_info->thread_pool_size);
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
 
-	fs_info->endio_workers.atomic_worker_start = 1;
-	fs_info->endio_meta_workers.atomic_worker_start = 1;
-	fs_info->endio_write_workers.atomic_worker_start = 1;
-	fs_info->endio_meta_write_workers.atomic_worker_start = 1;
-
 	btrfs_start_workers(&fs_info->workers, 1);
+	btrfs_start_workers(&fs_info->generic_worker, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->delalloc_workers, 1);
 	btrfs_start_workers(&fs_info->fixup_workers, 1);
@@ -2012,6 +2014,7 @@ fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
 	free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
+	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
@@ -2437,6 +2440,7 @@ int close_ctree(struct btrfs_root *root)
 
 	iput(fs_info->btree_inode);
 
+	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 361ad32..cfcc93c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3518,7 +3518,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	BUG_ON(!rc->block_group);
 
 	btrfs_init_workers(&rc->workers, "relocate",
-			   fs_info->thread_pool_size);
+			   fs_info->thread_pool_size, NULL);
 
 	rc->extent_root = extent_root;
 	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
@@ -3701,7 +3701,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	mapping_tree_init(&rc->reloc_root_tree);
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	btrfs_init_workers(&rc->workers, "relocate",
-			   root->fs_info->thread_pool_size);
+			   root->fs_info->thread_pool_size, NULL);
 	rc->extent_root = root->fs_info->extent_root;
 
 	set_reloc_control(rc);
-- 
cgit v1.1


From 1cdda9b81ac0e6ee986f034fa02f221679e1c11a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 6 Oct 2009 10:04:28 -0400
Subject: Btrfs: fix possible softlockup in the allocator

Like the cluster allocating stuff, we can lockup the box with the normal
allocation path.  This happens when we

1) Start to cache a block group that is severely fragmented, but has a decent
amount of free space.
2) Start to commit a transaction
3) Have the commit try and empty out some of the delalloc inodes with extents
that are relatively large.

The inodes will not be able to make the allocations because they will ask for
allocations larger than a contiguous area in the free space cache.  So we will
wait for more progress to be made on the block group, but since we're in a
commit the caching kthread won't make any more progress and it already has
enough free space that wait_block_group_cache_progress will just return.  So,
if we wait and fail to make the allocation the next time around, just loop and
go to the next block group.  This keeps us from getting stuck in a softlockup.
Thanks,

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d119c03..2f82fab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4028,6 +4028,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int loop = 0;
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
+	bool failed_alloc = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -4232,14 +4233,23 @@ refill_cluster:
 
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
 						    num_bytes, empty_size);
-		if (!offset && (cached || (!cached &&
-					   loop == LOOP_CACHING_NOWAIT))) {
-			goto loop;
-		} else if (!offset && (!cached &&
-				       loop > LOOP_CACHING_NOWAIT)) {
+		/*
+		 * If we didn't find a chunk, and we haven't failed on this
+		 * block group before, and this block group is in the middle of
+		 * caching and we are ok with waiting, then go ahead and wait
+		 * for progress to be made, and set failed_alloc to true.
+		 *
+		 * If failed_alloc is true then we've already waited on this
+		 * block group once and should move on to the next block group.
+		 */
+		if (!offset && !failed_alloc && !cached &&
+		    loop > LOOP_CACHING_NOWAIT) {
 			wait_block_group_cache_progress(block_group,
-					num_bytes + empty_size);
+						num_bytes + empty_size);
+			failed_alloc = true;
 			goto have_block_group;
+		} else if (!offset) {
+			goto loop;
 		}
 checks:
 		search_start = stripe_align(root, offset);
@@ -4287,6 +4297,7 @@ checks:
 		break;
 loop:
 		failed_cluster_refill = false;
+		failed_alloc = false;
 		btrfs_put_block_group(block_group);
 	}
 	up_read(&space_info->groups_sem);
-- 
cgit v1.1


From 316d315bffa4026f28085f6b24ebcebede370ac7 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Tue, 6 Oct 2009 20:16:55 +0200
Subject: block: Seperate read and write statistics of in_flight requests v2

Commit a9327cac440be4d8333bba975cbbf76045096275 added seperate read
and write statistics of in_flight requests. And exported the number
of read and write requests in progress seperately through sysfs.

But  Corrado Zoccolo <czoccolo@gmail.com> reported getting strange
output from "iostat -kx 2". Global values for service time and
utilization were garbage. For interval values, utilization was always
100%, and service time is higher than normal.

So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b

The problem was in part_round_stats_single(), I missed the following:
        if (now == part->stamp)
                return;

-       if (part->in_flight) {
+       if (part_in_flight(part)) {
                __part_stat_add(cpu, part, time_in_queue,
                                part_in_flight(part) * (now - part->stamp));
                __part_stat_add(cpu, part, io_ticks, (now - part->stamp));

With this chunk included, the reported regression gets fixed.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>

--
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/partitions/check.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f38fee0..7b685e1 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		p->in_flight,
+		part_in_flight(p),
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
 
+ssize_t part_inflight_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+
+	return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
+}
+
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
 static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
+static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
 	__ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_size.attr,
 	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
+	&dev_attr_inflight.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
-- 
cgit v1.1


From c5811dbdd26284d63c19fca618bd740dd10ad53d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 6 Oct 2009 15:40:15 -0400
Subject: NFS: Fix a default mount regression...

With the recent spate of changes, the nfs protocol version will now default
to 2 instead of 3, while the mount protocol version defaults to 3.

The following patch should ensure the defaults are consistent with the
previous defaults of vers=3,proto=tcp,mountvers=3,mountproto=tcp.

This fixes the bug
   http://bugzilla.kernel.org/show_bug.cgi?id=14259

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29786d3..0343ebc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -728,22 +728,24 @@ static void nfs_umount_begin(struct super_block *sb)
 	unlock_kernel();
 }
 
-static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(int flags)
+static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version)
 {
 	struct nfs_parsed_mount_data *data;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (data) {
-		data->flags		= flags;
 		data->rsize		= NFS_MAX_FILE_IO_SIZE;
 		data->wsize		= NFS_MAX_FILE_IO_SIZE;
 		data->acregmin		= NFS_DEF_ACREGMIN;
 		data->acregmax		= NFS_DEF_ACREGMAX;
 		data->acdirmin		= NFS_DEF_ACDIRMIN;
 		data->acdirmax		= NFS_DEF_ACDIRMAX;
+		data->mount_server.port	= NFS_UNSPEC_PORT;
 		data->nfs_server.port	= NFS_UNSPEC_PORT;
+		data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 		data->auth_flavors[0]	= RPC_AUTH_UNIX;
 		data->auth_flavor_len	= 1;
+		data->version		= version;
 		data->minorversion	= 0;
 	}
 	return data;
@@ -1711,8 +1713,6 @@ static int nfs_validate_mount_data(void *options,
 
 		if (!(data->flags & NFS_MOUNT_TCP))
 			args->nfs_server.protocol = XPRT_TRANSPORT_UDP;
-		else
-			args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
 		/* N.B. caller will free nfs_server.hostname in all cases */
 		args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
 		args->namlen		= data->namlen;
@@ -2106,7 +2106,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
 	};
 	int error = -ENOMEM;
 
-	data = nfs_alloc_parsed_mount_data(NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
+	data = nfs_alloc_parsed_mount_data(3);
 	mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
 	if (data == NULL || mntfh == NULL)
 		goto out_free_fh;
@@ -2376,7 +2376,6 @@ static int nfs4_validate_mount_data(void *options,
 	if (data == NULL)
 		goto out_no_data;
 
-	args->version		= 4;
 	switch (data->version) {
 	case 1:
 		if (data->host_addrlen > sizeof(args->nfs_server.address))
@@ -2660,7 +2659,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
 	struct nfs_parsed_mount_data *data;
 	int error = -ENOMEM;
 
-	data = nfs_alloc_parsed_mount_data(0);
+	data = nfs_alloc_parsed_mount_data(4);
 	if (data == NULL)
 		goto out_free_data;
 
-- 
cgit v1.1


From f5855fecda65c1965c894915ace1e086d4925154 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 6 Oct 2009 15:40:37 -0400
Subject: NFS: Fix port and mountport display in /proc/self/mountinfo

Currently, the port and mount port will both display as 65535 if you do not
specify a port number. That would be wrong...

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0343ebc..0d14704 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -778,15 +778,13 @@ static int nfs_verify_server_address(struct sockaddr *addr)
  * Select between a default port value and a user-specified port value.
  * If a zero value is set, then autobind will be used.
  */
-static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
+static void nfs_set_port(struct sockaddr *sap, int *port,
 				 const unsigned short default_port)
 {
-	unsigned short port = default_port;
+	if (*port == NFS_UNSPEC_PORT)
+		*port = default_port;
 
-	if (parsed_port != NFS_UNSPEC_PORT)
-		port = parsed_port;
-
-	rpc_set_port(sap, port);
+	rpc_set_port(sap, *port);
 }
 
 /*
@@ -1477,7 +1475,7 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
 		args->mount_server.addrlen = args->nfs_server.addrlen;
 	}
 	request.salen = args->mount_server.addrlen;
-	nfs_set_default_port(request.sap, args->mount_server.port, 0);
+	nfs_set_port(request.sap, &args->mount_server.port, 0);
 
 	/*
 	 * Now ask the mount server to map our export path
@@ -1767,7 +1765,7 @@ static int nfs_validate_mount_data(void *options,
 			goto out_v4_not_compiled;
 #endif
 
-		nfs_set_default_port(sap, args->nfs_server.port, 0);
+		nfs_set_port(sap, &args->nfs_server.port, 0);
 
 		nfs_set_mount_transport_protocol(args);
 
@@ -2331,7 +2329,7 @@ static int nfs4_validate_text_mount_data(void *options,
 {
 	struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
 
-	nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
+	nfs_set_port(sap, &args->nfs_server.port, NFS_PORT);
 
 	nfs_validate_transport_protocol(args);
 
-- 
cgit v1.1


From bcd2ea17da6a329a7276cde7286d802f009af332 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 6 Oct 2009 15:41:22 -0400
Subject: NFS: Fix port initialisation in nfs_remount()

The recent changeset 53a0b9c4c99ab0085a06421f71592722e5b3fd5f (NFS: Replace
nfs_parse_ip_address() with rpc_pton()) broke nfs_remount, since the call
to rpc_pton() will zero out the port number in data->nfs_server.address.

This is actually due to a bug in nfs_remount: it should be looking at the
port number in nfs_server.port instead...

This fixes bug
   http://bugzilla.kernel.org/show_bug.cgi?id=14276

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d14704..fb3b280 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1846,9 +1846,10 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	    data->acdirmin != nfss->acdirmin / HZ ||
 	    data->acdirmax != nfss->acdirmax / HZ ||
 	    data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+	    data->nfs_server.port != nfss->port ||
 	    data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
-	    memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
-		   data->nfs_server.addrlen) != 0)
+	    !rpc_cmp_addr(&data->nfs_server.address,
+		    &nfss->nfs_client->cl_addr))
 		return -EINVAL;
 
 	return 0;
@@ -1891,6 +1892,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	data->acdirmin = nfss->acdirmin / HZ;
 	data->acdirmax = nfss->acdirmax / HZ;
 	data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+	data->nfs_server.port = nfss->port;
 	data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
 	memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
 		data->nfs_server.addrlen);
-- 
cgit v1.1


From f4373bf9e67e4a653c8854acd7b02dac9714c98a Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Tue, 6 Oct 2009 15:42:18 -0400
Subject: nfs: Avoid overrun when copying client IP address string

As seen in <http://bugs.debian.org/549002>, nfs4_init_client() can
overrun the source string when copying the client IP address from
nfs_parsed_mount_data::client_address to nfs_client::cl_ipaddr.  Since
these are both treated as null-terminated strings elsewhere, the copy
should be done with strlcpy() not memcpy().

Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 63976c0..99ea196 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1180,7 +1180,7 @@ static int nfs4_init_client(struct nfs_client *clp,
 				      1, flags & NFS_MOUNT_NORESVPORT);
 	if (error < 0)
 		goto error;
-	memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
+	strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
 
 	error = nfs_idmap_new(clp);
 	if (error < 0) {
-- 
cgit v1.1


From 517be09def6cd7bc231222ee756fde8ea245a6fe Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 6 Oct 2009 15:42:20 -0400
Subject: NFSv4: Fix the referral mount code

Fix a typo which causes try_location() to use the wrong length argument
when calling nfs_parse_server_name(). This again, causes the initialisation
of the mount's sockaddr structure to fail.

Also ensure that if nfs4_pathname_string() returns an error, then we pass
that error back up the stack instead of ENOENT.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4namespace.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2636c26..fa3408f 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,7 +121,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 
 	mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
 	if (IS_ERR(mnt_path))
-		return mnt;
+		return ERR_CAST(mnt_path);
 	mountdata->mnt_path = mnt_path;
 	maxbuflen = mnt_path - 1 - page2;
 
@@ -132,15 +132,15 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 		if (buf->len <= 0 || buf->len >= maxbuflen)
 			continue;
 
-		mountdata->addr = (struct sockaddr *)&addr;
-
 		if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
 			continue;
-		mountdata->addrlen = nfs_parse_server_name(buf->data,
-				buf->len,
-				mountdata->addr, mountdata->addrlen);
+
+		mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
+				(struct sockaddr *)&addr, sizeof(addr));
 		if (mountdata->addrlen == 0)
 			continue;
+
+		mountdata->addr = (struct sockaddr *)&addr;
 		rpc_set_port(mountdata->addr, NFS_PORT);
 
 		memcpy(page2, buf->data, buf->len);
-- 
cgit v1.1


From 4055e97318809638a57fbe1746b93bc7a90ef0d3 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 7 Oct 2009 16:32:24 -0700
Subject: fs: includecheck fix: proc, kcore.c

fix the following 'make includecheck' warning:

  fs/proc/kcore.c: linux/mm.h is included more than once.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/kcore.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 5601337..a44a789 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,7 +23,6 @@
 #include <asm/io.h>
 #include <linux/list.h>
 #include <linux/ioport.h>
-#include <linux/mm.h>
 #include <linux/memory.h>
 #include <asm/sections.h>
 
-- 
cgit v1.1


From 253fb02d62571e5455eedc9e39b9d660e86a40f0 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 7 Oct 2009 16:32:27 -0700
Subject: pagemap: export KPF_HWPOISON

This flag indicates a hardware detected memory corruption on the page.
Any future access of the page data may bring down the machine.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/page.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2281c2c..5033ce0 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -94,6 +94,7 @@ static const struct file_operations proc_kpagecount_operations = {
 #define KPF_COMPOUND_TAIL	16
 #define KPF_HUGE		17
 #define KPF_UNEVICTABLE		18
+#define KPF_HWPOISON		19
 #define KPF_NOPAGE		20
 
 #define KPF_KSM			21
@@ -180,6 +181,10 @@ static u64 get_uflags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_UNEVICTABLE,	PG_unevictable);
 	u |= kpf_copy_bit(k, KPF_MLOCKED,	PG_mlocked);
 
+#ifdef CONFIG_MEMORY_FAILURE
+	u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
+#endif
+
 #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR
 	u |= kpf_copy_bit(k, KPF_UNCACHED,	PG_uncached);
 #endif
-- 
cgit v1.1


From 3050141bae57984dd660e6861632ccf9b8bca77e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Thu, 8 Oct 2009 11:50:55 -0400
Subject: NFSv4: Kill nfs4_renewd_prepare_shutdown()

The NFSv4 renew daemon is shared between all active super blocks that refer
to a particular NFS server, so it is wrong to be shutting it down in
nfs4_kill_super every time a super block is destroyed.

This patch therefore kills nfs4_renewd_prepare_shutdown altogether, and
leaves it up to nfs4_shutdown_client() to also shut down the renew daemon
by means of the existing call to nfs4_kill_renewd().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4renewd.c | 6 ------
 fs/nfs/super.c      | 1 -
 2 files changed, 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e27c6ce..0156c01 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -127,12 +127,6 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
 }
 
 void
-nfs4_renewd_prepare_shutdown(struct nfs_server *server)
-{
-	cancel_delayed_work(&server->nfs_client->cl_renewd);
-}
-
-void
 nfs4_kill_renewd(struct nfs_client *clp)
 {
 	cancel_delayed_work_sync(&clp->cl_renewd);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index fb3b280..6dabf6f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2689,7 +2689,6 @@ static void nfs4_kill_super(struct super_block *sb)
 	dprintk("--> %s\n", __func__);
 	nfs_super_return_all_delegations(sb);
 	kill_anon_super(sb);
-	nfs4_renewd_prepare_shutdown(server);
 	nfs_fscache_release_super_cookie(sb);
 	nfs_free_server(server);
 	dprintk("<-- %s\n", __func__);
-- 
cgit v1.1


From 664fc5a4e7d0d7f3487e5c856b79f7dac79567fd Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 28 Sep 2009 13:34:20 -0700
Subject: ecryptfs: depends on CRYPTO

ecryptfs uses crypto APIs so it should depend on CRYPTO.
Otherwise many build errors occur. [63 lines not pasted]

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: ecryptfs-devel@lists.launchpad.net
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 8aadb99..89ebed3 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,6 +1,6 @@
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && NET
+	depends on EXPERIMENTAL && KEYS && NET && CRYPTO
 	select CRYPTO_ECB
 	select CRYPTO_CBC
 	help
-- 
cgit v1.1


From ed1f21857e76a92a006e0f890a3d7f72953b1469 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 29 Sep 2009 02:33:59 -0500
Subject: eCryptfs: Remove Kconfig NET dependency and select MD5

eCryptfs no longer uses a netlink interface to communicate with
ecryptfsd, so NET is not a valid dependency anymore.

MD5 is required and must be built for eCryptfs to be of any use.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig
index 89ebed3..1cd6d9d 100644
--- a/fs/ecryptfs/Kconfig
+++ b/fs/ecryptfs/Kconfig
@@ -1,8 +1,9 @@
 config ECRYPT_FS
 	tristate "eCrypt filesystem layer support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && KEYS && NET && CRYPTO
+	depends on EXPERIMENTAL && KEYS && CRYPTO
 	select CRYPTO_ECB
 	select CRYPTO_CBC
+	select CRYPTO_MD5
 	help
 	  Encrypted filesystem that operates on the VFS layer.  See
 	  <file:Documentation/filesystems/ecryptfs.txt> to learn more about
-- 
cgit v1.1


From 36520be8e32b49bd85a63b7b8b40cd07c3da59a5 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Mon, 5 Oct 2009 14:25:44 -0400
Subject: ima: ecryptfs fix imbalance message

The unencrypted files are being measured.  Update the counters to get
rid of the ecryptfs imbalance message. (http://bugzilla.redhat.com/519737)

Reported-by: Sachin Garg
Cc: Eric Paris <eparis@redhat.com>
Cc: Dustin Kirkland <kirkland@canonical.com>
Cc: James Morris <jmorris@namei.org>
Cc: David Safford <safford@watson.ibm.com>
Cc: stable@kernel.org
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/main.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 101fe4c..c6ac85d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,6 +35,7 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
+#include <linux/ima.h>
 #include "ecryptfs_kernel.h"
 
 /**
@@ -118,6 +119,7 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 	const struct cred *cred = current_cred();
 	struct ecryptfs_inode_info *inode_info =
 		ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
+	int opened_lower_file = 0;
 	int rc = 0;
 
 	mutex_lock(&inode_info->lower_file_mutex);
@@ -134,9 +136,12 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
 			       "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
 			       "rc = [%d]\n", lower_dentry, lower_mnt, rc);
 			inode_info->lower_file = NULL;
-		}
+		} else
+			opened_lower_file = 1;
 	}
 	mutex_unlock(&inode_info->lower_file_mutex);
+	if (opened_lower_file)
+		ima_counts_get(inode_info->lower_file);
 	return rc;
 }
 
-- 
cgit v1.1


From f9581b1443abac50c90168301d40a7734b13a5dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 6 Oct 2009 20:29:26 +0000
Subject: xfs: implement ->dirty_inode to fix timestamp handling

This is picking up on Felix's repost of Dave's patch to implement a
.dirty_inode method.  We really need this notification because
the VFS keeps writing directly into the inode structure instead
of going through methods to update this state.  In addition to
the long-known atime issue we now also have a caller in VM code
that updates c/mtime that way for shared writeable mmaps.  And
I found another one that no one has noticed in practice in the FIFO
code.

So implement ->dirty_inode to set i_update_core whenever the
inode gets externally dirtied, and switch the c/mtime handling to
the same scheme we already use for atime (always picking up
the value from the Linux inode).

Note that this patch also removes the xfs_synchronize_atime call
in xfs_reclaim it was superflous as we already synchronize the time
when writing the inode via the log (xfs_inode_item_format) or the
normal buffers (xfs_iflush_int).

In addition also remove the I_CLEAR check before copying the Linux
timestamps - now that we always have the Linux inode available
we can always use the timestamps in it.

Also switch to just using file_update_time for regular reads/writes -
that will get us all optimization done to it for free and make
sure we notice early when it breaks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Felix Blyakher <felixb@sgi.com>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c  |  1 -
 fs/xfs/linux-2.6/xfs_iops.c  | 41 +++++++++++++++--------------------------
 fs/xfs/linux-2.6/xfs_lrw.c   |  2 +-
 fs/xfs/linux-2.6/xfs_super.c | 23 +++++++++++++++++++++++
 fs/xfs/xfs_dfrag.c           |  8 ++++----
 fs/xfs/xfs_inode.c           |  4 ++--
 fs/xfs/xfs_inode.h           |  2 +-
 fs/xfs/xfs_inode_item.c      | 18 +++++++++++-------
 fs/xfs/xfs_itable.c          | 21 +++++++++++++--------
 fs/xfs/xfs_vnodeops.c        |  6 ------
 10 files changed, 70 insertions(+), 56 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d5e5559..40d2226 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -215,7 +215,6 @@ xfs_setfilesize(
 
 	if (ip->i_d.di_size < isize) {
 		ip->i_d.di_size = isize;
-		ip->i_update_core = 1;
 		xfs_mark_inode_dirty_sync(ip);
 	}
 
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 626b474..cdf7114 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -57,19 +57,22 @@
 #include <linux/fiemap.h>
 
 /*
- * Bring the atime in the XFS inode uptodate.
- * Used before logging the inode to disk or when the Linux inode goes away.
+ * Bring the timestamps in the XFS inode uptodate.
+ *
+ * Used before writing the inode to disk.
  */
 void
-xfs_synchronize_atime(
+xfs_synchronize_times(
 	xfs_inode_t	*ip)
 {
 	struct inode	*inode = VFS_I(ip);
 
-	if (!(inode->i_state & I_CLEAR)) {
-		ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
-		ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
-	}
+	ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+	ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+	ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+	ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+	ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+	ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
 }
 
 /*
@@ -106,32 +109,20 @@ xfs_ichgtime(
 	if ((flags & XFS_ICHGTIME_MOD) &&
 	    !timespec_equal(&inode->i_mtime, &tv)) {
 		inode->i_mtime = tv;
-		ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
-		ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
 		sync_it = 1;
 	}
 	if ((flags & XFS_ICHGTIME_CHG) &&
 	    !timespec_equal(&inode->i_ctime, &tv)) {
 		inode->i_ctime = tv;
-		ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
-		ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
 		sync_it = 1;
 	}
 
 	/*
-	 * We update the i_update_core field _after_ changing
-	 * the timestamps in order to coordinate properly with
-	 * xfs_iflush() so that we don't lose timestamp updates.
-	 * This keeps us from having to hold the inode lock
-	 * while doing this.  We use the SYNCHRONIZE macro to
-	 * ensure that the compiler does not reorder the update
-	 * of i_update_core above the timestamp updates above.
+	 * Update complete - now make sure everyone knows that the inode
+	 * is dirty.
 	 */
-	if (sync_it) {
-		SYNCHRONIZE();
-		ip->i_update_core = 1;
+	if (sync_it)
 		xfs_mark_inode_dirty_sync(ip);
-	}
 }
 
 /*
@@ -514,10 +505,8 @@ xfs_vn_getattr(
 	stat->gid = ip->i_d.di_gid;
 	stat->ino = ip->i_ino;
 	stat->atime = inode->i_atime;
-	stat->mtime.tv_sec = ip->i_d.di_mtime.t_sec;
-	stat->mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
-	stat->ctime.tv_sec = ip->i_d.di_ctime.t_sec;
-	stat->ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+	stat->mtime = inode->i_mtime;
+	stat->ctime = inode->i_ctime;
 	stat->blocks =
 		XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 49e4a6a..072050f 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -667,7 +667,7 @@ start:
 		xip->i_new_size = new_size;
 
 	if (likely(!(ioflags & IO_INVIS)))
-		xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+		file_update_time(file);
 
 	/*
 	 * If the offset is beyond the size of the file, we have a couple
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5d7c60a..1ea65b6 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -977,6 +977,28 @@ xfs_fs_inode_init_once(
 }
 
 /*
+ * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
+ * we catch unlogged VFS level updates to the inode. Care must be taken
+ * here - the transaction code calls mark_inode_dirty_sync() to mark the
+ * VFS inode dirty in a transaction and clears the i_update_core field;
+ * it must clear the field after calling mark_inode_dirty_sync() to
+ * correctly indicate that the dirty state has been propagated into the
+ * inode log item.
+ *
+ * We need the barrier() to maintain correct ordering between unlogged
+ * updates and the transaction commit code that clears the i_update_core
+ * field. This requires all updates to be completed before marking the
+ * inode dirty.
+ */
+STATIC void
+xfs_fs_dirty_inode(
+	struct inode	*inode)
+{
+	barrier();
+	XFS_I(inode)->i_update_core = 1;
+}
+
+/*
  * Attempt to flush the inode, this will actually fail
  * if the inode is pinned, but we dirty the inode again
  * at the point when it is unpinned after a log write,
@@ -1539,6 +1561,7 @@ xfs_fs_get_sb(
 static struct super_operations xfs_super_operations = {
 	.alloc_inode		= xfs_fs_alloc_inode,
 	.destroy_inode		= xfs_fs_destroy_inode,
+	.dirty_inode		= xfs_fs_dirty_inode,
 	.write_inode		= xfs_fs_write_inode,
 	.clear_inode		= xfs_fs_clear_inode,
 	.put_super		= xfs_fs_put_super,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 7465f9e..ab89a7e 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -206,10 +206,10 @@ xfs_swap_extents(
 	 * process that the file was not changed out from
 	 * under it.
 	 */
-	if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) ||
-	    (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) ||
-	    (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) ||
-	    (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) {
+	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
 		error = XFS_ERROR(EBUSY);
 		goto out_unlock;
 	}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c1dc7ef..b92a4fa 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3068,9 +3068,9 @@ xfs_iflush_int(
 	SYNCHRONIZE();
 
 	/*
-	 * Make sure to get the latest atime from the Linux inode.
+	 * Make sure to get the latest timestamps from the Linux inode.
 	 */
-	xfs_synchronize_atime(ip);
+	xfs_synchronize_times(ip);
 
 	if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0b38b9a..41555de 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -504,7 +504,7 @@ void		xfs_ichgtime(xfs_inode_t *, int);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
-void		xfs_synchronize_atime(xfs_inode_t *);
+void		xfs_synchronize_times(xfs_inode_t *);
 void		xfs_mark_inode_dirty_sync(xfs_inode_t *);
 
 #if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 47d5b66..9794b87 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -232,6 +232,15 @@ xfs_inode_item_format(
 	nvecs	     = 1;
 
 	/*
+	 * Make sure the linux inode is dirty. We do this before
+	 * clearing i_update_core as the VFS will call back into
+	 * XFS here and set i_update_core, so we need to dirty the
+	 * inode first so that the ordering of i_update_core and
+	 * unlogged modifications still works as described below.
+	 */
+	xfs_mark_inode_dirty_sync(ip);
+
+	/*
 	 * Clear i_update_core if the timestamps (or any other
 	 * non-transactional modification) need flushing/logging
 	 * and we're about to log them with the rest of the core.
@@ -263,14 +272,9 @@ xfs_inode_item_format(
 	}
 
 	/*
-	 * Make sure to get the latest atime from the Linux inode.
+	 * Make sure to get the latest timestamps from the Linux inode.
 	 */
-	xfs_synchronize_atime(ip);
-
-	/*
-	 * make sure the linux inode is dirty
-	 */
-	xfs_mark_inode_dirty_sync(ip);
+	xfs_synchronize_times(ip);
 
 	vecp->i_addr = (xfs_caddr_t)&ip->i_d;
 	vecp->i_len  = sizeof(struct xfs_icdinode);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b68f910..62efab2 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,6 +59,7 @@ xfs_bulkstat_one_iget(
 {
 	xfs_icdinode_t	*dic;	/* dinode core info pointer */
 	xfs_inode_t	*ip;		/* incore inode pointer */
+	struct inode	*inode;
 	int		error;
 
 	error = xfs_iget(mp, NULL, ino,
@@ -72,6 +73,7 @@ xfs_bulkstat_one_iget(
 	ASSERT(ip->i_imap.im_blkno != 0);
 
 	dic = &ip->i_d;
+	inode = VFS_I(ip);
 
 	/* xfs_iget returns the following without needing
 	 * further change.
@@ -83,16 +85,19 @@ xfs_bulkstat_one_iget(
 	buf->bs_uid = dic->di_uid;
 	buf->bs_gid = dic->di_gid;
 	buf->bs_size = dic->di_size;
+
 	/*
-	 * We are reading the atime from the Linux inode because the
-	 * dinode might not be uptodate.
+	 * We need to read the timestamps from the Linux inode because
+	 * the VFS keeps writing directly into the inode structure instead
+	 * of telling us about the updates.
 	 */
-	buf->bs_atime.tv_sec = VFS_I(ip)->i_atime.tv_sec;
-	buf->bs_atime.tv_nsec = VFS_I(ip)->i_atime.tv_nsec;
-	buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
-	buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
-	buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
-	buf->bs_ctime.tv_nsec = dic->di_ctime.t_nsec;
+	buf->bs_atime.tv_sec = inode->i_atime.tv_sec;
+	buf->bs_atime.tv_nsec = inode->i_atime.tv_nsec;
+	buf->bs_mtime.tv_sec = inode->i_mtime.tv_sec;
+	buf->bs_mtime.tv_nsec = inode->i_mtime.tv_nsec;
+	buf->bs_ctime.tv_sec = inode->i_ctime.tv_sec;
+	buf->bs_ctime.tv_nsec = inode->i_ctime.tv_nsec;
+
 	buf->bs_xflags = xfs_ip2xflags(ip);
 	buf->bs_extsize = dic->di_extsize << mp->m_sb.sb_blocklog;
 	buf->bs_extents = dic->di_nextents;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index a434f28..b572f7e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2476,12 +2476,6 @@ xfs_reclaim(
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
 
 	/*
-	 * Make sure the atime in the XFS inode is correct before freeing the
-	 * Linux inode.
-	 */
-	xfs_synchronize_atime(ip);
-
-	/*
 	 * If we have nothing to flush with this inode then complete the
 	 * teardown now, otherwise break the link between the xfs inode and the
 	 * linux inode and clean up the xfs inode later. This avoids flushing
-- 
cgit v1.1


From c90b07e8dd9f263d2e2af1d9648ba269f4d0d8fd Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 6 Oct 2009 20:29:27 +0000
Subject: xfs: fix xfs_quiesce_data

We need to do a synchronous xfs_sync_fsdata to make sure the superblock
actually is on disk when we return.

Also remove SYNC_BDFLUSH flag to xfs_sync_inodes because that particular
flag is never checked.

Move xfs_filestream_flush call later to only release inodes after they
have been written out.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 320be6a..7647b8d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -419,14 +419,16 @@ xfs_quiesce_data(
 	/* push non-blocking */
 	xfs_sync_data(mp, 0);
 	xfs_qm_sync(mp, SYNC_TRYLOCK);
-	xfs_filestream_flush(mp);
 
-	/* push and block */
+	/* push and block till complete */
 	xfs_sync_data(mp, SYNC_WAIT);
 	xfs_qm_sync(mp, SYNC_WAIT);
 
+	/* drop inode references pinned by filestreams */
+	xfs_filestream_flush(mp);
+
 	/* write superblock and hoover up shutdown errors */
-	error = xfs_sync_fsdata(mp, 0);
+	error = xfs_sync_fsdata(mp, SYNC_WAIT);
 
 	/* flush data-only devices */
 	if (mp->m_rtdev_targp)
-- 
cgit v1.1


From 69961a26b82931601e07c9e3656bd90c598f2395 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 6 Oct 2009 20:29:28 +0000
Subject: xfs: cleanup ->sync_fs

Sort out ->sync_fs to not perform a superblock writeback for the wait = 0 case
as that is just an optional first pass and the superblock will be written back
properly in the next call with wait = 1.  Instead perform an opportunistic
quota writeback to have less work later.  Also remove the freeze special case
as we do a proper wait = 1 call in the freeze code anyway.

Also rename the function to xfs_fs_sync_fs to match the normal naming
convention, update comments and avoid calling into the laptop_mode logic on
an error.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 1ea65b6..eefea0d 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1148,7 +1148,7 @@ xfs_fs_put_super(
 }
 
 STATIC int
-xfs_fs_sync_super(
+xfs_fs_sync_fs(
 	struct super_block	*sb,
 	int			wait)
 {
@@ -1156,23 +1156,23 @@ xfs_fs_sync_super(
 	int			error;
 
 	/*
-	 * Treat a sync operation like a freeze.  This is to work
-	 * around a race in sync_inodes() which works in two phases
-	 * - an asynchronous flush, which can write out an inode
-	 * without waiting for file size updates to complete, and a
-	 * synchronous flush, which wont do anything because the
-	 * async flush removed the inode's dirty flag.  Also
-	 * sync_inodes() will not see any files that just have
-	 * outstanding transactions to be flushed because we don't
-	 * dirty the Linux inode until after the transaction I/O
-	 * completes.
+	 * Not much we can do for the first async pass.  Writing out the
+	 * superblock would be counter-productive as we are going to redirty
+	 * when writing out other data and metadata (and writing out a single
+	 * block is quite fast anyway).
+	 *
+	 * Try to asynchronously kick off quota syncing at least.
 	 */
-	if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
-		error = xfs_quiesce_data(mp);
-	else
-		error = xfs_sync_fsdata(mp, 0);
+	if (!wait) {
+		xfs_qm_sync(mp, SYNC_TRYLOCK);
+		return 0;
+	}
+
+	error = xfs_quiesce_data(mp);
+	if (error)
+		return -error;
 
-	if (unlikely(laptop_mode)) {
+	if (laptop_mode) {
 		int	prev_sync_seq = mp->m_sync_seq;
 
 		/*
@@ -1191,7 +1191,7 @@ xfs_fs_sync_super(
 				mp->m_sync_seq != prev_sync_seq);
 	}
 
-	return -error;
+	return 0;
 }
 
 STATIC int
@@ -1565,7 +1565,7 @@ static struct super_operations xfs_super_operations = {
 	.write_inode		= xfs_fs_write_inode,
 	.clear_inode		= xfs_fs_clear_inode,
 	.put_super		= xfs_fs_put_super,
-	.sync_fs		= xfs_fs_sync_super,
+	.sync_fs		= xfs_fs_sync_fs,
 	.freeze_fs		= xfs_fs_freeze,
 	.statfs			= xfs_fs_statfs,
 	.remount_fs		= xfs_fs_remount,
-- 
cgit v1.1


From 932640e8adaff3d0c28ee32d4863c7a7a74df055 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 6 Oct 2009 20:29:29 +0000
Subject: xfs: mark inodes dirty before issuing I/O

To make sure they get properly waited on in sync when I/O is in flight and
we latter need to update the inode size.  Requires a new helper to check if an
ioend structure is beyond the current EOF.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 40d2226..6a2910e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -186,19 +186,37 @@ xfs_destroy_ioend(
 }
 
 /*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+	xfs_ioend_t		*ioend)
+{
+	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
+	xfs_fsize_t		isize;
+	xfs_fsize_t		bsize;
+
+	bsize = ioend->io_offset + ioend->io_size;
+	isize = MAX(ip->i_size, ip->i_new_size);
+	isize = MIN(isize, bsize);
+	return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
  * Update on-disk file size now that data has been written to disk.
  * The current in-memory file size is i_size.  If a write is beyond
  * eof i_new_size will be the intended file size until i_size is
  * updated.  If this write does not extend all the way to the valid
  * file size then restrict this update to the end of the write.
  */
+
 STATIC void
 xfs_setfilesize(
 	xfs_ioend_t		*ioend)
 {
 	xfs_inode_t		*ip = XFS_I(ioend->io_inode);
 	xfs_fsize_t		isize;
-	xfs_fsize_t		bsize;
 
 	ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
 	ASSERT(ioend->io_type != IOMAP_READ);
@@ -206,14 +224,9 @@ xfs_setfilesize(
 	if (unlikely(ioend->io_error))
 		return;
 
-	bsize = ioend->io_offset + ioend->io_size;
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	isize = MAX(ip->i_size, ip->i_new_size);
-	isize = MIN(isize, bsize);
-
-	if (ip->i_d.di_size < isize) {
+	isize = xfs_ioend_new_eof(ioend);
+	if (isize) {
 		ip->i_d.di_size = isize;
 		xfs_mark_inode_dirty_sync(ip);
 	}
@@ -403,10 +416,16 @@ xfs_submit_ioend_bio(
 	struct bio	*bio)
 {
 	atomic_inc(&ioend->io_remaining);
-
 	bio->bi_private = ioend;
 	bio->bi_end_io = xfs_end_bio;
 
+	/*
+	 * If the I/O is beyond EOF we mark the inode dirty immediately
+	 * but don't update the inode size until I/O completion.
+	 */
+	if (xfs_ioend_new_eof(ioend))
+		xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode));
+
 	submit_bio(WRITE, bio);
 	ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
 	bio_put(bio);
-- 
cgit v1.1


From dce5065a57c158e0ca5db8e63c50118b2ecf4ac5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Tue, 6 Oct 2009 20:29:30 +0000
Subject: xfs: make sure xfs_sync_fsdata covers the log

We want to always cover the log after writing out the superblock, and
in case of a synchronous writeout make sure we actually wait for the
log to be covered.  That way a filesystem that has been sync()ed can
be considered clean by log recovery.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_sync.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 7647b8d..961df0a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -309,11 +309,15 @@ xfs_sync_attr(
 STATIC int
 xfs_commit_dummy_trans(
 	struct xfs_mount	*mp,
-	uint			log_flags)
+	uint			flags)
 {
 	struct xfs_inode	*ip = mp->m_rootip;
 	struct xfs_trans	*tp;
 	int			error;
+	int			log_flags = XFS_LOG_FORCE;
+
+	if (flags & SYNC_WAIT)
+		log_flags |= XFS_LOG_SYNC;
 
 	/*
 	 * Put a dummy transaction in the log to tell recovery
@@ -331,13 +335,12 @@ xfs_commit_dummy_trans(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_ihold(tp, ip);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	/* XXX(hch): ignoring the error here.. */
 	error = xfs_trans_commit(tp, 0);
-
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 
+	/* the log force ensures this transaction is pushed to disk */
 	xfs_log_force(mp, 0, log_flags);
-	return 0;
+	return error;
 }
 
 int
@@ -385,7 +388,20 @@ xfs_sync_fsdata(
 	else
 		XFS_BUF_ASYNC(bp);
 
-	return xfs_bwrite(mp, bp);
+	error = xfs_bwrite(mp, bp);
+	if (error)
+		return error;
+
+	/*
+	 * If this is a data integrity sync make sure all pending buffers
+	 * are flushed out for the log coverage check below.
+	 */
+	if (flags & SYNC_WAIT)
+		xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+	if (xfs_log_need_covered(mp))
+		error = xfs_commit_dummy_trans(mp, flags);
+	return error;
 
  out_brelse:
 	xfs_buf_relse(bp);
@@ -572,8 +588,6 @@ xfs_sync_worker(
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
 		error = xfs_sync_fsdata(mp, SYNC_TRYLOCK);
-		if (xfs_log_need_covered(mp))
-			error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
 	}
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
-- 
cgit v1.1


From 8e69ce147127a0235e8d1f2b75ea214be78c61b3 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 25 Sep 2009 19:42:26 +0000
Subject: fix readahead calculations in xfs_dir2_leaf_getdents()

This is for bug #850,
http://oss.sgi.com/bugzilla/show_bug.cgi?id=850
XFS file system segfaults , repeatedly and 100% reproducable in 2.6.30 , 2.6.31

The above only showed up on a CONFIG_XFS_DEBUG=y kernel, because
xfs_bmapi() ASSERTs that it has been asked for at least one map,

and it was getting 0.

The root cause is that our guesstimated "bufsize" from xfs_file_readdir
was fairly small, and the

		bufsize -= length;

in the loop was going negative - except bufsize is a size_t, so it
was wrapping to a very large number.

Then when we did
		ra_want = howmany(bufsize + mp->m_dirblksize,
				  mp->m_sb.sb_blocksize) - 1;

with that very large number, the (int) ra_want was coming out
negative, and a subsequent compare:

		if (1 + ra_want > map_blocks ...

was coming out -true- (negative int compare w/ uint) and we went
back to xfs_bmapi() for more, even though we did not need more,
and asked for 0 maps, and hit the ASSERT.

We have kind of a type mess here, but just keeping bufsize from
going negative is probably sufficient to avoid the problem.

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_dir2_leaf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index fa913e45..41ad537 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -854,6 +854,7 @@ xfs_dir2_leaf_getdents(
 			 */
 			ra_want = howmany(bufsize + mp->m_dirblksize,
 					  mp->m_sb.sb_blocksize) - 1;
+			ASSERT(ra_want >= 0);
 
 			/*
 			 * If we don't have as many as we want, and we haven't
@@ -1088,7 +1089,8 @@ xfs_dir2_leaf_getdents(
 		 */
 		ptr += length;
 		curoff += length;
-		bufsize -= length;
+		/* bufsize may have just been a guess; don't go negative */
+		bufsize = bufsize > length ? bufsize - length : 0;
 	}
 
 	/*
-- 
cgit v1.1


From d0800703febc04827b8fa91921aa4e254d01e8d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Sat, 26 Sep 2009 19:55:04 +0000
Subject: xfs: stop calling filemap_fdatawait inside ->fsync

Now that the VFS actually waits for the data I/O to complete before
calling into ->fsync we can stop doing it ourselves.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_file.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 988d8f8..6d65baf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -176,14 +176,7 @@ xfs_file_fsync(
 	struct dentry		*dentry,
 	int			datasync)
 {
-	struct inode		*inode = dentry->d_inode;
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			error;
-
-	/* capture size updates in I/O completion before writing the inode. */
-	error = filemap_fdatawait(inode->i_mapping);
-	if (error)
-		return error;
+	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 	return -xfs_fsync(ip);
-- 
cgit v1.1


From a791e35e12ff672e8a0e140abeeaf900c3b2ea77 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 Oct 2009 11:27:10 -0400
Subject: Btrfs: cleanup extent_clear_unlock_delalloc flags

extent_clear_unlock_delalloc has a growing set of ugly parameters
that is very difficult to read and maintain.

This switches to a flag field and well named flag defines.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 27 +++++++++++----------------
 fs/btrfs/extent_io.h | 16 ++++++++++------
 fs/btrfs/inode.c     | 45 ++++++++++++++++++++++++++++++---------------
 3 files changed, 51 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index de1793b..f9708bd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1401,12 +1401,7 @@ out_failed:
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
-				int unlock_pages,
-				int clear_unlock,
-				int clear_delalloc, int clear_dirty,
-				int set_writeback,
-				int end_writeback,
-				int set_private2)
+				unsigned long op)
 {
 	int ret;
 	struct page *pages[16];
@@ -1416,17 +1411,17 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	int i;
 	int clear_bits = 0;
 
-	if (clear_unlock)
+	if (op & EXTENT_CLEAR_UNLOCK)
 		clear_bits |= EXTENT_LOCKED;
-	if (clear_dirty)
+	if (op & EXTENT_CLEAR_DIRTY)
 		clear_bits |= EXTENT_DIRTY;
 
-	if (clear_delalloc)
+	if (op & EXTENT_CLEAR_DELALLOC)
 		clear_bits |= EXTENT_DELALLOC;
 
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
-	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
-	      set_private2))
+	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK |
+		    EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2)))
 		return 0;
 
 	while (nr_pages > 0) {
@@ -1435,20 +1430,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 
-			if (set_private2)
+			if (op & EXTENT_SET_PRIVATE2)
 				SetPagePrivate2(pages[i]);
 
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
 				continue;
 			}
-			if (clear_dirty)
+			if (op & EXTENT_CLEAR_DIRTY)
 				clear_page_dirty_for_io(pages[i]);
-			if (set_writeback)
+			if (op & EXTENT_SET_WRITEBACK)
 				set_page_writeback(pages[i]);
-			if (end_writeback)
+			if (op & EXTENT_END_WRITEBACK)
 				end_page_writeback(pages[i]);
-			if (unlock_pages)
+			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
 				unlock_page(pages[i]);
 			page_cache_release(pages[i]);
 		}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 4794ec8..41d2a47 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,15 @@
 #define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
 
+/* these are flags for extent_clear_unlock_delalloc */
+#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
+#define EXTENT_CLEAR_UNLOCK	 0x2
+#define EXTENT_CLEAR_DELALLOC	 0x4
+#define EXTENT_CLEAR_DIRTY	 0x8
+#define EXTENT_SET_WRITEBACK	 0x10
+#define EXTENT_END_WRITEBACK	 0x20
+#define EXTENT_SET_PRIVATE2	 0x40
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -288,10 +297,5 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 int extent_clear_unlock_delalloc(struct inode *inode,
 				struct extent_io_tree *tree,
 				u64 start, u64 end, struct page *locked_page,
-				int unlock_page,
-				int clear_unlock,
-				int clear_delalloc, int clear_dirty,
-				int set_writeback,
-				int end_writeback,
-				int set_private2);
+				unsigned long op);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3a6f953..a7058fc 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -424,9 +424,10 @@ again:
 			 * and free up our temp pages.
 			 */
 			extent_clear_unlock_delalloc(inode,
-						     &BTRFS_I(inode)->io_tree,
-						     start, end, NULL, 1, 0,
-						     0, 1, 1, 1, 0);
+			     &BTRFS_I(inode)->io_tree,
+			     start, end, NULL,
+			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
 			ret = 0;
 			goto free_pages_out;
 		}
@@ -637,11 +638,13 @@ static noinline int submit_compressed_extents(struct inode *inode,
 		 * clear dirty, set writeback and unlock the pages.
 		 */
 		extent_clear_unlock_delalloc(inode,
-					     &BTRFS_I(inode)->io_tree,
-					     async_extent->start,
-					     async_extent->start +
-					     async_extent->ram_size - 1,
-					     NULL, 1, 1, 0, 1, 1, 0, 0);
+				&BTRFS_I(inode)->io_tree,
+				async_extent->start,
+				async_extent->start +
+				async_extent->ram_size - 1,
+				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+				EXTENT_CLEAR_UNLOCK |
+				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
 
 		ret = btrfs_submit_compressed_write(inode,
 				    async_extent->start,
@@ -712,9 +715,14 @@ static noinline int cow_file_range(struct inode *inode,
 					    start, end, 0, NULL);
 		if (ret == 0) {
 			extent_clear_unlock_delalloc(inode,
-						     &BTRFS_I(inode)->io_tree,
-						     start, end, NULL, 1, 1,
-						     1, 1, 1, 1, 0);
+				     &BTRFS_I(inode)->io_tree,
+				     start, end, NULL,
+				     EXTENT_CLEAR_UNLOCK_PAGE |
+				     EXTENT_CLEAR_UNLOCK |
+				     EXTENT_CLEAR_DELALLOC |
+				     EXTENT_CLEAR_DIRTY |
+				     EXTENT_SET_WRITEBACK |
+				     EXTENT_END_WRITEBACK);
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
@@ -738,6 +746,8 @@ static noinline int cow_file_range(struct inode *inode,
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
 	while (disk_num_bytes > 0) {
+		unsigned long op;
+
 		cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent);
 		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
@@ -789,10 +799,13 @@ static noinline int cow_file_range(struct inode *inode,
 		 * Do set the Private2 bit so we know this page was properly
 		 * setup for writepage
 		 */
+		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
+		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+			EXTENT_SET_PRIVATE2;
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 					     start, start + ram_size - 1,
-					     locked_page, unlock, 1,
-					     1, 0, 0, 0, 1);
+					     locked_page, op);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -1112,8 +1125,10 @@ out_check:
 		BUG_ON(ret);
 
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-					cur_offset, cur_offset + num_bytes - 1,
-					locked_page, 1, 1, 1, 0, 0, 0, 1);
+				cur_offset, cur_offset + num_bytes - 1,
+				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
+				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
+				EXTENT_SET_PRIVATE2);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
-- 
cgit v1.1


From a3429ab70b04363c6190964e82f04f44f3e34cf0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 8 Oct 2009 12:30:20 -0400
Subject: Btrfs: delay clearing EXTENT_DELALLOC for compressed extents

When compression is on, the cow_file_range code is farmed off to
worker threads.  This allows us to do significant CPU work in parallel
on SMP machines.

But it is a delicate balance around when we clear flags and how.  In
the past we cleared the delalloc flag immediately, which was safe
because the pages stayed locked.

But this is causing problems with the newest ENOSPC code, and with the
recent extent state cleanups we can now clear the delalloc bit at the
same time the uncompressed code does.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a7058fc..401dfb2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -427,6 +427,7 @@ again:
 			     &BTRFS_I(inode)->io_tree,
 			     start, end, NULL,
 			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+			     EXTENT_CLEAR_DELALLOC |
 			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
 			ret = 0;
 			goto free_pages_out;
@@ -644,6 +645,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 				async_extent->ram_size - 1,
 				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
 				EXTENT_CLEAR_UNLOCK |
+				EXTENT_CLEAR_DELALLOC |
 				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
 
 		ret = btrfs_submit_compressed_write(inode,
@@ -877,8 +879,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	u64 cur_end;
 	int limit = 10 * 1024 * 1042;
 
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
-			 EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
+			 1, 0, NULL, GFP_NOFS);
 	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
 		async_cow->inode = inode;
-- 
cgit v1.1


From 32c00aff718bb54a214b39146bdd9ac01511cd25 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 8 Oct 2009 13:34:05 -0400
Subject: Btrfs: release delalloc reservations on extent item insertion

This patch fixes an issue with the delalloc metadata space reservation
code.  The problem is we used to free the reservation as soon as we
allocated the delalloc region.  The problem with this is if we are not
inserting an inline extent, we don't actually insert the extent item until
after the ordered extent is written out.  This patch does 3 things,

1) It moves the reservation clearing stuff into the ordered code, so when
we remove the ordered extent we remove the reservation.
2) It adds a EXTENT_DO_ACCOUNTING flag that gets passed when we clear
delalloc bits in the cases where we want to clear the metadata reservation
when we clear the delalloc extent, in the case that we do an inline extent
or we invalidate the page.
3) It adds another waitqueue to the space info so that when we start a fs
wide delalloc flush, anybody else who also hits that area will simply wait
for the flush to finish and then try to make their allocation.

This has been tested thoroughly to make sure we did not regress on
performance.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h  | 12 ++++++-----
 fs/btrfs/ctree.h        |  3 +++
 fs/btrfs/extent-tree.c  | 54 +++++++++++++++++++++++++++++++++++++++----------
 fs/btrfs/extent_io.c    | 19 +++++++++++------
 fs/btrfs/extent_io.h    |  2 ++
 fs/btrfs/file.c         |  3 ++-
 fs/btrfs/inode.c        | 45 ++++++++++++++++++++++++++++-------------
 fs/btrfs/ordered-data.c |  6 ++++++
 8 files changed, 107 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a54d354..c71abec 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,12 +128,14 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
-	 * These two counters are for delalloc metadata reservations.  We keep
-	 * track of how many extents we've accounted for vs how many extents we
-	 * have.
+	 * Counters to keep track of the number of extent item's we may use due
+	 * to delalloc and such.  outstanding_extents is the number of extent
+	 * items we think we'll end up using, and reserved_extents is the number
+	 * of extent items we've reserved metadata for.
 	 */
-	int delalloc_reserved_extents;
-	int delalloc_extents;
+	spinlock_t accounting_lock;
+	int reserved_extents;
+	int outstanding_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1b920ff..dbdada5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -699,6 +699,9 @@ struct btrfs_space_info {
 
 	int allocating_chunk;
 	wait_queue_head_t wait;
+
+	int flushing;
+	wait_queue_head_t flush_wait;
 };
 
 /*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2f82fab..3d1be0b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2823,14 +2823,17 @@ int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
 					   num_items);
 
 	spin_lock(&meta_sinfo->lock);
-	if (BTRFS_I(inode)->delalloc_reserved_extents <=
-	    BTRFS_I(inode)->delalloc_extents) {
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	if (BTRFS_I(inode)->reserved_extents <=
+	    BTRFS_I(inode)->outstanding_extents) {
+		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		spin_unlock(&meta_sinfo->lock);
 		return 0;
 	}
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
-	BTRFS_I(inode)->delalloc_reserved_extents--;
-	BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+	BTRFS_I(inode)->reserved_extents--;
+	BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
 
 	if (meta_sinfo->bytes_delalloc < num_bytes) {
 		bug = true;
@@ -2863,6 +2866,37 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
 		meta_sinfo->force_delalloc = 0;
 }
 
+static void flush_delalloc(struct btrfs_root *root,
+				 struct btrfs_space_info *info)
+{
+	bool wait = false;
+
+	spin_lock(&info->lock);
+
+	if (!info->flushing) {
+		info->flushing = 1;
+		init_waitqueue_head(&info->flush_wait);
+	} else {
+		wait = true;
+	}
+
+	spin_unlock(&info->lock);
+
+	if (wait) {
+		wait_event(info->flush_wait,
+			   !info->flushing);
+		return;
+	}
+
+	btrfs_start_delalloc_inodes(root);
+	btrfs_wait_ordered_extents(root, 0);
+
+	spin_lock(&info->lock);
+	info->flushing = 0;
+	spin_unlock(&info->lock);
+	wake_up(&info->flush_wait);
+}
+
 static int maybe_allocate_chunk(struct btrfs_root *root,
 				 struct btrfs_space_info *info)
 {
@@ -2980,21 +3014,20 @@ again:
 			filemap_flush(inode->i_mapping);
 			goto again;
 		} else if (flushed == 3) {
-			btrfs_start_delalloc_inodes(root);
-			btrfs_wait_ordered_extents(root, 0);
+			flush_delalloc(root, meta_sinfo);
 			goto again;
 		}
 		spin_lock(&meta_sinfo->lock);
 		meta_sinfo->bytes_delalloc -= num_bytes;
 		spin_unlock(&meta_sinfo->lock);
 		printk(KERN_ERR "enospc, has %d, reserved %d\n",
-		       BTRFS_I(inode)->delalloc_extents,
-		       BTRFS_I(inode)->delalloc_reserved_extents);
+		       BTRFS_I(inode)->outstanding_extents,
+		       BTRFS_I(inode)->reserved_extents);
 		dump_space_info(meta_sinfo, 0, 0);
 		return -ENOSPC;
 	}
 
-	BTRFS_I(inode)->delalloc_reserved_extents++;
+	BTRFS_I(inode)->reserved_extents++;
 	check_force_delalloc(meta_sinfo);
 	spin_unlock(&meta_sinfo->lock);
 
@@ -3093,8 +3126,7 @@ again:
 		}
 
 		if (retries == 2) {
-			btrfs_start_delalloc_inodes(root);
-			btrfs_wait_ordered_extents(root, 0);
+			flush_delalloc(root, meta_sinfo);
 			goto again;
 		}
 		spin_lock(&meta_sinfo->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f9708bd..96577e8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -460,7 +460,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
 			    struct extent_state *state, int bits, int wake,
 			    int delete)
 {
-	int ret = state->state & bits;
+	int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+	int ret = state->state & bits_to_clear;
 
 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
@@ -468,7 +469,7 @@ static int clear_state_bit(struct extent_io_tree *tree,
 		tree->dirty_bytes -= range;
 	}
 	clear_state_cb(tree, state, bits);
-	state->state &= ~bits;
+	state->state &= ~bits_to_clear;
 	if (wake)
 		wake_up(&state->wq);
 	if (delete || state->state == 0) {
@@ -956,7 +957,8 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end,
-				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+				EXTENT_DIRTY | EXTENT_DELALLOC |
+				EXTENT_DO_ACCOUNTING, 0, 0,
 				NULL, mask);
 }
 
@@ -1419,9 +1421,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 	if (op & EXTENT_CLEAR_DELALLOC)
 		clear_bits |= EXTENT_DELALLOC;
 
+	if (op & EXTENT_CLEAR_ACCOUNTING)
+		clear_bits |= EXTENT_DO_ACCOUNTING;
+
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
-	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK |
-		    EXTENT_END_WRITEBACK | EXTENT_SET_PRIVATE2)))
+	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
+		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
+		    EXTENT_SET_PRIVATE2)))
 		return 0;
 
 	while (nr_pages > 0) {
@@ -2709,7 +2715,8 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	lock_extent(tree, start, end, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
-			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+			 EXTENT_DO_ACCOUNTING,
 			 1, 1, NULL, GFP_NOFS);
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 41d2a47..36de250 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -15,6 +15,7 @@
 #define EXTENT_BUFFER_FILLED (1 << 8)
 #define EXTENT_BOUNDARY (1 << 9)
 #define EXTENT_NODATASUM (1 << 10)
+#define EXTENT_DO_ACCOUNTING (1 << 11)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /* flags for bio submission */
@@ -33,6 +34,7 @@
 #define EXTENT_SET_WRITEBACK	 0x10
 #define EXTENT_END_WRITEBACK	 0x20
 #define EXTENT_SET_PRIVATE2	 0x40
+#define EXTENT_CLEAR_ACCOUNTING  0x80
 
 /*
  * page->private values.  Every page that is controlled by the extent
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f155179..53fb1c9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -878,7 +878,8 @@ again:
 			btrfs_put_ordered_extent(ordered);
 
 		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
-				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
+				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+				  EXTENT_DO_ACCOUNTING,
 				  GFP_NOFS);
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      start_pos, last_pos - 1, GFP_NOFS);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 401dfb2..ccc4f11 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -428,6 +428,7 @@ again:
 			     start, end, NULL,
 			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 			     EXTENT_CLEAR_DELALLOC |
+			     EXTENT_CLEAR_ACCOUNTING |
 			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
 			ret = 0;
 			goto free_pages_out;
@@ -722,6 +723,7 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_CLEAR_UNLOCK_PAGE |
 				     EXTENT_CLEAR_UNLOCK |
 				     EXTENT_CLEAR_DELALLOC |
+				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_CLEAR_DIRTY |
 				     EXTENT_SET_WRITEBACK |
 				     EXTENT_END_WRITEBACK);
@@ -1195,15 +1197,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
 					root->fs_info->max_extent);
 
 		/*
-		 * if we break a large extent up then leave delalloc_extents be,
-		 * since we've already accounted for the large extent.
+		 * if we break a large extent up then leave oustanding_extents
+		 * be, since we've already accounted for the large extent.
 		 */
 		if (div64_u64(new_size + root->fs_info->max_extent - 1,
 			      root->fs_info->max_extent) < num_extents)
 			return 0;
 	}
 
-	BTRFS_I(inode)->delalloc_extents++;
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	BTRFS_I(inode)->outstanding_extents++;
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
 	return 0;
 }
@@ -1234,7 +1238,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 
 	/* we're not bigger than the max, unreserve the space and go */
 	if (new_size <= root->fs_info->max_extent) {
-		BTRFS_I(inode)->delalloc_extents--;
+		spin_lock(&BTRFS_I(inode)->accounting_lock);
+		BTRFS_I(inode)->outstanding_extents--;
+		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		return 0;
 	}
 
@@ -1248,7 +1254,9 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 		      root->fs_info->max_extent) > num_extents)
 		return 0;
 
-	BTRFS_I(inode)->delalloc_extents--;
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	BTRFS_I(inode)->outstanding_extents--;
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
 
 	return 0;
 }
@@ -1270,7 +1278,9 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 
-		BTRFS_I(inode)->delalloc_extents++;
+		spin_lock(&BTRFS_I(inode)->accounting_lock);
+		BTRFS_I(inode)->outstanding_extents++;
+		spin_unlock(&BTRFS_I(inode)->accounting_lock);
 		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1298,8 +1308,12 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 
-		BTRFS_I(inode)->delalloc_extents--;
-		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+		if (bits & EXTENT_DO_ACCOUNTING) {
+			spin_lock(&BTRFS_I(inode)->accounting_lock);
+			BTRFS_I(inode)->outstanding_extents--;
+			spin_unlock(&BTRFS_I(inode)->accounting_lock);
+			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+		}
 
 		spin_lock(&root->fs_info->delalloc_lock);
 		if (state->end - state->start + 1 >
@@ -4825,7 +4839,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
+				 NULL, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
@@ -4838,8 +4853,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 		lock_extent(tree, page_start, page_end, GFP_NOFS);
 	}
 	clear_extent_bit(tree, page_start, page_end,
-		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
-		 1, 1, NULL, GFP_NOFS);
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+		 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -4934,7 +4949,8 @@ again:
 	 * prepare_pages in the normal write path.
 	 */
 	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
+			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
 	if (ret) {
@@ -5082,8 +5098,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 		return NULL;
 	ei->last_trans = 0;
 	ei->logged_trans = 0;
-	ei->delalloc_extents = 0;
-	ei->delalloc_reserved_extents = 0;
+	ei->outstanding_extents = 0;
+	ei->reserved_extents = 0;
+	spin_lock_init(&ei->accounting_lock);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->i_orphan);
 	INIT_LIST_HEAD(&ei->ordered_operations);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 4a9c8c4..ab21c29 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -306,6 +306,12 @@ int btrfs_remove_ordered_extent(struct inode *inode,
 	tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
 
+	spin_lock(&BTRFS_I(inode)->accounting_lock);
+	BTRFS_I(inode)->outstanding_extents--;
+	spin_unlock(&BTRFS_I(inode)->accounting_lock);
+	btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
+					      inode, 1);
+
 	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
 
-- 
cgit v1.1


From e3ccfa989752c083ceb23c823a84f7ce3a081e61 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 7 Oct 2009 20:44:34 -0400
Subject: Btrfs: async delalloc flushing under space pressure

This patch moves the delalloc flushing that occurs when we are under space
pressure off to a async thread pool.  This helps since we only free up
metadata space when we actually insert the extent item, which means it takes
quite a while for space to be free'ed up if we wait on all ordered extents.
However, if space is freed up due to inline extents being inserted, we can
wake people who are waiting up early, and they can finish their work.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 13 ++++----
 fs/btrfs/disk-io.c     |  6 ++++
 fs/btrfs/extent-tree.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 88 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dbdada5..a362dd6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -691,17 +691,17 @@ struct btrfs_space_info {
 
 	struct list_head list;
 
+	/* for controlling how we free up space for allocations */
+	wait_queue_head_t allocate_wait;
+	wait_queue_head_t flush_wait;
+	int allocating_chunk;
+	int flushing;
+
 	/* for block groups in our same type */
 	struct list_head block_groups;
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
-
-	int allocating_chunk;
-	wait_queue_head_t wait;
-
-	int flushing;
-	wait_queue_head_t flush_wait;
 };
 
 /*
@@ -918,6 +918,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
+	struct btrfs_workers enospc_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write.  It happens
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9903f04..ac8927b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1762,6 +1762,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size),
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -1809,6 +1812,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
+	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2023,6 +2027,7 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2449,6 +2454,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d1be0b..5302680 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2866,9 +2866,66 @@ static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
 		meta_sinfo->force_delalloc = 0;
 }
 
+struct async_flush {
+	struct btrfs_root *root;
+	struct btrfs_space_info *info;
+	struct btrfs_work work;
+};
+
+static noinline void flush_delalloc_async(struct btrfs_work *work)
+{
+	struct async_flush *async;
+	struct btrfs_root *root;
+	struct btrfs_space_info *info;
+
+	async = container_of(work, struct async_flush, work);
+	root = async->root;
+	info = async->info;
+
+	btrfs_start_delalloc_inodes(root);
+	wake_up(&info->flush_wait);
+	btrfs_wait_ordered_extents(root, 0);
+
+	spin_lock(&info->lock);
+	info->flushing = 0;
+	spin_unlock(&info->lock);
+	wake_up(&info->flush_wait);
+
+	kfree(async);
+}
+
+static void wait_on_flush(struct btrfs_space_info *info)
+{
+	DEFINE_WAIT(wait);
+	u64 used;
+
+	while (1) {
+		prepare_to_wait(&info->flush_wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+		spin_lock(&info->lock);
+		if (!info->flushing) {
+			spin_unlock(&info->lock);
+			break;
+		}
+
+		used = info->bytes_used + info->bytes_reserved +
+			info->bytes_pinned + info->bytes_readonly +
+			info->bytes_super + info->bytes_root +
+			info->bytes_may_use + info->bytes_delalloc;
+		if (used < info->total_bytes) {
+			spin_unlock(&info->lock);
+			break;
+		}
+		spin_unlock(&info->lock);
+		schedule();
+	}
+	finish_wait(&info->flush_wait, &wait);
+}
+
 static void flush_delalloc(struct btrfs_root *root,
 				 struct btrfs_space_info *info)
 {
+	struct async_flush *async;
 	bool wait = false;
 
 	spin_lock(&info->lock);
@@ -2883,11 +2940,24 @@ static void flush_delalloc(struct btrfs_root *root,
 	spin_unlock(&info->lock);
 
 	if (wait) {
-		wait_event(info->flush_wait,
-			   !info->flushing);
+		wait_on_flush(info);
 		return;
 	}
 
+	async = kzalloc(sizeof(*async), GFP_NOFS);
+	if (!async)
+		goto flush;
+
+	async->root = root;
+	async->info = info;
+	async->work.func = flush_delalloc_async;
+
+	btrfs_queue_worker(&root->fs_info->enospc_workers,
+			   &async->work);
+	wait_on_flush(info);
+	return;
+
+flush:
 	btrfs_start_delalloc_inodes(root);
 	btrfs_wait_ordered_extents(root, 0);
 
@@ -2927,7 +2997,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	if (!info->allocating_chunk) {
 		info->force_alloc = 1;
 		info->allocating_chunk = 1;
-		init_waitqueue_head(&info->wait);
+		init_waitqueue_head(&info->allocate_wait);
 	} else {
 		wait = true;
 	}
@@ -2935,7 +3005,7 @@ static int maybe_allocate_chunk(struct btrfs_root *root,
 	spin_unlock(&info->lock);
 
 	if (wait) {
-		wait_event(info->wait,
+		wait_event(info->allocate_wait,
 			   !info->allocating_chunk);
 		return 1;
 	}
@@ -2956,7 +3026,7 @@ out:
 	spin_lock(&info->lock);
 	info->allocating_chunk = 0;
 	spin_unlock(&info->lock);
-	wake_up(&info->wait);
+	wake_up(&info->allocate_wait);
 
 	if (ret)
 		return 0;
-- 
cgit v1.1


From ff782e0a131c7f669445c07fe5c7ba91e043b7ed Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 8 Oct 2009 15:30:04 -0400
Subject: Btrfs: optimize fsync for the single writer case

This patch optimizes the tree logging stuff so it doesn't always wait 1 jiffie
for new people to join the logging transaction if there is only ever 1 writer.
This helps a little bit with latency where we have something like RPM where it
will fdatasync every file it writes, and so waiting the 1 jiffie for every
fdatasync really starts to add up.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h    |  2 ++
 fs/btrfs/tree-log.c | 12 +++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a362dd6..16ddb19 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1010,6 +1010,8 @@ struct btrfs_root {
 	atomic_t log_commit[2];
 	unsigned long log_transid;
 	unsigned long log_batch;
+	pid_t log_start_pid;
+	bool log_multiple_pids;
 
 	u64 objectid;
 	u64 last_trans;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4d7d9ab..78f6254 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -137,11 +137,20 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 
 	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
+		if (!root->log_start_pid) {
+			root->log_start_pid = current->pid;
+			root->log_multiple_pids = false;
+		} else if (root->log_start_pid != current->pid) {
+			root->log_multiple_pids = true;
+		}
+
 		root->log_batch++;
 		atomic_inc(&root->log_writers);
 		mutex_unlock(&root->log_mutex);
 		return 0;
 	}
+	root->log_multiple_pids = false;
+	root->log_start_pid = current->pid;
 	mutex_lock(&root->fs_info->tree_log_mutex);
 	if (!root->fs_info->log_root_tree) {
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -1985,7 +1994,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
 
-	while (1) {
+	while (root->log_multiple_pids) {
 		unsigned long batch = root->log_batch;
 		mutex_unlock(&root->log_mutex);
 		schedule_timeout_uninterruptible(1);
@@ -2011,6 +2020,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	root->log_batch = 0;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
+	root->log_start_pid = 0;
 	smp_mb();
 	/*
 	 * log tree has been flushed to disk, new modifications of
-- 
cgit v1.1


From efefb1438be269897585934fc6c05deb4dfa01ce Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Fri, 9 Oct 2009 09:25:16 -0400
Subject: Btrfs: remove negative dentry when deleting subvolumne

The use of btrfs_dentry_delete is removing dentries from the
dcache when deleting subvolumne. btrfs_dentry_delete ignores
negative dentries. This is incorrect since if we don't remove
the negative dentry, its parent dentry can't be removed.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 12 +++++++-----
 fs/btrfs/ioctl.c |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ccc4f11..d3585cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3629,12 +3629,14 @@ static int btrfs_dentry_delete(struct dentry *dentry)
 {
 	struct btrfs_root *root;
 
-	if (!dentry->d_inode)
-		return 0;
+	if (!dentry->d_inode && !IS_ROOT(dentry))
+		dentry = dentry->d_parent;
 
-	root = BTRFS_I(dentry->d_inode)->root;
-	if (btrfs_root_refs(&root->root_item) == 0)
-		return 1;
+	if (dentry->d_inode) {
+		root = BTRFS_I(dentry->d_inode)->root;
+		if (btrfs_root_refs(&root->root_item) == 0)
+			return 1;
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9a780c8..e8795be 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -830,6 +830,7 @@ out_up_write:
 out_unlock:
 	mutex_unlock(&inode->i_mutex);
 	if (!err) {
+		shrink_dcache_sb(root->fs_info->sb);
 		btrfs_invalidate_inodes(dest);
 		d_delete(dentry);
 	}
-- 
cgit v1.1


From 94fcca9f8999e7828d5f4dc181daa39cad2af38a Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Fri, 9 Oct 2009 09:25:16 -0400
Subject: Btrfs: optimize back reference update during btrfs_drop_snapshot

This patch reading level 0 tree blocks that already use full backrefs.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 82 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 53 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5302680..4aedbff 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4911,6 +4911,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 	u64 bytenr;
 	u64 generation;
 	u64 refs;
+	u64 flags;
 	u64 last = 0;
 	u32 nritems;
 	u32 blocksize;
@@ -4948,15 +4949,19 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 		    generation <= root->root_key.offset)
 			continue;
 
+		/* We don't lock the tree block, it's OK to be racy here */
+		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+					       &refs, &flags);
+		BUG_ON(ret);
+		BUG_ON(refs == 0);
+
 		if (wc->stage == DROP_REFERENCE) {
-			ret = btrfs_lookup_extent_info(trans, root,
-						bytenr, blocksize,
-						&refs, NULL);
-			BUG_ON(ret);
-			BUG_ON(refs == 0);
 			if (refs == 1)
 				goto reada;
 
+			if (wc->level == 1 &&
+			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+				continue;
 			if (!wc->update_ref ||
 			    generation <= root->root_key.offset)
 				continue;
@@ -4965,6 +4970,10 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 						  &wc->update_progress);
 			if (ret < 0)
 				continue;
+		} else {
+			if (wc->level == 1 &&
+			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+				continue;
 		}
 reada:
 		ret = readahead_tree_block(root, bytenr, blocksize,
@@ -4988,7 +4997,7 @@ reada:
 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_path *path,
-				   struct walk_control *wc)
+				   struct walk_control *wc, int lookup_info)
 {
 	int level = wc->level;
 	struct extent_buffer *eb = path->nodes[level];
@@ -5003,8 +5012,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 	 * when reference count of tree block is 1, it won't increase
 	 * again. once full backref flag is set, we never clear it.
 	 */
-	if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
-	    (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+	if (lookup_info &&
+	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
 		BUG_ON(!path->locks[level]);
 		ret = btrfs_lookup_extent_info(trans, root,
 					       eb->start, eb->len,
@@ -5065,7 +5075,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path,
-				 struct walk_control *wc)
+				 struct walk_control *wc, int *lookup_info)
 {
 	u64 bytenr;
 	u64 generation;
@@ -5085,8 +5095,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	 * for the subtree
 	 */
 	if (wc->stage == UPDATE_BACKREF &&
-	    generation <= root->root_key.offset)
+	    generation <= root->root_key.offset) {
+		*lookup_info = 1;
 		return 1;
+	}
 
 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
 	blocksize = btrfs_level_size(root, level - 1);
@@ -5099,14 +5111,19 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	btrfs_tree_lock(next);
 	btrfs_set_lock_blocking(next);
 
-	if (wc->stage == DROP_REFERENCE) {
-		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-					       &wc->refs[level - 1],
-					       &wc->flags[level - 1]);
-		BUG_ON(ret);
-		BUG_ON(wc->refs[level - 1] == 0);
+	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+				       &wc->refs[level - 1],
+				       &wc->flags[level - 1]);
+	BUG_ON(ret);
+	BUG_ON(wc->refs[level - 1] == 0);
+	*lookup_info = 0;
 
+	if (wc->stage == DROP_REFERENCE) {
 		if (wc->refs[level - 1] > 1) {
+			if (level == 1 &&
+			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+				goto skip;
+
 			if (!wc->update_ref ||
 			    generation <= root->root_key.offset)
 				goto skip;
@@ -5120,12 +5137,17 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 			wc->stage = UPDATE_BACKREF;
 			wc->shared_level = level - 1;
 		}
+	} else {
+		if (level == 1 &&
+		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+			goto skip;
 	}
 
 	if (!btrfs_buffer_uptodate(next, generation)) {
 		btrfs_tree_unlock(next);
 		free_extent_buffer(next);
 		next = NULL;
+		*lookup_info = 1;
 	}
 
 	if (!next) {
@@ -5148,21 +5170,22 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 skip:
 	wc->refs[level - 1] = 0;
 	wc->flags[level - 1] = 0;
+	if (wc->stage == DROP_REFERENCE) {
+		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+			parent = path->nodes[level]->start;
+		} else {
+			BUG_ON(root->root_key.objectid !=
+			       btrfs_header_owner(path->nodes[level]));
+			parent = 0;
+		}
 
-	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
-		parent = path->nodes[level]->start;
-	} else {
-		BUG_ON(root->root_key.objectid !=
-		       btrfs_header_owner(path->nodes[level]));
-		parent = 0;
+		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+					root->root_key.objectid, level - 1, 0);
+		BUG_ON(ret);
 	}
-
-	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-				root->root_key.objectid, level - 1, 0);
-	BUG_ON(ret);
-
 	btrfs_tree_unlock(next);
 	free_extent_buffer(next);
+	*lookup_info = 1;
 	return 1;
 }
 
@@ -5276,6 +5299,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 				   struct walk_control *wc)
 {
 	int level = wc->level;
+	int lookup_info = 1;
 	int ret;
 
 	while (level >= 0) {
@@ -5283,14 +5307,14 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		    btrfs_header_nritems(path->nodes[level]))
 			break;
 
-		ret = walk_down_proc(trans, root, path, wc);
+		ret = walk_down_proc(trans, root, path, wc, lookup_info);
 		if (ret > 0)
 			break;
 
 		if (level == 0)
 			break;
 
-		ret = do_walk_down(trans, root, path, wc);
+		ret = do_walk_down(trans, root, path, wc, &lookup_info);
 		if (ret > 0) {
 			path->slots[level]++;
 			continue;
-- 
cgit v1.1


From 82d339d9b3a6395f17d3253887653250b693b74b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Fri, 9 Oct 2009 09:54:36 -0400
Subject: Btrfs: constify dentry_operations

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h | 2 +-
 fs/btrfs/inode.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 16ddb19..36a19cd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2330,7 +2330,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
-extern struct dentry_operations btrfs_dentry_operations;
+extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3585cb..d960492 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5840,6 +5840,6 @@ static struct inode_operations btrfs_symlink_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 };
 
-struct dentry_operations btrfs_dentry_operations = {
+const struct dentry_operations btrfs_dentry_operations = {
 	.d_delete	= btrfs_dentry_delete,
 };
-- 
cgit v1.1


From e9061e214810c9534381a705a1b46533e09f2676 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 Oct 2009 09:57:45 -0400
Subject: Btrfs: fix uninit compiler warning in cow_file_range_nocow

The extent_type variable was exposed uninit via a goto.  It should be
impossible to trigger because it is protected by a check on another
variable, but this makes sure.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d960492..ef399a7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1023,6 +1023,7 @@ next_slot:
 
 		if (found_key.offset > cur_offset) {
 			extent_end = found_key.offset;
+			extent_type = 0;
 			goto out_check;
 		}
 
-- 
cgit v1.1


From ac6889cbb254be1ffea376bea4a96ce9be0e0ed0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 9 Oct 2009 11:29:53 -0400
Subject: Btrfs: fix file clone ioctl for bookend extents

The file clone ioctl was incorrectly taking the offset into the
extent on disk into account when calculating the length of the
cloned extent.

The length never changes based on the offset into the physical extent.

Test case:

fallocate -l 1g image
mke2fs image
bcp image image2
e2fsck -f image2

(errors on image2)

The math bug ends up wrapping the length of the extent, and things
go wrong from there.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e8795be..cdbb054 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1123,8 +1123,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 					datao += off - key.offset;
 					datal -= off - key.offset;
 				}
-				if (key.offset + datao + datal > off + len)
-					datal = off + len - key.offset - datao;
+
+				if (key.offset + datal > off + len)
+					datal = off + len - key.offset;
+
 				/* disko == 0 means it's a hole */
 				if (!disko)
 					datao = 0;
-- 
cgit v1.1


From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 7 Oct 2009 17:09:06 +0400
Subject: headers: remove sched.h from interrupt.h

After m68k's task_thread_info() doesn't refer to current,
it's possible to remove sched.h from interrupt.h and not break m68k!
Many thanks to Heiko Carstens for allowing this.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 fs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index f313314..87e1290 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/time.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
-- 
cgit v1.1


From ef1f7a7e878e4ae37b3a78ebdeef9f911bae59df Mon Sep 17 00:00:00 2001
From: Bernd Schmidt <bernds_cb1@t-online.de>
Date: Tue, 6 Oct 2009 09:55:26 +0100
Subject: ROMFS: fix length used with romfs_dev_strnlen() function

An interestingly corrupted romfs file system exposed a problem with the
romfs_dev_strnlen function: it's passing the wrong value to its helpers.
Rather than limit the string to the length passed in by the callers, it
uses the size of the device as the limit.

Signed-off-by: Bernd Schmidt <bernds_cb1@t-online.de>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/romfs/storage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
index b3208ad..71e2b4d 100644
--- a/fs/romfs/storage.c
+++ b/fs/romfs/storage.c
@@ -253,11 +253,11 @@ ssize_t romfs_dev_strnlen(struct super_block *sb,
 
 #ifdef CONFIG_ROMFS_ON_MTD
 	if (sb->s_mtd)
-		return romfs_mtd_strnlen(sb, pos, limit);
+		return romfs_mtd_strnlen(sb, pos, maxlen);
 #endif
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev)
-		return romfs_blk_strnlen(sb, pos, limit);
+		return romfs_blk_strnlen(sb, pos, maxlen);
 #endif
 	return -EIO;
 }
-- 
cgit v1.1


From a1be9eee2996fd9969625e7b5e2f2bc2032fd3cb Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 12 Oct 2009 11:26:12 -0400
Subject: NFS: suppress a build warning

struct sockaddr_storage * can safely be used as struct sockaddr *.
Suppress an "incompatible pointer type" warning.

Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6dabf6f..a2c18ac 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1848,8 +1848,8 @@ nfs_compare_remount_data(struct nfs_server *nfss,
 	    data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
 	    data->nfs_server.port != nfss->port ||
 	    data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
-	    !rpc_cmp_addr(&data->nfs_server.address,
-		    &nfss->nfs_client->cl_addr))
+	    !rpc_cmp_addr((struct sockaddr *)&data->nfs_server.address,
+			  (struct sockaddr *)&nfss->nfs_client->cl_addr))
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.1


From 96ec2e0a719fd61791dd2b0dd01325c5d20e1233 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 16 Sep 2009 11:21:13 -0400
Subject: ext3: Don't update superblock write time when filesystem is read-only

This avoids updating the superblock write time when we are mounting
the root file system read/only but we need to replay the journal; at
that point, for people who are east of GMT and who make their clock
tick in localtime for Windows bug-for-bug compatibility, and this will
cause e2fsck to complain and force a full file system check.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext3/super.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 72743d3..7a520a8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2321,7 +2321,18 @@ static int ext3_commit_super(struct super_block *sb,
 
 	if (!sbh)
 		return error;
-	es->s_wtime = cpu_to_le32(get_seconds());
+	/*
+	 * If the file system is mounted read-only, don't update the
+	 * superblock write time.  This avoids updating the superblock
+	 * write time when we are mounting the root file system
+	 * read/only but we need to replay the journal; at that point,
+	 * for people who are east of GMT and who make their clock
+	 * tick in localtime for Windows bug-for-bug compatibility,
+	 * the clock is set in the future, and this will cause e2fsck
+	 * to complain and force a full file system check.
+	 */
+	if (!(sb->s_flags & MS_RDONLY))
+		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
 	BUFFER_TRACE(sbh, "marking dirty");
-- 
cgit v1.1


From 4722607db6a78bd7748c51fa4c8d7371da797254 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Oct 2009 12:55:09 -0400
Subject: Btrfs: only write one super copy during fsync

During a tree-log commit for fsync, we've been writing at least
two copies of the super block and forcing them to disk.

The other filesystems write only one, and this change brings us on
par with them.  A full transaction commit will write all the super
copies, so we still have redundant info written on a regular
basis.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 78f6254..6d9ec28 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2092,7 +2092,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * the running transaction open, so a full commit can't hop
 	 * in and cause problems either.
 	 */
-	write_ctree_super(trans, root->fs_info->tree_root, 2);
+	write_ctree_super(trans, root->fs_info->tree_root, 1);
 	ret = 0;
 
 out_wake_log_root:
-- 
cgit v1.1


From 257c62e1bce03e5b9f3f069fd52ad73a56de71fd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Oct 2009 13:21:08 -0400
Subject: Btrfs: avoid tree log commit when there are no changes

rpm has a habit of running fdatasync when the file hasn't
changed.  We already detect if a file hasn't been changed
in the current transaction but it might have been sent to
the tree-log in this transaction and not changed since
the last call to fsync.

In this case, we want to avoid a tree log sync, which includes
a number of synchronous writes and barriers.  This commit
extends the existing tracking of the last transaction to change
a file to also track the last sub-transaction.

The end result is that rpm -ivh and -Uvh are roughly twice as fast,
and on par with ext3.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  6 ++++++
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/file.c        | 41 ++++++++++++++++++++++++++---------------
 fs/btrfs/inode.c       |  6 +++++-
 fs/btrfs/transaction.h |  1 +
 fs/btrfs/tree-log.c    | 27 +++++++++++++++++++++++++++
 fs/btrfs/tree-log.h    |  3 +++
 8 files changed, 71 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c71abec..f6783a4 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -86,6 +86,12 @@ struct btrfs_inode {
 	 * transid of the trans_handle that last modified this inode
 	 */
 	u64 last_trans;
+
+	/*
+	 * log transid when this inode was last modified
+	 */
+	u64 last_sub_trans;
+
 	/*
 	 * transid that last logged this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 36a19cd..d0cede5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1009,6 +1009,7 @@ struct btrfs_root {
 	atomic_t log_writers;
 	atomic_t log_commit[2];
 	unsigned long log_transid;
+	unsigned long last_log_commit;
 	unsigned long log_batch;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ac8927b..d4132aa 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -919,6 +919,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	atomic_set(&root->log_writers, 0);
 	root->log_batch = 0;
 	root->log_transid = 0;
+	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -1089,6 +1090,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
 	root->log_transid = 0;
+	root->last_log_commit = 0;
 	return 0;
 }
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 53fb1c9..4599113 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1087,8 +1087,10 @@ out_nolock:
 					btrfs_end_transaction(trans, root);
 				else
 					btrfs_commit_transaction(trans, root);
-			} else {
+			} else if (ret != BTRFS_NO_LOG_SYNC) {
 				btrfs_commit_transaction(trans, root);
+			} else {
+				btrfs_end_transaction(trans, root);
 			}
 		}
 		if (file->f_flags & O_DIRECT) {
@@ -1138,6 +1140,13 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	int ret = 0;
 	struct btrfs_trans_handle *trans;
 
+
+	/* we wait first, since the writeback may change the inode */
+	root->log_batch++;
+	/* the VFS called filemap_fdatawrite for us */
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
+	root->log_batch++;
+
 	/*
 	 * check the transaction that last modified this inode
 	 * and see if its already been committed
@@ -1145,6 +1154,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	if (!BTRFS_I(inode)->last_trans)
 		goto out;
 
+	/*
+	 * if the last transaction that changed this file was before
+	 * the current transaction, we can bail out now without any
+	 * syncing
+	 */
 	mutex_lock(&root->fs_info->trans_mutex);
 	if (BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
@@ -1154,13 +1168,6 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
-	root->log_batch++;
-	filemap_fdatawrite(inode->i_mapping);
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
-	root->log_batch++;
-
-	if (datasync && !(inode->i_state & I_DIRTY_PAGES))
-		goto out;
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
@@ -1189,14 +1196,18 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 */
 	mutex_unlock(&dentry->d_inode->i_mutex);
 
-	if (ret > 0) {
-		ret = btrfs_commit_transaction(trans, root);
-	} else {
-		ret = btrfs_sync_log(trans, root);
-		if (ret == 0)
-			ret = btrfs_end_transaction(trans, root);
-		else
+	if (ret != BTRFS_NO_LOG_SYNC) {
+		if (ret > 0) {
 			ret = btrfs_commit_transaction(trans, root);
+		} else {
+			ret = btrfs_sync_log(trans, root);
+			if (ret == 0)
+				ret = btrfs_end_transaction(trans, root);
+			else
+				ret = btrfs_commit_transaction(trans, root);
+		}
+	} else {
+		ret = btrfs_end_transaction(trans, root);
 	}
 	mutex_lock(&dentry->d_inode->i_mutex);
 out:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ef399a7..5b9567c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3480,6 +3480,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->generation = 0;
 	bi->sequence = 0;
 	bi->last_trans = 0;
+	bi->last_sub_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
 	bi->reserved_bytes = 0;
@@ -4980,7 +4981,9 @@ again:
 	set_page_dirty(page);
 	SetPageUptodate(page);
 
-	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+	BTRFS_I(inode)->last_trans = root->fs_info->generation;
+	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
@@ -5100,6 +5103,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
+	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
 	ei->outstanding_extents = 0;
 	ei->reserved_extents = 0;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 663c674..f68cbbe 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,6 +79,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
+	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 6d9ec28..0a1bde2 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1980,6 +1980,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	int ret;
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
+	u64 log_transid = 0;
 
 	mutex_lock(&root->log_mutex);
 	index1 = root->log_transid % 2;
@@ -2018,6 +2019,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	btrfs_set_root_node(&log->root_item, log->node);
 
 	root->log_batch = 0;
+	log_transid = root->log_transid;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
@@ -2095,6 +2097,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	write_ctree_super(trans, root->fs_info->tree_root, 1);
 	ret = 0;
 
+	mutex_lock(&root->log_mutex);
+	if (root->last_log_commit < log_transid)
+		root->last_log_commit = log_transid;
+	mutex_unlock(&root->log_mutex);
+
 out_wake_log_root:
 	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
@@ -2862,6 +2869,21 @@ out:
 	return ret;
 }
 
+static int inode_in_log(struct btrfs_trans_handle *trans,
+		 struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret = 0;
+
+	mutex_lock(&root->log_mutex);
+	if (BTRFS_I(inode)->logged_trans == trans->transid &&
+	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+		ret = 1;
+	mutex_unlock(&root->log_mutex);
+	return ret;
+}
+
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -2901,6 +2923,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;
 
+	if (inode_in_log(trans, inode)) {
+		ret = BTRFS_NO_LOG_SYNC;
+		goto end_no_trans;
+	}
+
 	start_log_trans(trans, root);
 
 	ret = btrfs_log_inode(trans, root, inode, inode_only);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index d09c760..0776eac 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,6 +19,9 @@
 #ifndef __TREE_LOG_
 #define __TREE_LOG_
 
+/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
+#define BTRFS_NO_LOG_SYNC 256
+
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-- 
cgit v1.1


From 690587d109ffe19d6743e4cc80c18b0906b7f9ff Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Oct 2009 13:29:19 -0400
Subject: Btrfs: streamline tree-log btree block writeout

Syncing the tree log is a 3 phase operation.

1) write and wait for all the tree log blocks for a given root.

2) write and wait for all the tree log blocks for the
tree of tree log roots.

3) write and wait for the super blocks (barriers here)

This isn't as efficient as it could be because there is
no requirement to wait for the blocks from step one to hit the disk
before we start writing the blocks from step two.  This commit
changes the sequence so that we don't start waiting until
all the tree blocks from both steps one and two have been sent
to disk.

We do this by breaking up btrfs_write_wait_marked_extents into
two functions, which is trivial because it was already broken
up into two parts.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/transaction.h |  4 ++++
 fs/btrfs/tree-log.c    |  8 +++++++-
 3 files changed, 53 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0b8f36d..bca82a4 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -344,10 +344,10 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 /*
  * when btree blocks are allocated, they have some corresponding bits set for
  * them in one of two extent_io trees.  This is used to make sure all of
- * those extents are on disk for transaction or log commit
+ * those extents are sent to disk but does not wait on them
  */
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-					struct extent_io_tree *dirty_pages)
+int btrfs_write_marked_extents(struct btrfs_root *root,
+			       struct extent_io_tree *dirty_pages)
 {
 	int ret;
 	int err = 0;
@@ -394,6 +394,29 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 			page_cache_release(page);
 		}
 	}
+	if (err)
+		werr = err;
+	return werr;
+}
+
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit.  We wait
+ * on all the pages and clear them from the dirty pages state tree
+ */
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+			      struct extent_io_tree *dirty_pages)
+{
+	int ret;
+	int err = 0;
+	int werr = 0;
+	struct page *page;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	u64 start = 0;
+	u64 end;
+	unsigned long index;
+
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
 					    EXTENT_DIRTY);
@@ -424,6 +447,22 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 	return werr;
 }
 
+/*
+ * when btree blocks are allocated, they have some corresponding bits set for
+ * them in one of two extent_io trees.  This is used to make sure all of
+ * those extents are on disk for transaction or log commit
+ */
+int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages)
+{
+	int ret;
+	int ret2;
+
+	ret = btrfs_write_marked_extents(root, dirty_pages);
+	ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+	return ret || ret2;
+}
+
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root)
 {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index f68cbbe..d4e3e7a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -108,5 +108,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages);
+int btrfs_write_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages);
+int btrfs_wait_marked_extents(struct btrfs_root *root,
+					struct extent_io_tree *dirty_pages);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0a1bde2..4aff766 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2013,7 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
+	/* we start IO on  all the marked extents here, but we don't actually
+	 * wait for them until later.
+	 */
+	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_root_node(&log->root_item, log->node);
@@ -2048,6 +2051,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages);
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
@@ -2067,6 +2071,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	 * check the full commit flag again
 	 */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		btrfs_wait_marked_extents(log, &log->dirty_log_pages);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out_wake_log_root;
@@ -2075,6 +2080,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
 				&log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
+	btrfs_wait_marked_extents(log, &log->dirty_log_pages);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
 				log_root_tree->node->start);
-- 
cgit v1.1


From 0eda294dfc980c1cbe4f8a0564bf543f86a01ddb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 13 Oct 2009 13:50:18 -0400
Subject: Btrfs: fix btrfs acl #ifdef checks

The btrfs acl code was #ifdefing for a define
that didn't exist.  This correctly matches it
to the values used by the Kconfig file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   | 6 +++---
 fs/btrfs/ctree.h | 2 +-
 fs/btrfs/super.c | 2 +-
 fs/btrfs/xattr.c | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 69b355a..3616042 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 
 static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.set	= btrfs_xattr_acl_access_set,
 };
 
-#else /* CONFIG_BTRFS_POSIX_ACL */
+#else /* CONFIG_BTRFS_FS_POSIX_ACL */
 
 int btrfs_acl_chmod(struct inode *inode)
 {
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 	return 0;
 }
 
-#endif /* CONFIG_BTRFS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index d0cede5..dfd7e6f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2374,7 +2374,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e0a6432..3fbbf07 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -344,7 +344,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	sb->s_export_op = &btrfs_export_ops;
 	sb->s_xattr = btrfs_xattr_handlers;
 	sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
 #endif
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b0fc93f..b6dd596 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
  * attributes are handled directly.
  */
 struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_POSIX_ACL
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
 #endif
-- 
cgit v1.1


From 5d5e103a70f74ae98e3965a4add1ab951d0651d1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 13 Oct 2009 16:46:49 -0400
Subject: Btrfs: fix possible ENOSPC problems with truncate

There's a problem where we don't do any space reservation for truncates, which
can cause you to OOPs because you will be allowed to go off in the weeds a bit
since we don't account for the delalloc bytes that are created as a result of
the truncate.

Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b9567c..78139ef 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3032,12 +3032,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
+	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+	if (ret)
+		goto out;
+
+	ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+	if (ret)
+		goto out;
 
 	ret = -ENOMEM;
 again:
 	page = grab_cache_page(mapping, index);
-	if (!page)
+	if (!page) {
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+		btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 		goto out;
+	}
 
 	page_start = page_offset(page);
 	page_end = page_start + PAGE_CACHE_SIZE - 1;
@@ -3070,6 +3080,10 @@ again:
 		goto again;
 	}
 
+	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  GFP_NOFS);
+
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
 	if (ret) {
 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -3088,6 +3102,9 @@ again:
 	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
+	if (ret)
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
 	unlock_page(page);
 	page_cache_release(page);
 out:
@@ -3111,7 +3128,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	if (size <= hole_start)
 		return 0;
 
-	btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	if (err)
+		return err;
 
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -5008,7 +5027,9 @@ static void btrfs_truncate(struct inode *inode)
 	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
 		return;
 
-	btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
+	if (ret)
+		return;
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 
 	trans = btrfs_start_transaction(root, 1);
-- 
cgit v1.1


From 86df7eb921a009515285e7171363fa57dd2d7d31 Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.yan@oracle.com>
Date: Wed, 14 Oct 2009 09:24:59 -0400
Subject: Btrfs: properly wait log writers during log sync

A recently fsync optimization make btrfs_sync_log skip calling
wait_for_writer in the single log writer case. This is incorrect
since the writer count can also be increased by btrfs_pin_log.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4aff766..f51bf13 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1995,12 +1995,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
 
-	while (root->log_multiple_pids) {
+	while (1) {
 		unsigned long batch = root->log_batch;
-		mutex_unlock(&root->log_mutex);
-		schedule_timeout_uninterruptible(1);
-		mutex_lock(&root->log_mutex);
-
+		if (root->log_multiple_pids) {
+			mutex_unlock(&root->log_mutex);
+			schedule_timeout_uninterruptible(1);
+			mutex_lock(&root->log_mutex);
+		}
 		wait_for_writer(trans, root);
 		if (batch == root->log_batch)
 			break;
-- 
cgit v1.1


From e244a0aeb6a599c19a7c802cda6e2d67c847b154 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Oct 2009 09:24:59 -0400
Subject: Btrfs: add -o discard option

Enable discard by default is not a good idea given the the trim speed
of SSD prototypes we've seen, and the carecteristics for many high-end
arrays.  Turn of discards by default and require the -o discard option
to enable them on.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 1 +
 fs/btrfs/extent-tree.c | 3 +++
 fs/btrfs/super.c       | 7 ++++++-
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index dfd7e6f..e5dd628 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1153,6 +1153,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 #define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
 #define BTRFS_MOUNT_NOSSD		(1 << 9)
+#define BTRFS_MOUNT_DISCARD		(1 << 10)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4aedbff..bf7782f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1584,6 +1584,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 	u64 map_length = num_bytes;
 	struct btrfs_multi_bio *multi = NULL;
 
+	if (!btrfs_test_opt(root, DISCARD))
+		return 0;
+
 	/* Tell the block device(s) that the sectors can be discarded */
 	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
 			      bytenr, &map_length, &multi, 0);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3fbbf07..939b68f0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,7 +66,8 @@ enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
 	Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
-	Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+	Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit,
+	Opt_discard, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -88,6 +89,7 @@ static match_table_t tokens = {
 	{Opt_notreelog, "notreelog"},
 	{Opt_flushoncommit, "flushoncommit"},
 	{Opt_ratio, "metadata_ratio=%d"},
+	{Opt_discard, "discard"},
 	{Opt_err, NULL},
 };
 
@@ -257,6 +259,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				       info->metadata_ratio);
 			}
 			break;
+		case Opt_discard:
+			btrfs_set_opt(info->mount_opt, DISCARD);
+			break;
 		default:
 			break;
 		}
-- 
cgit v1.1


From 0634857488ec6e28fa22920cd0bee3c2ac07ccfd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Oct 2009 09:24:59 -0400
Subject: Btrfs: enable discard support

The discard support code in btrfs currently is guarded by ifdefs for
BIO_RW_DISCARD, which is never defines as it's the name of an enum
memeber.  Just remove the useless ifdefs to actually enable the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index bf7782f..d4a3528 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1568,18 +1568,15 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-#ifdef BIO_RW_DISCARD
 static void btrfs_issue_discard(struct block_device *bdev,
 				u64 start, u64 len)
 {
 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
 }
-#endif
 
 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 				u64 num_bytes)
 {
-#ifdef BIO_RW_DISCARD
 	int ret;
 	u64 map_length = num_bytes;
 	struct btrfs_multi_bio *multi = NULL;
@@ -1606,9 +1603,6 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 	}
 
 	return ret;
-#else
-	return 0;
-#endif
 }
 
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-- 
cgit v1.1


From 444528b3e614f7f2391488d9bca8e0b872db909b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 14 Oct 2009 09:38:28 -0400
Subject: Btrfs: always pin metadata in discard mode

We have an optimization in btrfs to allow blocks to be
immediately freed if they were allocated in this transaction and never
written.  Otherwise they are pinned and freed when the transaction
commits.

This isn't optimal for discard mode because immediately freeing
them means immediately discarding them.  It is better to give the
block to the pinning code and letting the (slow) discard happen later.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d4a3528..c56f916 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3686,6 +3686,14 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	if (is_data)
 		goto pinit;
 
+	/*
+	 * discard is sloooow, and so triggering discards on
+	 * individual btree blocks isn't a good plan.  Just
+	 * pin everything in discard mode.
+	 */
+	if (btrfs_test_opt(root, DISCARD))
+		goto pinit;
+
 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
 	if (!buf)
 		goto pinit;
-- 
cgit v1.1


From a6a8357788d6a37f8ad0f7eb46b0a386b613abb9 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 6 Oct 2009 15:33:35 +0200
Subject: sysfs: Allow sysfs_move_dir(..., NULL) again.

As device_move() and kobject_move() both handle a NULL destination,
sysfs_move_dir() should do this as well (again) and fall back to
sysfs_root in that case.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Cc: Phil Carmody <ext-phil.2.carmody@nokia.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/dir.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 0050fc4..5fad489 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -894,7 +894,8 @@ int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
 
 	mutex_lock(&sysfs_rename_mutex);
 	BUG_ON(!sd->s_parent);
-	new_parent_sd = new_parent_kobj->sd ? new_parent_kobj->sd : &sysfs_root;
+	new_parent_sd = (new_parent_kobj && new_parent_kobj->sd) ?
+		new_parent_kobj->sd : &sysfs_root;
 
 	error = 0;
 	if (sd->s_parent == new_parent_sd)
-- 
cgit v1.1


From 83db93f4de2d9ae441a491d1dc61c2204f0195de Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Tue, 15 Sep 2009 16:05:51 -0700
Subject: sysfs: Allow sysfs_notify_dirent to be called from interrupt context.

sysfs_notify_dirent is a simple atomic operation that can be used to
alert user-space that new data can be read from a sysfs attribute.

Unfortunately it cannot currently be called from non-process context
because of its use of spin_lock which is sometimes taken with
interrupts enabled.

So change all lockers of sysfs_open_dirent_lock to disable interrupts,
thus making sysfs_notify_dirent safe to be called from non-process
context (as drivers/md does in md_safemode_timeout).

sysfs_get_open_dirent is (documented as being) only called from
process context, so it uses spin_lock_irq.  Other places
use spin_lock_irqsave.

The usage for sysfs_notify_dirent in md_safemode_timeout was
introduced in 2.6.28, so this patch is suitable for that and more
recent kernels.

Reported-by: Joel Andres Granados <jgranado@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Cc: stable <stable@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 561a9c0..f5ea468 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -268,7 +268,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 	struct sysfs_open_dirent *od, *new_od = NULL;
 
  retry:
-	spin_lock(&sysfs_open_dirent_lock);
+	spin_lock_irq(&sysfs_open_dirent_lock);
 
 	if (!sd->s_attr.open && new_od) {
 		sd->s_attr.open = new_od;
@@ -281,7 +281,7 @@ static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
 		list_add_tail(&buffer->list, &od->buffers);
 	}
 
-	spin_unlock(&sysfs_open_dirent_lock);
+	spin_unlock_irq(&sysfs_open_dirent_lock);
 
 	if (od) {
 		kfree(new_od);
@@ -315,8 +315,9 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
 				  struct sysfs_buffer *buffer)
 {
 	struct sysfs_open_dirent *od = sd->s_attr.open;
+	unsigned long flags;
 
-	spin_lock(&sysfs_open_dirent_lock);
+	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
 	list_del(&buffer->list);
 	if (atomic_dec_and_test(&od->refcnt))
@@ -324,7 +325,7 @@ static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
 	else
 		od = NULL;
 
-	spin_unlock(&sysfs_open_dirent_lock);
+	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
 
 	kfree(od);
 }
@@ -456,8 +457,9 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
 void sysfs_notify_dirent(struct sysfs_dirent *sd)
 {
 	struct sysfs_open_dirent *od;
+	unsigned long flags;
 
-	spin_lock(&sysfs_open_dirent_lock);
+	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
 
 	od = sd->s_attr.open;
 	if (od) {
@@ -465,7 +467,7 @@ void sysfs_notify_dirent(struct sysfs_dirent *sd)
 		wake_up_interruptible(&od->poll);
 	}
 
-	spin_unlock(&sysfs_open_dirent_lock);
+	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
 }
 EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
 
-- 
cgit v1.1