diff options
Diffstat (limited to 'fs')
226 files changed, 5574 insertions, 2640 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 48d4215..c55c614 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -68,7 +68,7 @@ static int v9fs_set_super(struct super_block *s, void *data) * v9fs_fill_super - populate superblock with info * @sb: superblock * @v9ses: session information - * @flags: flags propagated from v9fs_get_sb() + * @flags: flags propagated from v9fs_mount() * */ @@ -99,18 +99,16 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, } /** - * v9fs_get_sb - mount a superblock + * v9fs_mount - mount a superblock * @fs_type: file system type * @flags: mount flags * @dev_name: device name that was mounted * @data: mount options - * @mnt: mountpoint record to be instantiated * */ -static int v9fs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct super_block *sb = NULL; struct inode *inode = NULL; @@ -124,7 +122,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) - return -ENOMEM; + return ERR_PTR(-ENOMEM); fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { @@ -186,15 +184,15 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, v9fs_fid_add(root, fid); P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); clunk_fid: p9_client_clunk(fid); close_session: v9fs_session_close(v9ses); kfree(v9ses); - return retval; + return ERR_PTR(retval); + release_sb: /* * we will do the session_close and root dentry release @@ -204,7 +202,7 @@ release_sb: */ p9_client_clunk(fid); deactivate_locked_super(sb); - return retval; + return ERR_PTR(retval); } /** @@ -300,7 +298,7 @@ static const struct super_operations v9fs_super_ops_dotl = { struct file_system_type v9fs_fs_type = { .name = "9p", - .get_sb = v9fs_get_sb, + .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, diff --git a/fs/adfs/super.c b/fs/adfs/super.c index d9803f7..959dbff 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -490,17 +490,16 @@ error: return -EINVAL; } -static int adfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *adfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); } static struct file_system_type adfs_fs_type = { .owner = THIS_MODULE, .name = "adfs", - .get_sb = adfs_get_sb, + .mount = adfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/affs/super.c b/fs/affs/super.c index fa4fbe1..0cf7f43 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -573,17 +573,16 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int affs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *affs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super); } static struct file_system_type affs_fs_type = { .owner = THIS_MODULE, .name = "affs", - .get_sb = affs_get_sb, + .mount = affs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/afs/super.c b/fs/afs/super.c index eacf76d..27201cf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -29,9 +29,8 @@ #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ static void afs_i_init_once(void *foo); -static int afs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct vfsmount *mnt); +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data); static struct inode *afs_alloc_inode(struct super_block *sb); static void afs_put_super(struct super_block *sb); static void afs_destroy_inode(struct inode *inode); @@ -40,7 +39,7 @@ static int afs_statfs(struct dentry *dentry, struct kstatfs *buf); struct file_system_type afs_fs_type = { .owner = THIS_MODULE, .name = "afs", - .get_sb = afs_get_sb, + .mount = afs_mount, .kill_sb = kill_anon_super, .fs_flags = 0, }; @@ -359,11 +358,8 @@ error: /* * get an AFS superblock */ -static int afs_get_sb(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *options, - struct vfsmount *mnt) +static struct dentry *afs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *options) { struct afs_mount_params params; struct super_block *sb; @@ -427,12 +423,11 @@ static int afs_get_sb(struct file_system_type *fs_type, ASSERTCMP(sb->s_flags, &, MS_ACTIVE); } - simple_set_mnt(mnt, sb); afs_put_volume(params.volume); afs_put_cell(params.cell); kfree(new_opts); _leave(" = 0 [%p]", sb); - return 0; + return dget(sb->s_root); error: afs_put_volume(params.volume); @@ -440,7 +435,7 @@ error: key_put(params.key); kfree(new_opts); _leave(" = %d", ret); - return ret; + return ERR_PTR(ret); } /* diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 5365527..57ce55b 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -26,12 +26,10 @@ static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; static const struct file_operations anon_inode_fops; -static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC, - mnt); + return mount_pseudo(fs_type, "anon_inode:", NULL, ANON_INODE_FS_MAGIC); } /* @@ -45,7 +43,7 @@ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) static struct file_system_type anon_inode_fs_type = { .name = "anon_inodefs", - .get_sb = anon_inodefs_get_sb, + .mount = anon_inodefs_mount, .kill_sb = kill_anon_super, }; static const struct dentry_operations anon_inodefs_dentry_operations = { diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 9722e4b..c038727 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -14,16 +14,16 @@ #include <linux/init.h> #include "autofs_i.h" -static int autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *autofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt); + return mount_nodev(fs_type, flags, data, autofs4_fill_super); } static struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", - .get_sb = autofs_get_sb, + .mount = autofs_mount, .kill_sb = autofs4_kill_sb, }; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index dc39d28..aa4e7c7 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -913,18 +913,17 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -static int -befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) +static struct dentry * +befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super); } static struct file_system_type befs_fs_type = { .owner = THIS_MODULE, .name = "befs", - .get_sb = befs_get_sb, + .mount = befs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 883e77a..76db6d7 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -450,16 +450,16 @@ out: return ret; } -static int bfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *bfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); } static struct file_system_type bfs_fs_type = { .owner = THIS_MODULE, .name = "bfs", - .get_sb = bfs_get_sb, + .mount = bfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 29990f0..1befe2e 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -706,10 +706,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) return err; } -static int bm_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *bm_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); + return mount_single(fs_type, flags, data, bm_fill_super); } static struct linux_binfmt misc_format = { @@ -720,7 +720,7 @@ static struct linux_binfmt misc_format = { static struct file_system_type bm_fs_type = { .owner = THIS_MODULE, .name = "binfmt_misc", - .get_sb = bm_get_sb, + .mount = bm_mount, .kill_sb = kill_litter_super, }; @@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; @@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { diff --git a/fs/block_dev.c b/fs/block_dev.c index dea3b62..4230252 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> -#include <linux/smp_lock.h> #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> @@ -464,15 +463,15 @@ static const struct super_operations bdev_sops = { .evict_inode = bdev_evict_inode, }; -static int bd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *bd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); + return mount_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); } static struct file_system_type bd_type = { .name = "bdev", - .get_sb = bd_get_sb, + .mount = bd_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 396039b..b50bc4b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - struct bio *bio; int nr_vecs; nr_vecs = bio_get_nr_vecs(bdev); - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_byte >> 9; - } - return bio; + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); } static int check_compressed_csum(struct inode *inode, @@ -163,7 +150,6 @@ fail: */ static void end_compressed_bio_read(struct bio *bio, int err) { - struct extent_io_tree *tree; struct compressed_bio *cb = bio->bi_private; struct inode *inode; struct page *page; @@ -187,7 +173,6 @@ static void end_compressed_bio_read(struct bio *bio, int err) /* ok, we're the last bio for this extent, lets start * the decompression. */ - tree = &BTRFS_I(inode)->io_tree; ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, cb->start, cb->orig_bio->bi_io_vec, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c3df14c..9ac1715 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -200,7 +200,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid) { struct extent_buffer *cow; - u32 nritems; int ret = 0; int level; struct btrfs_disk_key disk_key; @@ -210,7 +209,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, WARN_ON(root->ref_cows && trans->transid != root->last_trans); level = btrfs_header_level(buf); - nritems = btrfs_header_nritems(buf); if (level == 0) btrfs_item_key(buf, &disk_key, 0); else @@ -1008,7 +1006,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, int wret; int pslot; int orig_slot = path->slots[level]; - int err_on_enospc = 0; u64 orig_ptr; if (level == 0) @@ -1071,8 +1068,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) < 2) - err_on_enospc = 1; + btrfs_header_nritems(mid); left = read_node_slot(root, parent, pslot - 1); if (left) { @@ -1103,8 +1099,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = push_node_left(trans, root, left, mid, 1); if (wret < 0) ret = wret; - if (btrfs_header_nritems(mid) < 2) - err_on_enospc = 1; + btrfs_header_nritems(mid); } /* @@ -1224,14 +1219,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, int wret; int pslot; int orig_slot = path->slots[level]; - u64 orig_ptr; if (level == 0) return 1; mid = path->nodes[level]; WARN_ON(btrfs_header_generation(mid) != trans->transid); - orig_ptr = btrfs_node_blockptr(mid, orig_slot); if (level < BTRFS_MAX_LEVEL - 1) parent = path->nodes[level + 1]; @@ -1577,13 +1570,33 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocksize = btrfs_level_size(root, level - 1); tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - /* - * we found an up to date block without sleeping, return - * right away - */ - *eb_ret = tmp; - return 0; + if (tmp) { + if (btrfs_buffer_uptodate(tmp, 0)) { + if (btrfs_buffer_uptodate(tmp, gen)) { + /* + * we found an up to date block without + * sleeping, return + * right away + */ + *eb_ret = tmp; + return 0; + } + /* the pages were up to date, but we failed + * the generation number check. Do a full + * read for the generation number that is correct. + * We must do this without dropping locks so + * we can trust our generation number + */ + free_extent_buffer(tmp); + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + free_extent_buffer(tmp); + btrfs_release_path(NULL, p); + return -EIO; + } } /* @@ -1596,8 +1609,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_unlock_up_safe(p, level + 1); btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); + free_extent_buffer(tmp); if (p->reada) reada_for_search(root, p, level, slot, key->objectid); @@ -2548,7 +2560,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, { struct btrfs_disk_key disk_key; struct extent_buffer *right = path->nodes[0]; - int slot; int i; int push_space = 0; int push_items = 0; @@ -2560,8 +2571,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, u32 this_item_size; u32 old_left_item_size; - slot = path->slots[1]; - if (empty) nr = min(right_nritems, max_slot); else @@ -3330,7 +3339,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, { int ret = 0; int slot; - int slot_orig; struct extent_buffer *leaf; struct btrfs_item *item; u32 nritems; @@ -3340,7 +3348,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans, unsigned int size_diff; int i; - slot_orig = path->slots[0]; leaf = path->nodes[0]; slot = path->slots[0]; @@ -3445,7 +3452,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, { int ret = 0; int slot; - int slot_orig; struct extent_buffer *leaf; struct btrfs_item *item; u32 nritems; @@ -3454,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans, unsigned int old_size; int i; - slot_orig = path->slots[0]; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -3787,7 +3792,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_key *cpu_key, u32 *data_size, int nr) { - struct extent_buffer *leaf; int ret = 0; int slot; int i; @@ -3804,7 +3808,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, if (ret < 0) goto out; - leaf = path->nodes[0]; slot = path->slots[0]; BUG_ON(slot < 0); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eaf286a..af52f6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -99,6 +99,9 @@ struct btrfs_ordered_sum; */ #define BTRFS_EXTENT_CSUM_OBJECTID -10ULL +/* For storing free space cache */ +#define BTRFS_FREE_SPACE_OBJECTID -11ULL + /* dummy objectid represents multiple objectids */ #define BTRFS_MULTIPLE_OBJECTIDS -255ULL @@ -265,6 +268,22 @@ struct btrfs_chunk { /* additional stripes go here */ } __attribute__ ((__packed__)); +#define BTRFS_FREE_SPACE_EXTENT 1 +#define BTRFS_FREE_SPACE_BITMAP 2 + +struct btrfs_free_space_entry { + __le64 offset; + __le64 bytes; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_free_space_header { + struct btrfs_disk_key location; + __le64 generation; + __le64 num_entries; + __le64 num_bitmaps; +} __attribute__ ((__packed__)); + static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -365,8 +384,10 @@ struct btrfs_super_block { char label[BTRFS_LABEL_SIZE]; + __le64 cache_generation; + /* future expansion */ - __le64 reserved[32]; + __le64 reserved[31]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; } __attribute__ ((__packed__)); @@ -375,13 +396,15 @@ struct btrfs_super_block { * ones specified below then we will fail to mount */ #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) -#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) +#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) +#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) /* * A leaf is full of items. offset and size tell us where to find @@ -675,7 +698,8 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; /* total bytes in the space */ + u64 total_bytes; /* total bytes in the space, + this doesn't take mirrors into account */ u64 bytes_used; /* total bytes used, this does't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the @@ -687,6 +711,8 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 disk_used; /* total bytes used on disk */ + u64 disk_total; /* total bytes on disk, takes mirrors into + account */ int full; /* indicates that we cannot allocate any more chunks for this space */ @@ -750,6 +776,14 @@ enum btrfs_caching_type { BTRFS_CACHE_FINISHED = 2, }; +enum btrfs_disk_cache_state { + BTRFS_DC_WRITTEN = 0, + BTRFS_DC_ERROR = 1, + BTRFS_DC_CLEAR = 2, + BTRFS_DC_SETUP = 3, + BTRFS_DC_NEED_WRITE = 4, +}; + struct btrfs_caching_control { struct list_head list; struct mutex mutex; @@ -763,6 +797,7 @@ struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; struct btrfs_fs_info *fs_info; + struct inode *inode; spinlock_t lock; u64 pinned; u64 reserved; @@ -773,8 +808,11 @@ struct btrfs_block_group_cache { int extents_thresh; int free_extents; int total_bitmaps; - int ro; - int dirty; + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; + + int disk_cache_state; /* cache tracking stuff */ int cached; @@ -863,6 +901,7 @@ struct btrfs_fs_info { struct btrfs_transaction *running_transaction; wait_queue_head_t transaction_throttle; wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; wait_queue_head_t async_submit_wait; struct btrfs_super_block super_copy; @@ -949,6 +988,7 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_workers; struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; + struct btrfs_workers endio_freespace_worker; struct btrfs_workers submit_workers; /* * fixup workers take dirty pages that didn't properly go through @@ -1192,6 +1232,9 @@ struct btrfs_root { #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) #define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) +#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) +#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) +#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1665,6 +1708,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, write_eb_member(eb, item, struct btrfs_dir_item, location, key); } +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + /* struct btrfs_disk_key */ BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); @@ -1876,6 +1940,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, incompat_flags, 64); BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); static inline int btrfs_super_csum_size(struct btrfs_super_block *s) { @@ -1988,6 +2054,12 @@ static inline struct dentry *fdentry(struct file *file) return file->f_path.dentry; } +static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info) +{ + return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) && + (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); +} + /* extent-tree.c */ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, @@ -2079,7 +2151,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries); + int num_items); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, @@ -2100,7 +2172,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries); + u64 num_bytes); int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, @@ -2115,6 +2187,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); int btrfs_set_block_group_rw(struct btrfs_root *root, struct btrfs_block_group_cache *cache); +void btrfs_put_block_group_cache(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2373,7 +2446,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, @@ -2426,6 +2500,10 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index e9103b3..f0cad5a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, ret = btrfs_truncate_item(trans, root, path, item_len - sub_item_len, 1); } - return 0; + return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5e789f4..c547cca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -28,6 +28,7 @@ #include <linux/freezer.h> #include <linux/crc32c.h> #include <linux/slab.h> +#include <linux/migrate.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -338,7 +339,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) struct extent_io_tree *tree; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; u64 found_start; - int found_level; unsigned long len; struct extent_buffer *eb; int ret; @@ -356,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); @@ -369,8 +371,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) WARN_ON(1); goto err; } - found_level = btrfs_header_level(eb); - csum_tree_block(root, eb, 0); err: free_extent_buffer(eb); @@ -481,9 +481,12 @@ static void end_workqueue_bio(struct bio *bio, int err) end_io_wq->work.flags = 0; if (bio->bi_rw & REQ_WRITE) { - if (end_io_wq->metadata) + if (end_io_wq->metadata == 1) btrfs_queue_worker(&fs_info->endio_meta_write_workers, &end_io_wq->work); + else if (end_io_wq->metadata == 2) + btrfs_queue_worker(&fs_info->endio_freespace_worker, + &end_io_wq->work); else btrfs_queue_worker(&fs_info->endio_write_workers, &end_io_wq->work); @@ -497,6 +500,13 @@ static void end_workqueue_bio(struct bio *bio, int err) } } +/* + * For the metadata arg you want + * + * 0 - if data + * 1 - if normal metadta + * 2 - if writing to the free space cache area + */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata) { @@ -533,11 +543,9 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) static void run_one_async_start(struct btrfs_work *work) { - struct btrfs_fs_info *fs_info; struct async_submit_bio *async; async = container_of(work, struct async_submit_bio, work); - fs_info = BTRFS_I(async->inode)->root->fs_info; async->submit_bio_start(async->inode, async->rw, async->bio, async->mirror_num, async->bio_flags, async->bio_offset); @@ -688,6 +696,29 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; +#ifdef CONFIG_MIGRATION + return migrate_page(mapping, newpage, page); +#else + return -ENOSYS; +#endif +} + static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -702,8 +733,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc) } redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); + eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); WARN_ON(!eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); @@ -794,6 +824,9 @@ static const struct address_space_operations btree_aops = { .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, .sync_page = block_sync_page, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -850,12 +883,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; - struct extent_io_tree *io_tree; int ret; - io_tree = &BTRFS_I(btree_inode)->io_tree; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) return NULL; @@ -1377,7 +1406,6 @@ static int bio_ready_for_csum(struct bio *bio) u64 start = 0; struct page *page; struct extent_io_tree *io_tree = NULL; - struct btrfs_fs_info *info = NULL; struct bio_vec *bvec; int i; int ret; @@ -1396,7 +1424,6 @@ static int bio_ready_for_csum(struct bio *bio) buf_len = page->private >> 2; start = page_offset(page) + bvec->bv_offset; io_tree = &BTRFS_I(page->mapping->host)->io_tree; - info = BTRFS_I(page->mapping->host)->root->fs_info; } /* are we fully contained in this bio? */ if (buf_len <= length) @@ -1539,10 +1566,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, GFP_NOFS); struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), - GFP_NOFS); + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), @@ -1680,12 +1705,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); __setup_root(4096, 4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); - bh = btrfs_read_dev_super(fs_devices->latest_bdev); if (!bh) goto fail_iput; @@ -1775,6 +1800,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", fs_info->thread_pool_size, &fs_info->generic_worker); + btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write", + 1, &fs_info->generic_worker); /* * endios are largely parallel and should have a very @@ -1795,6 +1822,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); + btrfs_start_workers(&fs_info->endio_freespace_worker, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1993,6 +2021,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { down_read(&fs_info->cleanup_work_sem); btrfs_orphan_cleanup(fs_info->fs_root); + btrfs_orphan_cleanup(fs_info->tree_root); up_read(&fs_info->cleanup_work_sem); } @@ -2035,6 +2064,7 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); @@ -2410,6 +2440,7 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); + btrfs_put_block_group_cache(fs_info); if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) @@ -2456,6 +2487,7 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_workers); btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->endio_freespace_worker); btrfs_stop_workers(&fs_info->submit_workers); btrfs_close_devices(fs_info->fs_devices); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 951ef09..6f04444 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -232,9 +232,85 @@ fail: return ERR_PTR(ret); } +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = inode->i_ino; + key.offset = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + const struct export_operations btrfs_export_ops = { .encode_fh = btrfs_encode_fh, .fh_to_dentry = btrfs_fh_to_dentry, .fh_to_parent = btrfs_fh_to_parent, .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0b81ecd..bcd59c7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -242,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache) return NULL; } + /* We're loading it the fast way, so we don't have a caching_ctl. */ + if (!cache->caching_ctl) { + spin_unlock(&cache->lock); + return NULL; + } + ctl = cache->caching_ctl; atomic_inc(&ctl->count); spin_unlock(&cache->lock); @@ -421,7 +427,9 @@ err: return 0; } -static int cache_block_group(struct btrfs_block_group_cache *cache) +static int cache_block_group(struct btrfs_block_group_cache *cache, + struct btrfs_trans_handle *trans, + int load_cache_only) { struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_caching_control *caching_ctl; @@ -432,6 +440,36 @@ static int cache_block_group(struct btrfs_block_group_cache *cache) if (cache->cached != BTRFS_CACHE_NO) return 0; + /* + * We can't do the read from on-disk cache during a commit since we need + * to have the normal tree locking. + */ + if (!trans->transaction->in_commit) { + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return 0; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + ret = load_free_space_cache(fs_info, cache); + + spin_lock(&cache->lock); + if (ret == 1) { + cache->cached = BTRFS_CACHE_FINISHED; + cache->last_byte_to_unpin = (u64)-1; + } else { + cache->cached = BTRFS_CACHE_NO; + } + spin_unlock(&cache->lock); + if (ret == 1) + return 0; + } + + if (load_cache_only) + return 0; + caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); BUG_ON(!caching_ctl); @@ -509,7 +547,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { - if (found->flags == flags) { + if (found->flags & flags) { rcu_read_unlock(); return found; } @@ -542,6 +580,15 @@ static u64 div_factor(u64 num, int factor) return num; } +static u64 div_factor_fine(u64 num, int factor) +{ + if (factor == 100) + return num; + num *= factor; + do_div(num, 100); + return num; +} + u64 btrfs_find_block_group(struct btrfs_root *root, u64 search_start, u64 search_hint, int owner) { @@ -2687,6 +2734,109 @@ next_block_group(struct btrfs_root *root, return cache; } +static int cache_save_setup(struct btrfs_block_group_cache *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) +{ + struct btrfs_root *root = block_group->fs_info->tree_root; + struct inode *inode = NULL; + u64 alloc_hint = 0; + int num_pages = 0; + int retries = 0; + int ret = 0; + + /* + * If this block group is smaller than 100 megs don't bother caching the + * block group. + */ + if (block_group->key.offset < (100 * 1024 * 1024)) { + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + spin_unlock(&block_group->lock); + return 0; + } + +again: + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + btrfs_release_path(root, path); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retries); + retries++; + + if (block_group->ro) + goto out_free; + + ret = create_free_space_inode(root, trans, block_group, path); + if (ret) + goto out_free; + goto again; + } + + /* + * We want to set the generation to 0, that way if anything goes wrong + * from here on out we know not to trust this cache when we load up next + * time. + */ + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, + inode); + if (ret) + goto out_put; + } + + spin_lock(&block_group->lock); + if (block_group->cached != BTRFS_CACHE_FINISHED) { + spin_unlock(&block_group->lock); + goto out_put; + } + spin_unlock(&block_group->lock); + + num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024); + if (!num_pages) + num_pages = 1; + + /* + * Just to make absolutely sure we have enough space, we're going to + * preallocate 12 pages worth of space for each block group. In + * practice we ought to use at most 8, but we need extra space so we can + * add our header and have a terminator between the extents and the + * bitmaps. + */ + num_pages *= 16; + num_pages *= PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, num_pages); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, + num_pages, num_pages, + &alloc_hint); + btrfs_free_reserved_data_space(inode, num_pages); +out_put: + iput(inode); +out_free: + btrfs_release_path(root, path); +out: + spin_lock(&block_group->lock); + if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + else + block_group->disk_cache_state = BTRFS_DC_SETUP; + spin_unlock(&block_group->lock); + + return ret; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -2699,6 +2849,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + err = cache_save_setup(cache, trans, path); + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + while (1) { if (last == 0) { err = btrfs_run_delayed_refs(trans, root, @@ -2708,6 +2877,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache = btrfs_lookup_first_block_group(root->fs_info, last); while (cache) { + if (cache->disk_cache_state == BTRFS_DC_CLEAR) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->dirty) break; cache = next_block_group(root, cache); @@ -2719,6 +2893,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, continue; } + if (cache->disk_cache_state == BTRFS_DC_SETUP) + cache->disk_cache_state = BTRFS_DC_NEED_WRITE; cache->dirty = 0; last = cache->key.objectid + cache->key.offset; @@ -2727,6 +2903,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, btrfs_put_block_group(cache); } + while (1) { + /* + * I don't think this is needed since we're just marking our + * preallocated extent as written, but just in case it can't + * hurt. + */ + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); + } + + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + /* + * Really this shouldn't happen, but it could if we + * couldn't write the entire preallocated extent and + * splitting the extent resulted in a new block. + */ + if (cache->dirty) { + btrfs_put_block_group(cache); + goto again; + } + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } + + btrfs_write_out_cache(root, trans, cache, path); + + /* + * If we didn't have an error then the cache state is still + * NEED_WRITE, so we can set it to WRITTEN. + */ + if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE) + cache->disk_cache_state = BTRFS_DC_WRITTEN; + last = cache->key.objectid + cache->key.offset; + btrfs_put_block_group(cache); + } + btrfs_free_path(path); return 0; } @@ -2762,6 +2984,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; + found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; found->full = 0; @@ -2781,6 +3004,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; + found->disk_total = total_bytes * factor; found->bytes_used = bytes_used; found->disk_used = bytes_used * factor; found->bytes_pinned = 0; @@ -2882,11 +3106,16 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; u64 used; - int ret = 0, committed = 0; + int ret = 0, committed = 0, alloc_chunk = 1; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + if (root == root->fs_info->tree_root) { + alloc_chunk = 0; + committed = 1; + } + data_sinfo = BTRFS_I(inode)->space_info; if (!data_sinfo) goto alloc; @@ -2905,7 +3134,7 @@ again: * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ - if (!data_sinfo->full) { + if (!data_sinfo->full && alloc_chunk) { u64 alloc_target; data_sinfo->force_alloc = 1; @@ -2997,10 +3226,11 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } -static int should_alloc_chunk(struct btrfs_space_info *sinfo, - u64 alloc_bytes) +static int should_alloc_chunk(struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 alloc_bytes) { u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + u64 thresh; if (sinfo->bytes_used + sinfo->bytes_reserved + alloc_bytes + 256 * 1024 * 1024 < num_bytes) @@ -3010,6 +3240,12 @@ static int should_alloc_chunk(struct btrfs_space_info *sinfo, alloc_bytes < div_factor(num_bytes, 8)) return 0; + thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); + thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); + + if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) + return 0; + return 1; } @@ -3041,13 +3277,21 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { + if (!force && !should_alloc_chunk(extent_root, space_info, + alloc_bytes)) { spin_unlock(&space_info->lock); goto out; } spin_unlock(&space_info->lock); /* + * If we have mixed data/metadata chunks we want to make sure we keep + * allocating mixed chunks instead of individual chunks. + */ + if (btrfs_mixed_space_info(space_info)) + flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA); + + /* * if we're doing a data chunk, go ahead and make sure that * we keep a reasonable number of metadata chunks allocated in the * FS as well. @@ -3072,55 +3316,25 @@ out: return ret; } -static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_space_info *sinfo, u64 num_bytes) -{ - int ret; - int end_trans = 0; - - if (sinfo->full) - return 0; - - spin_lock(&sinfo->lock); - ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); - spin_unlock(&sinfo->lock); - if (!ret) - return 0; - - if (!trans) { - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - end_trans = 1; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - num_bytes + 2 * 1024 * 1024, - get_alloc_profile(root, sinfo->flags), 0); - - if (end_trans) - btrfs_end_transaction(trans, root); - - return ret == 1 ? 1 : 0; -} - /* * shrink metadata reservation for delalloc */ static int shrink_delalloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 to_reclaim) + struct btrfs_root *root, u64 to_reclaim, int sync) { struct btrfs_block_rsv *block_rsv; + struct btrfs_space_info *space_info; u64 reserved; u64 max_reclaim; u64 reclaimed = 0; int pause = 1; - int ret; + int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; block_rsv = &root->fs_info->delalloc_block_rsv; - spin_lock(&block_rsv->lock); - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + space_info = block_rsv->space_info; + + smp_mb(); + reserved = space_info->bytes_reserved; if (reserved == 0) return 0; @@ -3128,104 +3342,169 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, max_reclaim = min(reserved, to_reclaim); while (1) { - ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); - if (!ret) { - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(pause); - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; - } else { - pause = 1; - } + /* have the flusher threads jump in and do some IO */ + smp_mb(); + nr_pages = min_t(unsigned long, nr_pages, + root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); + writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); - spin_lock(&block_rsv->lock); - if (reserved > block_rsv->reserved) - reclaimed = reserved - block_rsv->reserved; - reserved = block_rsv->reserved; - spin_unlock(&block_rsv->lock); + spin_lock(&space_info->lock); + if (reserved > space_info->bytes_reserved) + reclaimed += reserved - space_info->bytes_reserved; + reserved = space_info->bytes_reserved; + spin_unlock(&space_info->lock); if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pause); + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + } return reclaimed >= to_reclaim; } -static int should_retry_reserve(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) +/* + * Retries tells us how many times we've called reserve_metadata_bytes. The + * idea is if this is the first call (retries == 0) then we will add to our + * reserved count if we can't make the allocation in order to hold our place + * while we go and try and free up space. That way for retries > 1 we don't try + * and add space, we just check to see if the amount of unused space is >= the + * total space, meaning that our reservation is valid. + * + * However if we don't intend to retry this reservation, pass -1 as retries so + * that it short circuits this logic. + */ +static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 orig_bytes, int flush) { struct btrfs_space_info *space_info = block_rsv->space_info; - int ret; + u64 unused; + u64 num_bytes = orig_bytes; + int retries = 0; + int ret = 0; + bool reserved = false; + bool committed = false; - if ((*retries) > 2) - return -ENOSPC; +again: + ret = -ENOSPC; + if (reserved) + num_bytes = 0; - ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); - if (ret) - return 1; + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly + + space_info->bytes_may_use; - if (trans && trans->transaction->in_commit) - return -ENOSPC; + /* + * The idea here is that we've not already over-reserved the block group + * then we can go ahead and save our reservation first and then start + * flushing if we need to. Otherwise if we've already overcommitted + * lets start flushing stuff first and then come back and try to make + * our reservation. + */ + if (unused <= space_info->total_bytes) { + unused = space_info->total_bytes - unused; + if (unused >= num_bytes) { + if (!reserved) + space_info->bytes_reserved += orig_bytes; + ret = 0; + } else { + /* + * Ok set num_bytes to orig_bytes since we aren't + * overocmmitted, this way we only try and reclaim what + * we need. + */ + num_bytes = orig_bytes; + } + } else { + /* + * Ok we're over committed, set num_bytes to the overcommitted + * amount plus the amount of bytes that we need for this + * reservation. + */ + num_bytes = unused - space_info->total_bytes + + (orig_bytes * (retries + 1)); + } - ret = shrink_delalloc(trans, root, num_bytes); - if (ret) - return ret; + /* + * Couldn't make our reservation, save our place so while we're trying + * to reclaim space we can actually use it instead of somebody else + * stealing it from us. + */ + if (ret && !reserved) { + space_info->bytes_reserved += orig_bytes; + reserved = true; + } - spin_lock(&space_info->lock); - if (space_info->bytes_pinned < num_bytes) - ret = 1; spin_unlock(&space_info->lock); - if (ret) - return -ENOSPC; - (*retries)++; - - if (trans) - return -EAGAIN; + if (!ret) + return 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(IS_ERR(trans)); - ret = btrfs_commit_transaction(trans, root); - BUG_ON(ret); + if (!flush) + goto out; - return 1; -} + /* + * We do synchronous shrinking since we don't actually unreserve + * metadata until after the IO is completed. + */ + ret = shrink_delalloc(trans, root, num_bytes, 1); + if (ret > 0) + return 0; + else if (ret < 0) + goto out; -static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, - u64 num_bytes) -{ - struct btrfs_space_info *space_info = block_rsv->space_info; - u64 unused; - int ret = -ENOSPC; + /* + * So if we were overcommitted it's possible that somebody else flushed + * out enough space and we simply didn't have enough space to reclaim, + * so go back around and try again. + */ + if (retries < 2) { + retries++; + goto again; + } spin_lock(&space_info->lock); - unused = space_info->bytes_used + space_info->bytes_reserved + - space_info->bytes_pinned + space_info->bytes_readonly; + /* + * Not enough space to be reclaimed, don't bother committing the + * transaction. + */ + if (space_info->bytes_pinned < orig_bytes) + ret = -ENOSPC; + spin_unlock(&space_info->lock); + if (ret) + goto out; - if (unused < space_info->total_bytes) - unused = space_info->total_bytes - unused; - else - unused = 0; + ret = -EAGAIN; + if (trans || committed) + goto out; - if (unused >= num_bytes) { - if (block_rsv->priority >= 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } else { - if ((unused + block_rsv->reserved) * - block_rsv->priority >= - (num_bytes + block_rsv->reserved) * 10) { - space_info->bytes_reserved += num_bytes; - ret = 0; - } - } + ret = -ENOSPC; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + goto out; + ret = btrfs_commit_transaction(trans, root); + if (!ret) { + trans = NULL; + committed = true; + goto again; + } + +out: + if (reserved) { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= orig_bytes; + spin_unlock(&space_info->lock); } - spin_unlock(&space_info->lock); return ret; } @@ -3327,18 +3606,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) { struct btrfs_block_rsv *block_rsv; struct btrfs_fs_info *fs_info = root->fs_info; - u64 alloc_target; block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); if (!block_rsv) return NULL; btrfs_init_block_rsv(block_rsv); - - alloc_target = btrfs_get_alloc_profile(root, 0); block_rsv->space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); - return block_rsv; } @@ -3369,23 +3644,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, - u64 num_bytes, int *retries) + u64 num_bytes) { int ret; if (num_bytes == 0) return 0; -again: - ret = reserve_metadata_bytes(block_rsv, num_bytes); + + ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 1); return 0; } - ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); - if (ret > 0) - goto again; - return ret; } @@ -3420,7 +3691,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, return 0; if (block_rsv->refill_used) { - ret = reserve_metadata_bytes(block_rsv, num_bytes); + ret = reserve_metadata_bytes(trans, root, block_rsv, + num_bytes, 0); if (!ret) { block_rsv_add_bytes(block_rsv, num_bytes, 0); return 0; @@ -3499,6 +3771,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&sinfo->lock); + if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) + data_used = 0; meta_used = sinfo->bytes_used; spin_unlock(&sinfo->lock); @@ -3526,7 +3800,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->size = num_bytes; num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + - sinfo->bytes_reserved + sinfo->bytes_readonly; + sinfo->bytes_reserved + sinfo->bytes_readonly + + sinfo->bytes_may_use; if (sinfo->total_bytes > num_bytes) { num_bytes = sinfo->total_bytes - num_bytes; @@ -3597,7 +3872,7 @@ static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root, - int num_items, int *retries) + int num_items) { u64 num_bytes; int ret; @@ -3607,7 +3882,7 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, num_bytes = calc_trans_metadata_size(root, num_items); ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, - num_bytes, retries); + num_bytes); if (!ret) { trans->bytes_reserved += num_bytes; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -3681,14 +3956,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; u64 to_reserve; int nr_extents; - int retries = 0; int ret; if (btrfs_transaction_in_commit(root->fs_info)) schedule_timeout(1); num_bytes = ALIGN(num_bytes, root->sectorsize); -again: + spin_lock(&BTRFS_I(inode)->accounting_lock); nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; if (nr_extents > BTRFS_I(inode)->reserved_extents) { @@ -3698,18 +3972,14 @@ again: nr_extents = 0; to_reserve = 0; } + spin_unlock(&BTRFS_I(inode)->accounting_lock); to_reserve += calc_csum_metadata_size(inode, num_bytes); - ret = reserve_metadata_bytes(block_rsv, to_reserve); - if (ret) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, - &retries); - if (ret > 0) - goto again; + ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); + if (ret) return ret; - } + spin_lock(&BTRFS_I(inode)->accounting_lock); BTRFS_I(inode)->reserved_extents += nr_extents; atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_unlock(&BTRFS_I(inode)->accounting_lock); @@ -3717,7 +3987,7 @@ again: block_rsv_add_bytes(block_rsv, to_reserve, 1); if (block_rsv->size > 512 * 1024 * 1024) - shrink_delalloc(NULL, root, to_reserve); + shrink_delalloc(NULL, root, to_reserve, 0); return 0; } @@ -3776,12 +4046,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, int alloc) { - struct btrfs_block_group_cache *cache; + struct btrfs_block_group_cache *cache = NULL; struct btrfs_fs_info *info = root->fs_info; - int factor; u64 total = num_bytes; u64 old_val; u64 byte_in_group; + int factor; /* block accounting for super block */ spin_lock(&info->delalloc_lock); @@ -3803,11 +4073,25 @@ static int update_block_group(struct btrfs_trans_handle *trans, factor = 2; else factor = 1; + /* + * If this block group has free space cache written out, we + * need to make sure to load it if we are removing space. This + * is because we need the unpinning stage to actually add the + * space back to the block group, otherwise we will leak space. + */ + if (!alloc && cache->cached == BTRFS_CACHE_NO) + cache_block_group(cache, trans, 1); + byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); + + if (btrfs_super_cache_generation(&info->super_copy) != 0 && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; + cache->dirty = 1; old_val = btrfs_block_group_used(&cache->item); num_bytes = min(total, cache->key.offset - byte_in_group); @@ -4554,6 +4838,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; + bool use_cluster = true; u64 ideal_cache_percent = 0; u64 ideal_cache_offset = 0; @@ -4568,16 +4853,24 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, return -ENOSPC; } + /* + * If the space info is for both data and metadata it means we have a + * small filesystem and we can't use the clustering stuff. + */ + if (btrfs_mixed_space_info(space_info)) + use_cluster = false; + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; - if (data & BTRFS_BLOCK_GROUP_METADATA) { + if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster && + btrfs_test_opt(root, SSD)) { last_ptr = &root->fs_info->data_alloc_cluster; } @@ -4641,6 +4934,10 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { u64 free_percent; + ret = cache_block_group(block_group, trans, 1); + if (block_group->cached == BTRFS_CACHE_FINISHED) + goto have_block_group; + free_percent = btrfs_block_group_used(&block_group->item); free_percent *= 100; free_percent = div64_u64(free_percent, @@ -4661,7 +4958,7 @@ have_block_group: if (loop > LOOP_CACHING_NOWAIT || (loop > LOOP_FIND_IDEAL && atomic_read(&space_info->caching_threads) < 2)) { - ret = cache_block_group(block_group); + ret = cache_block_group(block_group, trans, 0); BUG_ON(ret); } found_uncached_bg = true; @@ -5218,7 +5515,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group); + cache_block_group(block_group, trans, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -5308,7 +5605,8 @@ use_block_rsv(struct btrfs_trans_handle *trans, block_rsv = get_block_rsv(trans, root); if (block_rsv->size == 0) { - ret = reserve_metadata_bytes(block_rsv, blocksize); + ret = reserve_metadata_bytes(trans, root, block_rsv, + blocksize, 0); if (ret) return ERR_PTR(ret); return block_rsv; @@ -5318,11 +5616,6 @@ use_block_rsv(struct btrfs_trans_handle *trans, if (!ret) return block_rsv; - WARN_ON(1); - printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", - block_rsv->size, block_rsv->reserved, - block_rsv->freed[0], block_rsv->freed[1]); - return ERR_PTR(-ENOSPC); } @@ -5421,7 +5714,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, u64 generation; u64 refs; u64 flags; - u64 last = 0; u32 nritems; u32 blocksize; struct btrfs_key key; @@ -5489,7 +5781,6 @@ reada: generation); if (ret) break; - last = bytenr + blocksize; nread++; } wc->reada_slot = slot; @@ -7813,6 +8104,40 @@ out: return ret; } +void btrfs_put_block_group_cache(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + u64 last = 0; + + while (1) { + struct inode *inode; + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { + spin_lock(&block_group->lock); + if (block_group->iref) + break; + spin_unlock(&block_group->lock); + block_group = next_block_group(info->tree_root, + block_group); + } + if (!block_group) { + if (last == 0) + break; + last = 0; + continue; + } + + inode = block_group->inode; + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + last = block_group->key.objectid + block_group->key.offset; + btrfs_put_block_group(block_group); + } +} + int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; @@ -7896,6 +8221,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf; + int need_clear = 0; + u64 cache_gen; root = info->extent_root; key.objectid = 0; @@ -7905,6 +8232,15 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (!path) return -ENOMEM; + cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); + if (cache_gen != 0 && + btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) + need_clear = 1; + if (btrfs_test_opt(root, CLEAR_CACHE)) + need_clear = 1; + if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen) + printk(KERN_INFO "btrfs: disk space caching is enabled\n"); + while (1) { ret = find_first_block_group(root, path, &key); if (ret > 0) @@ -7927,6 +8263,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + if (need_clear) + cache->disk_cache_state = BTRFS_DC_CLEAR; + /* * we only want to have 32k of ram per block group for keeping * track of free space, and if we pass 1/2 of that we want to @@ -8031,6 +8370,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; /* * we only want to have 32k of ram per block group for keeping track @@ -8087,8 +8427,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_block_group_cache *block_group; struct btrfs_free_cluster *cluster; + struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_key key; + struct inode *inode; int ret; + int factor; root = root->fs_info->extent_root; @@ -8097,6 +8440,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, BUG_ON(!block_group->ro); memcpy(&key, &block_group->key, sizeof(key)); + if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; /* make sure this block group isn't part of an allocation cluster */ cluster = &root->fs_info->data_alloc_cluster; @@ -8116,6 +8465,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); + inode = lookup_free_space_inode(root, block_group, path); + if (!IS_ERR(inode)) { + btrfs_orphan_add(trans, inode); + clear_nlink(inode); + /* One for the block groups ref */ + spin_lock(&block_group->lock); + if (block_group->iref) { + block_group->iref = 0; + block_group->inode = NULL; + spin_unlock(&block_group->lock); + iput(inode); + } else { + spin_unlock(&block_group->lock); + } + /* One for our lookup ref */ + iput(inode); + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) + goto out; + if (ret > 0) + btrfs_release_path(tree_root, path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) + goto out; + btrfs_release_path(tree_root, path); + } + spin_lock(&root->fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); @@ -8137,8 +8520,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; + block_group->space_info->disk_total -= block_group->key.offset * factor; spin_unlock(&block_group->space_info->lock); + memcpy(&key, &block_group->key, sizeof(key)); + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d74e6af..3e86b9f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -104,7 +104,7 @@ void extent_io_tree_init(struct extent_io_tree *tree, struct address_space *mapping, gfp_t mask) { tree->state = RB_ROOT; - tree->buffer = RB_ROOT; + INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC); tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); @@ -235,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, return ret; } -static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, - u64 offset, struct rb_node *node) -{ - struct rb_root *root = &tree->buffer; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct extent_buffer *eb; - - while (*p) { - parent = *p; - eb = rb_entry(parent, struct extent_buffer, rb_node); - - if (offset < eb->start) - p = &(*p)->rb_left; - else if (offset > eb->start) - p = &(*p)->rb_right; - else - return eb; - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - -static struct extent_buffer *buffer_search(struct extent_io_tree *tree, - u64 offset) -{ - struct rb_root *root = &tree->buffer; - struct rb_node *n = root->rb_node; - struct extent_buffer *eb; - - while (n) { - eb = rb_entry(n, struct extent_buffer, rb_node); - if (offset < eb->start) - n = n->rb_left; - else if (offset > eb->start) - n = n->rb_right; - else - return eb; - } - return NULL; -} - static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, struct extent_state *other) { @@ -1872,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) bio_put(bio); } -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) { struct bio *bio; @@ -1901,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; u64 start; - u64 end; start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; - end = start + bvec->bv_len - 1; bio->bi_private = NULL; @@ -1965,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, else nr = bio_get_nr_vecs(bdev); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -2204,7 +2158,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 last_byte = i_size_read(inode); u64 block_start; u64 iosize; - u64 unlock_start; sector_t sector; struct extent_state *cached_state = NULL; struct extent_map *em; @@ -2329,7 +2282,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, start, page_end, NULL, 1); - unlock_start = page_end + 1; goto done; } @@ -2340,7 +2292,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (tree->ops && tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, page_end, NULL, 1); - unlock_start = page_end + 1; break; } em = epd->get_extent(inode, page, pg_offset, cur, @@ -2387,7 +2338,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, cur += iosize; pg_offset += iosize; - unlock_start = cur; continue; } /* leave this out until we have a page_mkwrite call */ @@ -2473,7 +2423,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, pgoff_t index; pgoff_t end; /* Inclusive */ int scanned = 0; - int range_whole = 0; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -2482,8 +2431,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, } else { index = wbc->range_start >> PAGE_CACHE_SHIFT; end = wbc->range_end >> PAGE_CACHE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; scanned = 1; } retry: @@ -2823,6 +2770,8 @@ int extent_prepare_write(struct extent_io_tree *tree, NULL, 1, end_bio_extent_preparewrite, 0, 0, 0); + if (ret && !err) + err = ret; iocount++; block_start = block_start + iosize; } else { @@ -2952,21 +2901,53 @@ out: int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { - int ret; + int ret = 0; u64 off = start; u64 max = start + len; u32 flags = 0; + u32 found_type; + u64 last; u64 disko = 0; + struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0, em_len = 0; unsigned long emflags; - ret = 0; + int hole = 0; if (len == 0) return -EINVAL; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, inode->i_ino, -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, just return */ + if (found_key.objectid != inode->i_ino || + found_type != BTRFS_EXTENT_DATA_KEY) { + btrfs_free_path(path); + return 0; + } + last = found_key.offset; + btrfs_free_path(path); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); em = get_extent(inode, NULL, 0, off, max - off, 0); @@ -2976,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = PTR_ERR(em); goto out; } + while (!end) { + hole = 0; off = em->start + em->len; if (off >= max) end = 1; + if (em->block_start == EXTENT_MAP_HOLE) { + hole = 1; + goto next; + } + em_start = em->start; em_len = em->len; @@ -2990,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_HOLE) { - flags |= FIEMAP_EXTENT_UNWRITTEN; } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); @@ -3004,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; +next: emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { em = get_extent(inode, NULL, 0, off, max - off, 0); if (!em) @@ -3018,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } emflags = em->flags; } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; + if (em_start == last) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + if (!hole) { + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } } out_free: free_extent_map(em); @@ -3104,6 +3098,39 @@ static void __free_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } +/* + * Helper for releasing extent buffer page. + */ +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, + unsigned long start_idx) +{ + unsigned long index; + struct page *page; + + if (!eb->first_page) + return; + + index = num_extent_pages(eb->start, eb->len); + if (start_idx >= index) + return; + + do { + index--; + page = extent_buffer_page(eb, index); + if (page) + page_cache_release(page); + } while (index != start_idx); +} + +/* + * Helper for releasing the extent buffer. + */ +static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) +{ + btrfs_release_extent_buffer_page(eb, 0); + __free_extent_buffer(eb); +} + struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, u64 start, unsigned long len, struct page *page0, @@ -3117,16 +3144,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct page *p; struct address_space *mapping = tree->mapping; int uptodate = 1; + int ret; - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (eb) { - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); mark_page_accessed(eb->first_page); return eb; } - spin_unlock(&tree->buffer_lock); + rcu_read_unlock(); eb = __alloc_extent_buffer(tree, start, len, mask); if (!eb) @@ -3165,26 +3192,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); + if (ret) + goto free_eb; + spin_lock(&tree->buffer_lock); - exists = buffer_tree_insert(tree, start, &eb->rb_node); - if (exists) { + ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb); + if (ret == -EEXIST) { + exists = radix_tree_lookup(&tree->buffer, + start >> PAGE_CACHE_SHIFT); /* add one reference for the caller */ atomic_inc(&exists->refs); spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); goto free_eb; } /* add one reference for the tree */ atomic_inc(&eb->refs); spin_unlock(&tree->buffer_lock); + radix_tree_preload_end(); return eb; free_eb: if (!atomic_dec_and_test(&eb->refs)) return exists; - for (index = 1; index < i; index++) - page_cache_release(extent_buffer_page(eb, index)); - page_cache_release(extent_buffer_page(eb, 0)); - __free_extent_buffer(eb); + btrfs_release_extent_buffer(eb); return exists; } @@ -3194,16 +3226,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, { struct extent_buffer *eb; - spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (eb) - atomic_inc(&eb->refs); - spin_unlock(&tree->buffer_lock); - - if (eb) + rcu_read_lock(); + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); mark_page_accessed(eb->first_page); + return eb; + } + rcu_read_unlock(); - return eb; + return NULL; } void free_extent_buffer(struct extent_buffer *eb) @@ -3833,34 +3865,47 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, } } +static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) +{ + struct extent_buffer *eb = + container_of(head, struct extent_buffer, rcu_head); + + btrfs_release_extent_buffer(eb); +} + int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) { u64 start = page_offset(page); struct extent_buffer *eb; int ret = 1; - unsigned long i; - unsigned long num_pages; spin_lock(&tree->buffer_lock); - eb = buffer_search(tree, start); - if (!eb) - goto out; + eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; + } - if (atomic_read(&eb->refs) > 1) { + if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; goto out; } - if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + + /* + * set @eb->refs to 0 if it is already 1, and then release the @eb. + * Or go back. + */ + if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) { ret = 0; goto out; } - /* at this point we can safely release the extent buffer */ - num_pages = num_extent_pages(eb->start, eb->len); - for (i = 0; i < num_pages; i++) - page_cache_release(extent_buffer_page(eb, i)); - rb_erase(&eb->rb_node, &tree->buffer); - __free_extent_buffer(eb); + + radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); out: spin_unlock(&tree->buffer_lock); + + /* at this point we can safely release the extent buffer */ + if (atomic_read(&eb->refs) == 0) + call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5691c7b..4183c81 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -85,7 +85,7 @@ struct extent_io_ops { struct extent_io_tree { struct rb_root state; - struct rb_root buffer; + struct radix_tree_root buffer; struct address_space *mapping; u64 dirty_bytes; spinlock_t lock; @@ -123,7 +123,7 @@ struct extent_buffer { unsigned long bflags; atomic_t refs; struct list_head leak_list; - struct rb_node rb_node; + struct rcu_head rcu_head; /* the spinlock is used to protect most operations */ spinlock_t lock; @@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, unsigned long op); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 454ca52..23cb8da 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, goto out; } if (IS_ERR(rb_node)) { - em = ERR_PTR(PTR_ERR(rb_node)); + em = ERR_CAST(rb_node); goto out; } em = rb_entry(rb_node, struct extent_map, rb_node); @@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, goto out; } if (IS_ERR(rb_node)) { - em = ERR_PTR(PTR_ERR(rb_node)); + em = ERR_CAST(rb_node); goto out; } em = rb_entry(rb_node, struct extent_map, rb_node); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e354c33..c1faded 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1047,8 +1047,14 @@ out: if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + num_written = PTR_ERR(trans); + goto done; + } + mutex_lock(&inode->i_mutex); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + mutex_unlock(&inode->i_mutex); if (ret == 0) { ret = btrfs_sync_log(trans, root); if (ret == 0) @@ -1067,6 +1073,7 @@ out: (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); } } +done: current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f488fac..22ee0dc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -23,10 +23,761 @@ #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" +#include "disk-io.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +static void recalculate_thresholds(struct btrfs_block_group_cache + *block_group); +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info); + +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_key location; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; + int ret; + + spin_lock(&block_group->lock); + if (block_group->inode) + inode = igrab(block_group->inode); + spin_unlock(&block_group->lock); + if (inode) + return inode; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + btrfs_release_path(root, path); + return ERR_PTR(-ENOENT); + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_free_space_key(leaf, header, &disk_key); + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(root, path); + + inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); + if (!inode) + return ERR_PTR(-ENOENT); + if (IS_ERR(inode)) + return inode; + if (is_bad_inode(inode)) { + iput(inode); + return ERR_PTR(-ENOENT); + } + + spin_lock(&block_group->lock); + if (!root->fs_info->closing) { + block_group->inode = igrab(inode); + block_group->iref = 1; + } + spin_unlock(&block_group->lock); + + return inode; +} + +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key disk_key; + struct btrfs_free_space_header *header; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + u64 objectid; + int ret; + + ret = btrfs_find_free_objectid(trans, root, 0, &objectid); + if (ret < 0) + return ret; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + return ret; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + btrfs_item_key(leaf, &disk_key, path->slots[0]); + memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + sizeof(*inode_item)); + btrfs_set_inode_generation(leaf, inode_item, trans->transid); + btrfs_set_inode_size(leaf, inode_item, 0); + btrfs_set_inode_nbytes(leaf, inode_item, 0); + btrfs_set_inode_uid(leaf, inode_item, 0); + btrfs_set_inode_gid(leaf, inode_item, 0); + btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); + btrfs_set_inode_nlink(leaf, inode_item, 1); + btrfs_set_inode_transid(leaf, inode_item, trans->transid); + btrfs_set_inode_block_group(leaf, inode_item, + block_group->key.objectid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_free_space_header)); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + btrfs_set_free_space_key(leaf, header, &disk_key); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + + return 0; +} + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode) +{ + loff_t oldsize; + int ret = 0; + + trans->block_rsv = root->orphan_block_rsv; + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, + 0, 5); + if (ret) + return ret; + + oldsize = i_size_read(inode); + btrfs_i_size_write(inode, 0); + truncate_pagecache(inode, oldsize, 0); + + /* + * We don't need an orphan item because truncating the free space cache + * will never be split across transactions. + */ + ret = btrfs_truncate_inode_items(trans, root, inode, + 0, BTRFS_EXTENT_DATA_KEY); + if (ret) { + WARN_ON(1); + return ret; + } + + return btrfs_update_inode(trans, root, inode); +} + +static int readahead_cache(struct inode *inode) +{ + struct file_ra_state *ra; + unsigned long last_index; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + + file_ra_state_init(ra, inode->i_mapping); + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); + + kfree(ra); + + return 0; +} + +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *root = fs_info->tree_root; + struct inode *inode; + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct page *page; + struct btrfs_path *path; + u32 *checksums = NULL, *crc; + char *disk_crcs = NULL; + struct btrfs_key key; + struct list_head bitmaps; + u64 num_entries; + u64 num_bitmaps; + u64 generation; + u32 cur_crc = ~(u32)0; + pgoff_t index = 0; + unsigned long first_page_offset; + int num_checksums; + int ret = 0; + + /* + * If we're unmounting then just return, since this does a search on the + * normal root and not the commit root and we could deadlock. + */ + smp_mb(); + if (fs_info->closing) + return 0; + + /* + * If this block group has been marked to be cleared for one reason or + * another then we can't trust the on disk cache, so just return. + */ + spin_lock(&block_group->lock); + if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + INIT_LIST_HEAD(&bitmaps); + + path = btrfs_alloc_path(); + if (!path) + return 0; + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) { + btrfs_free_path(path); + return 0; + } + + /* Nothing in the space cache, goodbye */ + if (!i_size_read(inode)) { + btrfs_free_path(path); + goto out; + } + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret) { + btrfs_free_path(path); + goto out; + } + + leaf = path->nodes[0]; + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + num_entries = btrfs_free_space_entries(leaf, header); + num_bitmaps = btrfs_free_space_bitmaps(leaf, header); + generation = btrfs_free_space_generation(leaf, header); + btrfs_free_path(path); + + if (BTRFS_I(inode)->generation != generation) { + printk(KERN_ERR "btrfs: free space inode generation (%llu) did" + " not match free space cache generation (%llu) for " + "block group %llu\n", + (unsigned long long)BTRFS_I(inode)->generation, + (unsigned long long)generation, + (unsigned long long)block_group->key.objectid); + goto out; + } + + if (!num_entries) + goto out; + + /* Setup everything for doing checksumming */ + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; + checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + if (!checksums) + goto out; + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + disk_crcs = kzalloc(first_page_offset, GFP_NOFS); + if (!disk_crcs) + goto out; + + ret = readahead_cache(inode); + if (ret) { + ret = 0; + goto out; + } + + while (1) { + struct btrfs_free_space_entry *entry; + struct btrfs_free_space *e; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + int need_loop = 0; + + if (!num_entries && !num_bitmaps) + break; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + ret = 0; + goto free_cache; + } + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + printk(KERN_ERR "btrfs: error reading free " + "space cache: %llu\n", + (unsigned long long) + block_group->key.objectid); + goto free_cache; + } + } + addr = kmap(page); + + if (index == 0) { + u64 *gen; + + memcpy(disk_crcs, addr, first_page_offset); + gen = addr + (sizeof(u32) * num_checksums); + if (*gen != BTRFS_I(inode)->generation) { + printk(KERN_ERR "btrfs: space cache generation" + " (%llu) does not match inode (%llu) " + "for block group %llu\n", + (unsigned long long)*gen, + (unsigned long long) + BTRFS_I(inode)->generation, + (unsigned long long) + block_group->key.objectid); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc = (u32 *)disk_crcs; + } + entry = addr + start_offset; + + /* First lets check our crc before we do anything fun */ + cur_crc = ~(u32)0; + cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc, + PAGE_CACHE_SIZE - start_offset); + btrfs_csum_final(cur_crc, (char *)&cur_crc); + if (cur_crc != *crc) { + printk(KERN_ERR "btrfs: crc mismatch for page %lu in " + "block group %llu\n", index, + (unsigned long long)block_group->key.objectid); + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + crc++; + + while (1) { + if (!num_entries) + break; + + need_loop = 1; + e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!e) { + kunmap(page); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + e->offset = le64_to_cpu(entry->offset); + e->bytes = le64_to_cpu(entry->bytes); + if (!e->bytes) { + kunmap(page); + kfree(e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + + if (entry->type == BTRFS_FREE_SPACE_EXTENT) { + spin_lock(&block_group->tree_lock); + ret = link_free_space(block_group, e); + spin_unlock(&block_group->tree_lock); + BUG_ON(ret); + } else { + e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!e->bitmap) { + kunmap(page); + kfree(e); + unlock_page(page); + page_cache_release(page); + goto free_cache; + } + spin_lock(&block_group->tree_lock); + ret = link_free_space(block_group, e); + block_group->total_bitmaps++; + recalculate_thresholds(block_group); + spin_unlock(&block_group->tree_lock); + list_add_tail(&e->list, &bitmaps); + } + + num_entries--; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + break; + entry++; + } + + /* + * We read an entry out of this page, we need to move on to the + * next page. + */ + if (need_loop) { + kunmap(page); + goto next; + } + + /* + * We add the bitmaps at the end of the entries in order that + * the bitmap entries are added to the cache. + */ + e = list_entry(bitmaps.next, struct btrfs_free_space, list); + list_del_init(&e->list); + memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); + kunmap(page); + num_bitmaps--; +next: + unlock_page(page); + page_cache_release(page); + index++; + } + + ret = 1; +out: + kfree(checksums); + kfree(disk_crcs); + iput(inode); + return ret; + +free_cache: + /* This cache is bogus, make sure it gets cleared */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + btrfs_remove_free_space_cache(block_group); + goto out; +} + +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode; + struct rb_node *node; + struct list_head *pos, *n; + struct page *page; + struct extent_state *cached_state = NULL; + struct list_head bitmap_list; + struct btrfs_key key; + u64 bytes = 0; + u32 *crc, *checksums; + pgoff_t index = 0, last_index = 0; + unsigned long first_page_offset; + int num_checksums; + int entries = 0; + int bitmaps = 0; + int ret = 0; + + root = root->fs_info->tree_root; + + INIT_LIST_HEAD(&bitmap_list); + + spin_lock(&block_group->lock); + if (block_group->disk_cache_state < BTRFS_DC_SETUP) { + spin_unlock(&block_group->lock); + return 0; + } + spin_unlock(&block_group->lock); + + inode = lookup_free_space_inode(root, block_group, path); + if (IS_ERR(inode)) + return 0; + + if (!i_size_read(inode)) { + iput(inode); + return 0; + } + + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + filemap_write_and_wait(inode->i_mapping); + btrfs_wait_ordered_range(inode, inode->i_size & + ~(root->sectorsize - 1), (u64)-1); + + /* We need a checksum per page. */ + num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; + crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS); + if (!crc) { + iput(inode); + return 0; + } + + /* Since the first page has all of our checksums and our generation we + * need to calculate the offset into the page that we can start writing + * our entries. + */ + first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); + + node = rb_first(&block_group->free_space_offset); + if (!node) + goto out_free; + + /* + * Lock all pages first so we can lock the extent safely. + * + * NOTE: Because we hold the ref the entire time we're going to write to + * the page find_get_page should never fail, so we don't do a check + * after find_get_page at this point. Just putting this here so people + * know and don't freak out. + */ + while (index <= last_index) { + page = grab_cache_page(inode->i_mapping, index); + if (!page) { + pgoff_t i = 0; + + while (i < index) { + page = find_get_page(inode->i_mapping, i); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + i++; + } + goto out_free; + } + index++; + } + + index = 0; + lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + 0, &cached_state, GFP_NOFS); + + /* Write out the extent entries */ + do { + struct btrfs_free_space_entry *entry; + void *addr; + unsigned long offset = 0; + unsigned long start_offset = 0; + + if (index == 0) { + start_offset = first_page_offset; + offset = start_offset; + } + + page = find_get_page(inode->i_mapping, index); + + addr = kmap(page); + entry = addr + start_offset; + + memset(addr, 0, PAGE_CACHE_SIZE); + while (1) { + struct btrfs_free_space *e; + + e = rb_entry(node, struct btrfs_free_space, offset_index); + entries++; + + entry->offset = cpu_to_le64(e->offset); + entry->bytes = cpu_to_le64(e->bytes); + if (e->bitmap) { + entry->type = BTRFS_FREE_SPACE_BITMAP; + list_add_tail(&e->list, &bitmap_list); + bitmaps++; + } else { + entry->type = BTRFS_FREE_SPACE_EXTENT; + } + node = rb_next(node); + if (!node) + break; + offset += sizeof(struct btrfs_free_space_entry); + if (offset + sizeof(struct btrfs_free_space_entry) >= + PAGE_CACHE_SIZE) + break; + entry++; + } + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr + start_offset, *crc, + PAGE_CACHE_SIZE - start_offset); + kunmap(page); + + btrfs_csum_final(*crc, (char *)crc); + crc++; + + bytes += PAGE_CACHE_SIZE; + + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + + /* + * We need to release our reference we got for grab_cache_page, + * except for the first page which will hold our checksums, we + * do that below. + */ + if (index != 0) { + unlock_page(page); + page_cache_release(page); + } + + page_cache_release(page); + + index++; + } while (node); + + /* Write out the bitmaps */ + list_for_each_safe(pos, n, &bitmap_list) { + void *addr; + struct btrfs_free_space *entry = + list_entry(pos, struct btrfs_free_space, list); + + page = find_get_page(inode->i_mapping, index); + + addr = kmap(page); + memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); + *crc = ~(u32)0; + *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE); + kunmap(page); + btrfs_csum_final(*crc, (char *)crc); + crc++; + bytes += PAGE_CACHE_SIZE; + + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + list_del_init(&entry->list); + index++; + } + + /* Zero out the rest of the pages just to make sure */ + while (index <= last_index) { + void *addr; + + page = find_get_page(inode->i_mapping, index); + + addr = kmap(page); + memset(addr, 0, PAGE_CACHE_SIZE); + kunmap(page); + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + bytes += PAGE_CACHE_SIZE; + index++; + } + + btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state); + + /* Write the checksums and trans id to the first page */ + { + void *addr; + u64 *gen; + + page = find_get_page(inode->i_mapping, 0); + + addr = kmap(page); + memcpy(addr, checksums, sizeof(u32) * num_checksums); + gen = addr + (sizeof(u32) * num_checksums); + *gen = trans->transid; + kunmap(page); + ClearPageChecked(page); + set_page_extent_mapped(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + page_cache_release(page); + } + BTRFS_I(inode)->generation = trans->transid; + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, + i_size_read(inode) - 1, &cached_state, GFP_NOFS); + + filemap_write_and_wait(inode->i_mapping); + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; + key.offset = block_group->key.objectid; + key.type = 0; + + ret = btrfs_search_slot(trans, root, &key, path, 1, 1); + if (ret < 0) { + ret = 0; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); + goto out_free; + } + leaf = path->nodes[0]; + if (ret > 0) { + struct btrfs_key found_key; + BUG_ON(!path->slots[0]); + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || + found_key.offset != block_group->key.objectid) { + ret = 0; + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, + GFP_NOFS); + btrfs_release_path(root, path); + goto out_free; + } + } + header = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_header); + btrfs_set_free_space_entries(leaf, header, entries); + btrfs_set_free_space_bitmaps(leaf, header, bitmaps); + btrfs_set_free_space_generation(leaf, header, trans->transid); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + + ret = 1; + +out_free: + if (ret == 0) { + invalidate_inode_pages2_range(inode->i_mapping, 0, index); + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + BTRFS_I(inode)->generation = 0; + } + kfree(checksums); + btrfs_update_inode(trans, root, inode); + iput(inode); + return ret; +} + static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, u64 offset) { diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 890a8e7..e49ca5c 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -27,6 +27,24 @@ struct btrfs_free_space { struct list_head list; }; +struct inode *lookup_free_space_inode(struct btrfs_root *root, + struct btrfs_block_group_cache + *block_group, struct btrfs_path *path); +int create_free_space_inode(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +int btrfs_truncate_free_space_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_path *path, + struct inode *inode); +int load_free_space_cache(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 64f99cf..8039390 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -319,8 +319,6 @@ static noinline int compress_file_range(struct inode *inode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 num_bytes; - u64 orig_start; - u64 disk_num_bytes; u64 blocksize = root->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); @@ -335,8 +333,6 @@ static noinline int compress_file_range(struct inode *inode, int i; int will_compress; - orig_start = start; - actual_end = min_t(u64, isize, end + 1); again: will_compress = 0; @@ -371,7 +367,6 @@ again: total_compressed = min(total_compressed, max_uncompressed); num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); - disk_num_bytes = num_bytes; total_in = 0; ret = 0; @@ -467,7 +462,6 @@ again: if (total_compressed >= total_in) { will_compress = 0; } else { - disk_num_bytes = total_compressed; num_bytes = total_in; } } @@ -757,20 +751,17 @@ static noinline int cow_file_range(struct inode *inode, u64 disk_num_bytes; u64 cur_alloc_size; u64 blocksize = root->sectorsize; - u64 actual_end; - u64 isize = i_size_read(inode); struct btrfs_key ins; struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; + BUG_ON(root == root->fs_info->tree_root); trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; - actual_end = min_t(u64, isize, end + 1); - num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); disk_num_bytes = num_bytes; @@ -1035,10 +1026,16 @@ static noinline int run_delalloc_nocow(struct inode *inode, int type; int nocow; int check_prev = 1; + bool nolock = false; path = btrfs_alloc_path(); BUG_ON(!path); - trans = btrfs_join_transaction(root, 1); + if (root == root->fs_info->tree_root) { + nolock = true; + trans = btrfs_join_transaction_nolock(root, 1); + } else { + trans = btrfs_join_transaction(root, 1); + } BUG_ON(!trans); cow_start = (u64)-1; @@ -1211,8 +1208,13 @@ out_check: BUG_ON(ret); } - ret = btrfs_end_transaction(trans, root); - BUG_ON(ret); + if (nolock) { + ret = btrfs_end_transaction_nolock(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + } btrfs_free_path(path); return 0; } @@ -1289,6 +1291,8 @@ static int btrfs_set_bit_hook(struct inode *inode, if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + int do_list = (root->root_key.objectid != + BTRFS_ROOT_TREE_OBJECTID); if (*bits & EXTENT_FIRST_DELALLOC) *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1298,7 +1302,7 @@ static int btrfs_set_bit_hook(struct inode *inode, spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += len; root->fs_info->delalloc_bytes += len; - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); } @@ -1321,6 +1325,8 @@ static int btrfs_clear_bit_hook(struct inode *inode, if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; + int do_list = (root->root_key.objectid != + BTRFS_ROOT_TREE_OBJECTID); if (*bits & EXTENT_FIRST_DELALLOC) *bits &= ~EXTENT_FIRST_DELALLOC; @@ -1330,14 +1336,15 @@ static int btrfs_clear_bit_hook(struct inode *inode, if (*bits & EXTENT_DO_ACCOUNTING) btrfs_delalloc_release_metadata(inode, len); - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && do_list) btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); root->fs_info->delalloc_bytes -= len; BTRFS_I(inode)->delalloc_bytes -= len; - if (BTRFS_I(inode)->delalloc_bytes == 0 && + if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_del_init(&BTRFS_I(inode)->delalloc_inodes); } @@ -1372,7 +1379,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, if (map_length < length + size) return 1; - return 0; + return ret; } /* @@ -1426,7 +1433,10 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (root == root->fs_info->tree_root) + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); + else + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); BUG_ON(ret); if (!(rw & REQ_WRITE)) { @@ -1662,6 +1672,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct extent_state *cached_state = NULL; int compressed = 0; int ret; + bool nolock = false; ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, end - start + 1); @@ -1669,11 +1680,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) return 0; BUG_ON(!ordered_extent); + nolock = (root == root->fs_info->tree_root); + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { - trans = btrfs_join_transaction(root, 1); + if (nolock) + trans = btrfs_join_transaction_nolock(root, 1); + else + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); @@ -1686,7 +1703,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->file_offset + ordered_extent->len - 1, 0, &cached_state, GFP_NOFS); - trans = btrfs_join_transaction(root, 1); + if (nolock) + trans = btrfs_join_transaction_nolock(root, 1); + else + trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); trans->block_rsv = &root->fs_info->delalloc_block_rsv; @@ -1700,6 +1720,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->len); BUG_ON(ret); } else { + BUG_ON(root == root->fs_info->tree_root); ret = insert_reserved_file_extent(trans, inode, ordered_extent->file_offset, ordered_extent->start, @@ -1724,9 +1745,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_delalloc_release_metadata(inode, ordered_extent->len); - if (trans) - btrfs_end_transaction(trans, root); + if (nolock) { + if (trans) + btrfs_end_transaction_nolock(trans, root); + } else { + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); + } + /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -2237,7 +2264,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_path *path; struct extent_buffer *leaf; - struct btrfs_item *item; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; @@ -2275,7 +2301,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* pull out the item */ leaf = path->nodes[0]; - item = btrfs_item_nr(leaf, path->slots[0]); btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ @@ -2651,7 +2676,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); - BUG_ON(ret); + if (ret == -ENOENT) + ret = 0; err: btrfs_free_path(path); if (ret) @@ -2672,8 +2698,8 @@ static int check_path_shared(struct btrfs_root *root, { struct extent_buffer *eb; int level; - int ret; u64 refs = 1; + int uninitialized_var(ret); for (level = 0; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) @@ -2686,7 +2712,7 @@ static int check_path_shared(struct btrfs_root *root, if (refs > 1) return 1; } - return 0; + return ret; /* XXX callers? */ } /* @@ -3196,7 +3222,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); - if (root->ref_cows) + if (root->ref_cows || root == root->fs_info->tree_root) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); path = btrfs_alloc_path(); @@ -3344,7 +3370,8 @@ delete: } else { break; } - if (found_extent && root->ref_cows) { + if (found_extent && (root->ref_cows || + root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, @@ -3675,7 +3702,8 @@ void btrfs_evict_inode(struct inode *inode) int ret; truncate_inode_pages(&inode->i_data, 0); - if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0) + if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || + root == root->fs_info->tree_root)) goto no_delete; if (is_bad_inode(inode)) { @@ -3888,7 +3916,14 @@ static void inode_tree_del(struct inode *inode) } spin_unlock(&root->inode_lock); - if (empty && btrfs_root_refs(&root->root_item) == 0) { + /* + * Free space cache has inodes in the tree root, but the tree root has a + * root_refs of 0, so this could end up dropping the tree root as a + * snapshot, so we need the extra !root->fs_info->tree_root check to + * make sure we don't drop it. + */ + if (empty && btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) { synchronize_srcu(&root->fs_info->subvol_srcu); spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); @@ -4282,14 +4317,24 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret = 0; + bool nolock = false; if (BTRFS_I(inode)->dummy_inode) return 0; + smp_mb(); + nolock = (root->fs_info->closing && root == root->fs_info->tree_root); + if (wbc->sync_mode == WB_SYNC_ALL) { - trans = btrfs_join_transaction(root, 1); + if (nolock) + trans = btrfs_join_transaction_nolock(root, 1); + else + trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_commit_transaction(trans, root); + if (nolock) + ret = btrfs_end_transaction_nolock(trans, root); + else + ret = btrfs_commit_transaction(trans, root); } return ret; } @@ -4456,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) @@ -4577,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode, - int backref, u64 index) + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) { - int err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, backref, index); + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); if (!err) { d_instantiate(dentry, inode); return 0; @@ -4623,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -4637,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4685,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, - objectid, BTRFS_I(dir)->block_group, mode, - &index); + dentry->d_name.len, dir->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_unlock; @@ -4700,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4742,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -EPERM; btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; err = btrfs_set_inode_index(dir, &index); if (err) @@ -4760,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_set_trans_block_group(trans, dir); ihold(inode); - err = btrfs_add_nondir(trans, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); if (err) { drop_inode = 1; } else { + struct dentry *parent = dget_parent(dentry); btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_log_new_name(trans, inode, NULL, parent); + dput(parent); } nr = trans->blocks_used; @@ -4808,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFDIR | mode, &index); if (IS_ERR(inode)) { @@ -4832,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; - err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail; @@ -5490,13 +5534,21 @@ struct btrfs_dio_private { u64 bytes; u32 *csums; void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + struct bio *orig_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) { + struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_vec *bvec = bio->bi_io_vec; - struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; u64 start; @@ -5550,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; struct extent_state *cached_state = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; int ret; if (err) goto out_done; - - ret = btrfs_dec_test_ordered_pending(inode, &ordered, - dip->logical_offset, dip->bytes); +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes); if (!ret) - goto out_done; + goto out_test; BUG_ON(!ordered); @@ -5618,8 +5673,20 @@ out_unlock: out: btrfs_delalloc_release_metadata(inode, ordered->len); btrfs_end_transaction(trans, root); + ordered_offset = ordered->file_offset + ordered->len; btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); + +out_test: + /* + * our bio might span multiple ordered extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + goto again; + } out_done: bio->bi_private = dip->private; @@ -5639,13 +5706,182 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " + "disk_bytenr %lu len %u err no %d\n", + dip->inode->i_ino, bio->bi_rw, bio->bi_sector, + bio->bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) + bio_io_error(dip->orig_bio); + else { + set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + u32 *csums) +{ + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + bio_get(bio); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + file_offset, csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + u32 *csums = dip->csums; + int ret = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + return -EIO; + } + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + csums); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + if (!skip_sum) + csums = csums + nr_pages; + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages ++; + bvec++; + } + } + + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + csums); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, loff_t file_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; struct bio_vec *bvec = bio->bi_io_vec; - u64 start; int skip_sum; int write = rw & REQ_WRITE; int ret = 0; @@ -5671,7 +5907,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, dip->inode = inode; dip->logical_offset = file_offset; - start = dip->logical_offset; dip->bytes = 0; do { dip->bytes += bvec->bv_len; @@ -5680,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, dip->disk_bytenr = (u64)bio->bi_sector << 9; bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = bio; + atomic_set(&dip->pending_bios, 0); if (write) bio->bi_end_io = btrfs_endio_direct_write; else bio->bi_end_io = btrfs_endio_direct_read; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto out_err; - - if (write && !skip_sum) { - ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, 0, 0, - dip->logical_offset, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); - if (ret) - goto out_err; + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) return; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, - dip->logical_offset, dip->csums); - - ret = btrfs_map_bio(root, rw, bio, 0, 1); - if (ret) - goto out_err; - return; -out_err: - kfree(dip->csums); - kfree(dip); free_ordered: /* * If this is a write, we need to clean up the reserved space and kill @@ -6308,6 +6525,21 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } + if (root == root->fs_info->tree_root) { + struct btrfs_block_group_cache *block_group; + + block_group = btrfs_lookup_block_group(root->fs_info, + BTRFS_I(inode)->block_group); + if (block_group && block_group->inode == inode) { + spin_lock(&block_group->lock); + block_group->inode = NULL; + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } + spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", @@ -6340,7 +6572,8 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - if (btrfs_root_refs(&root->root_item) == 0) + if (btrfs_root_refs(&root->root_item) == 0 && + root != root->fs_info->tree_root) return 1; else return generic_drop_inode(inode); @@ -6548,8 +6781,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(ret); if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + struct dentry *parent = dget_parent(new_dentry); + btrfs_log_new_name(trans, old_inode, old_dir, parent); + dput(parent); btrfs_end_log_trans(root); } out_fail: @@ -6609,7 +6843,8 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } -int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput, + int sync) { struct btrfs_inode *binode; struct inode *inode = NULL; @@ -6631,7 +6866,26 @@ int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { - write_inode_now(inode, 0); + if (sync) { + filemap_write_and_wait(inode->i_mapping); + /* + * We have to do this because compression doesn't + * actually set PG_writeback until it submits the pages + * for IO, which happens in an async thread, so we could + * race and not actually wait for any writeback pages + * because they've not been submitted yet. Technically + * this could still be the case for the ordered stuff + * since the async thread may not have started to do its + * work yet. If this becomes the case then we need to + * figure out a way to make sure that in writepage we + * wait for any async pages to be submitted before + * returning so that fdatawait does what its supposed to + * do. + */ + btrfs_wait_ordered_range(inode, 0, (u64)-1); + } else { + filemap_flush(inode->i_mapping); + } if (delay_iput) btrfs_add_delayed_iput(inode); else @@ -6679,8 +6933,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, &index); err = PTR_ERR(inode); @@ -6694,7 +6947,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -6757,27 +7010,34 @@ out_unlock: return err; } -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint) +static int __btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint, + struct btrfs_trans_handle *trans) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; + u64 i_size; int ret = 0; + bool own_trans = true; + if (trans) + own_trans = false; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; + if (own_trans) { + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } } ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { - btrfs_end_transaction(trans, root); + if (own_trans) + btrfs_end_transaction(trans, root); break; } @@ -6800,21 +7060,40 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode, (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size_write(inode, actual_len); + i_size = actual_len; else - i_size_write(inode, cur_offset); - i_size_write(inode, cur_offset); - btrfs_ordered_update_i_size(inode, cur_offset, NULL); + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); } ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); + if (own_trans) + btrfs_end_transaction(trans, root); } return ret; } +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, + NULL); +} + +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) +{ + return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, + min_size, actual_len, alloc_hint, trans); +} + static long btrfs_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { @@ -6839,6 +7118,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); if (ret) @@ -7035,6 +7318,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = btrfs_getattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9254b3d..f1c9bb4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -224,7 +224,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int create_subvol(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen) + char *name, int namelen, + u64 *async_transid) { struct btrfs_trans_handle *trans; struct btrfs_key key; @@ -232,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *new_root; - struct inode *dir = dentry->d_parent->d_inode; + struct dentry *parent = dget_parent(dentry); + struct inode *dir; int ret; int err; u64 objectid; @@ -241,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 0, &objectid); - if (ret) + if (ret) { + dput(parent); return ret; + } + + dir = parent->d_inode; + /* * 1 - inode item * 2 - refs @@ -250,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root, * 2 - dir items */ trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + dput(parent); return PTR_ERR(trans); + } leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -338,15 +347,23 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: - err = btrfs_commit_transaction(trans, root); + dput(parent); + if (async_transid) { + *async_transid = trans->transid; + err = btrfs_commit_transaction_async(trans, root, 1); + } else { + err = btrfs_commit_transaction(trans, root); + } if (err && !ret) ret = err; return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen, u64 *async_transid) { struct inode *inode; + struct dentry *parent; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; @@ -373,7 +390,14 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); + if (async_transid) { + *async_transid = trans->transid; + ret = btrfs_commit_transaction_async(trans, + root->fs_info->extent_root, 1); + } else { + ret = btrfs_commit_transaction(trans, + root->fs_info->extent_root); + } BUG_ON(ret); ret = pending_snapshot->error; @@ -382,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) btrfs_orphan_cleanup(pending_snapshot->snap); - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + parent = dget_parent(dentry); + inode = btrfs_lookup_dentry(parent->d_inode, dentry); + dput(parent); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -395,6 +421,76 @@ fail: return ret; } +/* copy of check_sticky in fs/namei.c() +* It's inline, so penalty for filesystems that don't use sticky bit is +* minimal. +*/ +static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) +{ + uid_t fsuid = current_fsuid(); + + if (!(dir->i_mode & S_ISVTX)) + return 0; + if (inode->i_uid == fsuid) + return 0; + if (dir->i_uid == fsuid) + return 0; + return !capable(CAP_FOWNER); +} + +/* copy of may_delete in fs/namei.c() + * Check whether we can remove a link victim from directory dir, check + * whether the type of victim is right. + * 1. We can't do it if dir is read-only (done in permission()) + * 2. We should have write and exec permissions on dir + * 3. We can't remove anything from append-only dir + * 4. We can't do anything with immutable dir (done in permission()) + * 5. If the sticky bit on dir is set we should either + * a. be owner of dir, or + * b. be owner of victim, or + * c. have CAP_FOWNER capability + * 6. If the victim is append-only or immutable we can't do antyhing with + * links pointing to it. + * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR. + * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR. + * 9. We can't remove a root or mountpoint. + * 10. We don't allow removal of NFS sillyrenamed files; it's handled by + * nfs_async_unlink(). + */ + +static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir) +{ + int error; + + if (!victim->d_inode) + return -ENOENT; + + BUG_ON(victim->d_parent->d_inode != dir); + audit_inode_child(victim, dir); + + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + if (error) + return error; + if (IS_APPEND(dir)) + return -EPERM; + if (btrfs_check_sticky(dir, victim->d_inode)|| + IS_APPEND(victim->d_inode)|| + IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + return -EPERM; + if (isdir) { + if (!S_ISDIR(victim->d_inode->i_mode)) + return -ENOTDIR; + if (IS_ROOT(victim)) + return -EBUSY; + } else if (S_ISDIR(victim->d_inode->i_mode)) + return -EISDIR; + if (IS_DEADDIR(dir)) + return -ENOENT; + if (victim->d_flags & DCACHE_NFSFS_RENAMED) + return -EBUSY; + return 0; +} + /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { @@ -412,7 +508,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) */ static noinline int btrfs_mksubvol(struct path *parent, char *name, int namelen, - struct btrfs_root *snap_src) + struct btrfs_root *snap_src, + u64 *async_transid) { struct inode *dir = parent->dentry->d_inode; struct dentry *dentry; @@ -443,10 +540,11 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry); + error = create_snapshot(snap_src, dentry, + name, namelen, async_transid); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, - name, namelen); + name, namelen, async_transid); } if (!error) fsnotify_mkdir(dir, dentry); @@ -708,7 +806,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, char *sizestr; char *devstr = NULL; int ret = 0; - int namelen; int mod = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) @@ -722,7 +819,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); mutex_lock(&root->fs_info->volume_mutex); sizestr = vol_args->name; @@ -801,11 +897,13 @@ out_unlock: return ret; } -static noinline int btrfs_ioctl_snap_create(struct file *file, - void __user *arg, int subvol) +static noinline int btrfs_ioctl_snap_create_transid(struct file *file, + char *name, + unsigned long fd, + int subvol, + u64 *transid) { struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; - struct btrfs_ioctl_vol_args *vol_args; struct file *src_file; int namelen; int ret = 0; @@ -813,23 +911,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) - return PTR_ERR(vol_args); - - vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - namelen = strlen(vol_args->name); - if (strchr(vol_args->name, '/')) { + namelen = strlen(name); + if (strchr(name, '/')) { ret = -EINVAL; goto out; } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, - NULL); + ret = btrfs_mksubvol(&file->f_path, name, namelen, + NULL, transid); } else { struct inode *src_inode; - src_file = fget(vol_args->fd); + src_file = fget(fd); if (!src_file) { ret = -EINVAL; goto out; @@ -843,12 +936,56 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, fput(src_file); goto out; } - ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, - BTRFS_I(src_inode)->root); + ret = btrfs_mksubvol(&file->f_path, name, namelen, + BTRFS_I(src_inode)->root, + transid); fput(src_file); } out: + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol, + int async) +{ + struct btrfs_ioctl_vol_args *vol_args = NULL; + struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; + char *name; + u64 fd; + u64 transid = 0; + int ret; + + if (async) { + async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); + if (IS_ERR(async_vol_args)) + return PTR_ERR(async_vol_args); + + name = async_vol_args->name; + fd = async_vol_args->fd; + async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; + } else { + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + name = vol_args->name; + fd = vol_args->fd; + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + } + + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, &transid); + + if (!ret && async) { + if (copy_to_user(arg + + offsetof(struct btrfs_ioctl_async_vol_args, + transid), &transid, sizeof(transid))) + return -EFAULT; + } + kfree(vol_args); + kfree(async_vol_args); + return ret; } @@ -1073,14 +1210,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = kmalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); - if (copy_from_user(args, argp, sizeof(*args))) { - kfree(args); - return -EFAULT; - } inode = fdentry(file)->d_inode; ret = search_ioctl(inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) @@ -1188,14 +1321,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - args = kmalloc(sizeof(*args), GFP_KERNEL); - if (!args) - return -ENOMEM; + args = memdup_user(argp, sizeof(*args)); + if (IS_ERR(args)) + return PTR_ERR(args); - if (copy_from_user(args, argp, sizeof(*args))) { - kfree(args); - return -EFAULT; - } inode = fdentry(file)->d_inode; if (args->treeid == 0) @@ -1227,9 +1356,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, int ret; int err = 0; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) return PTR_ERR(vol_args); @@ -1259,13 +1385,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } inode = dentry->d_inode; + dest = BTRFS_I(inode)->root; + if (!capable(CAP_SYS_ADMIN)){ + /* + * Regular user. Only allow this with a special mount + * option, when the user has write+exec access to the + * subvol root, and when rmdir(2) would have been + * allowed. + * + * Note that this is _not_ check that the subvol is + * empty or doesn't contain data that we wouldn't + * otherwise be able to delete. + * + * Users who want to delete empty subvols should try + * rmdir(2). + */ + err = -EPERM; + if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) + goto out_dput; + + /* + * Do not allow deletion if the parent dir is the same + * as the dir to be deleted. That means the ioctl + * must be called on the dentry referencing the root + * of the subvol, not a random directory contained + * within it. + */ + err = -EINVAL; + if (root == dest) + goto out_dput; + + err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + if (err) + goto out_dput; + + /* check if subvolume may be deleted by a non-root user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + } + if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; } - dest = BTRFS_I(inode)->root; - mutex_lock(&inode->i_mutex); err = d_invalidate(dentry); if (err) @@ -1304,7 +1468,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, BUG_ON(ret); } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_end_transaction(trans, root); BUG_ON(ret); inode->i_flags |= S_DEAD; out_up_write: @@ -1502,11 +1666,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, path->reada = 2; if (inode < src) { - mutex_lock(&inode->i_mutex); - mutex_lock(&src->i_mutex); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD); } else { - mutex_lock(&src->i_mutex); - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); } /* determine range to clone */ @@ -1517,12 +1681,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, olen = len = src->i_size - off; /* if we extend to eof, continue to block boundary */ if (off + len == src->i_size) - len = ((src->i_size + bs-1) & ~(bs-1)) - - off; + len = ALIGN(src->i_size, bs) - off; /* verify the end result is block aligned */ - if ((off & (bs-1)) || - ((off + len) & (bs-1))) + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) goto out_unlock; /* do any pending delalloc/csum calc on src, one way or @@ -1530,13 +1693,15 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, while (1) { struct btrfs_ordered_extent *ordered; lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - ordered = btrfs_lookup_first_ordered_extent(inode, off+len); - if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + ordered = btrfs_lookup_first_ordered_extent(src, off+len); + if (!ordered && + !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len, + EXTENT_DELALLOC, 0, NULL)) break; unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); if (ordered) btrfs_put_ordered_extent(ordered); - btrfs_wait_ordered_range(src, off, off+len); + btrfs_wait_ordered_range(src, off, len); } /* clone data */ @@ -1605,7 +1770,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } btrfs_release_path(root, path); - if (key.offset + datal < off || + if (key.offset + datal <= off || key.offset >= off+len) goto next; @@ -1720,8 +1885,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * but shouldn't round up the file size */ endoff = new_key.offset + datal; - if (endoff > off+olen) - endoff = off+olen; + if (endoff > destoff+olen) + endoff = destoff+olen; if (endoff > inode->i_size) btrfs_i_size_write(inode, endoff); @@ -1879,6 +2044,22 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) return 0; } +static void get_block_group_info(struct list_head *groups_list, + struct btrfs_ioctl_space_info *space) +{ + struct btrfs_block_group_cache *block_group; + + space->total_bytes = 0; + space->used_bytes = 0; + space->flags = 0; + list_for_each_entry(block_group, groups_list, list) { + space->flags = block_group->flags; + space->total_bytes += block_group->key.offset; + space->used_bytes += + btrfs_block_group_used(&block_group->item); + } +} + long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) { struct btrfs_ioctl_space_args space_args; @@ -1887,27 +2068,56 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) struct btrfs_ioctl_space_info *dest_orig; struct btrfs_ioctl_space_info *user_dest; struct btrfs_space_info *info; + u64 types[] = {BTRFS_BLOCK_GROUP_DATA, + BTRFS_BLOCK_GROUP_SYSTEM, + BTRFS_BLOCK_GROUP_METADATA, + BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; + int num_types = 4; int alloc_size; int ret = 0; int slot_count = 0; + int i, c; if (copy_from_user(&space_args, (struct btrfs_ioctl_space_args __user *)arg, sizeof(space_args))) return -EFAULT; - /* first we count slots */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) - slot_count++; - rcu_read_unlock(); + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); + + if (!info) + continue; + + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) + slot_count++; + } + up_read(&info->groups_sem); + } /* space_slots == 0 means they are asking for a count */ if (space_args.space_slots == 0) { space_args.total_spaces = slot_count; goto out; } + + slot_count = min_t(int, space_args.space_slots, slot_count); + alloc_size = sizeof(*dest) * slot_count; + /* we generally have at most 6 or so space infos, one for each raid * level. So, a whole page should be more than enough for everyone */ @@ -1921,27 +2131,34 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) dest_orig = dest; /* now we have a buffer to copy into */ - rcu_read_lock(); - list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { - /* make sure we don't copy more than we allocated - * in our buffer - */ - if (slot_count == 0) - break; - slot_count--; - - /* make sure userland has enough room in their buffer */ - if (space_args.total_spaces >= space_args.space_slots) - break; + for (i = 0; i < num_types; i++) { + struct btrfs_space_info *tmp; + + info = NULL; + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list) { + if (tmp->flags == types[i]) { + info = tmp; + break; + } + } + rcu_read_unlock(); - space.flags = info->flags; - space.total_bytes = info->total_bytes; - space.used_bytes = info->bytes_used; - memcpy(dest, &space, sizeof(space)); - dest++; - space_args.total_spaces++; + if (!info) + continue; + down_read(&info->groups_sem); + for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { + if (!list_empty(&info->block_groups[c])) { + get_block_group_info(&info->block_groups[c], + &space); + memcpy(dest, &space, sizeof(space)); + dest++; + space_args.total_spaces++; + } + } + up_read(&info->groups_sem); } - rcu_read_unlock(); user_dest = (struct btrfs_ioctl_space_info *) (arg + sizeof(struct btrfs_ioctl_space_args)); @@ -1984,6 +2201,36 @@ long btrfs_ioctl_trans_end(struct file *file) return 0; } +static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + struct btrfs_trans_handle *trans; + u64 transid; + + trans = btrfs_start_transaction(root, 0); + transid = trans->transid; + btrfs_commit_transaction_async(trans, root, 0); + + if (argp) + if (copy_to_user(argp, &transid, sizeof(transid))) + return -EFAULT; + return 0; +} + +static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) +{ + struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; + u64 transid; + + if (argp) { + if (copy_from_user(&transid, argp, sizeof(transid))) + return -EFAULT; + } else { + transid = 0; /* current trans */ + } + return btrfs_wait_for_commit(root, transid); +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1998,9 +2245,11 @@ long btrfs_ioctl(struct file *file, unsigned int case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: - return btrfs_ioctl_snap_create(file, argp, 0); + return btrfs_ioctl_snap_create(file, argp, 0, 0); + case BTRFS_IOC_SNAP_CREATE_ASYNC: + return btrfs_ioctl_snap_create(file, argp, 0, 1); case BTRFS_IOC_SUBVOL_CREATE: - return btrfs_ioctl_snap_create(file, argp, 1); + return btrfs_ioctl_snap_create(file, argp, 1, 0); case BTRFS_IOC_SNAP_DESTROY: return btrfs_ioctl_snap_destroy(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -2034,6 +2283,10 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: btrfs_sync_fs(file->f_dentry->d_sb, 1); return 0; + case BTRFS_IOC_START_SYNC: + return btrfs_ioctl_start_sync(file, argp); + case BTRFS_IOC_WAIT_SYNC: + return btrfs_ioctl_wait_sync(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 424694a..17c99eb 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -22,14 +22,21 @@ #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 -#define BTRFS_PATH_NAME_MAX 4087 /* this should be 4k */ +#define BTRFS_PATH_NAME_MAX 4087 struct btrfs_ioctl_vol_args { __s64 fd; char name[BTRFS_PATH_NAME_MAX + 1]; }; +#define BTRFS_SNAPSHOT_NAME_MAX 4079 +struct btrfs_ioctl_async_vol_args { + __s64 fd; + __u64 transid; + char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; +}; + #define BTRFS_INO_LOOKUP_PATH_MAX 4080 struct btrfs_ioctl_ino_lookup_args { __u64 treeid; @@ -178,4 +185,8 @@ struct btrfs_ioctl_space_args { #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ struct btrfs_ioctl_space_args) +#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) +#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) +#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_async_vol_args) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e56c72b..ae7737e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode, /* * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. + */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", + (unsigned long long)dec_start, + (unsigned long long)dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)to_dec); + } + entry->bytes_left -= to_dec; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range * of the file. The IO should not span ordered extents. If * a given ordered_extent is completely done, 1 is returned, otherwise * 0. @@ -526,7 +593,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { u64 end; u64 orig_end; - u64 wait_end; struct btrfs_ordered_extent *ordered; int found; @@ -537,7 +603,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (orig_end > INT_LIMIT(loff_t)) orig_end = INT_LIMIT(loff_t); } - wait_end = orig_end; again: /* start IO across the range first to instantiate any delalloc * extents diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8ac3654..61dca83 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b37d723..045c9c2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -29,6 +29,7 @@ #include "locking.h" #include "btrfs_inode.h" #include "async-thread.h" +#include "free-space-cache.h" /* * backref_node, mapping_node and tree_block start with this @@ -178,8 +179,6 @@ struct reloc_control { u64 search_start; u64 extents_found; - int block_rsv_retries; - unsigned int stage:8; unsigned int create_reloc_tree:1; unsigned int merge_reloc_tree:1; @@ -2133,7 +2132,6 @@ int prepare_to_merge(struct reloc_control *rc, int err) LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; - int retries = 0; mutex_lock(&root->fs_info->trans_mutex); rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; @@ -2143,7 +2141,7 @@ again: if (!err) { num_bytes = rc->merging_rsv_size; ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, - num_bytes, &retries); + num_bytes); if (ret) err = ret; } @@ -2155,7 +2153,6 @@ again: btrfs_end_transaction(trans, rc->extent_root); btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); - retries = 0; goto again; } } @@ -2405,15 +2402,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, num_bytes = calcu_metadata_size(rc, node, 1) * 2; trans->block_rsv = rc->block_rsv; - ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, - &rc->block_rsv_retries); + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); if (ret) { if (ret == -EAGAIN) rc->commit_transaction = 1; return ret; } - rc->block_rsv_retries = 0; return 0; } @@ -3099,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc, BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); ret = get_ref_objectid_v0(rc, path, extent_key, &ref_owner, NULL); + if (ret < 0) + return ret; BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); level = (int)ref_owner; /* FIXME: get real generation */ @@ -3191,6 +3188,54 @@ static int block_use_full_backref(struct reloc_control *rc, return ret; } +static int delete_block_group_cache(struct btrfs_fs_info *fs_info, + struct inode *inode, u64 ino) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret = 0; + + if (inode) + goto truncate; + + key.objectid = ino; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + + inode = btrfs_iget(fs_info->sb, &key, root, NULL); + if (!inode || IS_ERR(inode) || is_bad_inode(inode)) { + if (inode && !IS_ERR(inode)) + iput(inode); + return -ENOENT; + } + +truncate: + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + trans = btrfs_join_transaction(root, 0); + if (IS_ERR(trans)) { + btrfs_free_path(path); + goto out; + } + + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + + btrfs_free_path(path); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +out: + iput(inode); + return ret; +} + /* * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY * this function scans fs tree to find blocks reference the data extent @@ -3217,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc, int counted; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - ref_root = btrfs_extent_data_ref_root(leaf, ref); ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); ref_offset = btrfs_extent_data_ref_offset(leaf, ref); ref_count = btrfs_extent_data_ref_count(leaf, ref); + /* + * This is an extent belonging to the free space cache, lets just delete + * it and redo the search. + */ + if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { + ret = delete_block_group_cache(rc->extent_root->fs_info, + NULL, ref_objectid); + if (ret != -ENOENT) + return ret; + ret = 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { err = PTR_ERR(root); @@ -3554,8 +3611,7 @@ int prepare_to_relocate(struct reloc_control *rc) * is no reservation in transaction handle. */ ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, - rc->extent_root->nodesize * 256, - &rc->block_rsv_retries); + rc->extent_root->nodesize * 256); if (ret) return ret; @@ -3567,7 +3623,6 @@ int prepare_to_relocate(struct reloc_control *rc) rc->extents_found = 0; rc->nodes_relocated = 0; rc->merging_rsv_size = 0; - rc->block_rsv_retries = 0; rc->create_reloc_tree = 1; set_reloc_control(rc); @@ -3860,6 +3915,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) { struct btrfs_fs_info *fs_info = extent_root->fs_info; struct reloc_control *rc; + struct inode *inode; + struct btrfs_path *path; int ret; int rw = 0; int err = 0; @@ -3882,6 +3939,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) rw = 1; } + path = btrfs_alloc_path(); + if (!path) { + err = -ENOMEM; + goto out; + } + + inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group, + path); + btrfs_free_path(path); + + if (!IS_ERR(inode)) + ret = delete_block_group_cache(fs_info, inode, 0); + else + ret = PTR_ERR(inode); + + if (ret && ret != -ENOENT) { + err = ret; + goto out; + } + rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); @@ -4143,7 +4220,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_add_ordered_sum(inode, ordered, sums); } btrfs_put_ordered_extent(ordered); - return 0; + return ret; } void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 2d958be..6a1086e 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) { struct btrfs_root *dead_root; - struct btrfs_item *item; struct btrfs_root_item *ri; struct btrfs_key key; struct btrfs_key found_key; @@ -214,7 +213,6 @@ again: nritems = btrfs_header_nritems(leaf); slot = path->slots[0]; } - item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &key, slot); if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) goto next; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 144f8a5..dbb51ea 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,6 +61,8 @@ static void btrfs_put_super(struct super_block *sb) ret = close_ctree(root); sb->s_fs_info = NULL; + + (void)ret; /* FIXME: need to fix VFS to return error? */ } enum { @@ -68,7 +70,8 @@ enum { Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, - Opt_discard, Opt_err, + Opt_discard, Opt_space_cache, Opt_clear_cache, Opt_err, + Opt_user_subvol_rm_allowed, }; static match_table_t tokens = { @@ -92,6 +95,9 @@ static match_table_t tokens = { {Opt_flushoncommit, "flushoncommit"}, {Opt_ratio, "metadata_ratio=%d"}, {Opt_discard, "discard"}, + {Opt_space_cache, "space_cache"}, + {Opt_clear_cache, "clear_cache"}, + {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_err, NULL}, }; @@ -235,6 +241,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_discard: btrfs_set_opt(info->mount_opt, DISCARD); break; + case Opt_space_cache: + printk(KERN_INFO "btrfs: enabling disk space caching\n"); + btrfs_set_opt(info->mount_opt, SPACE_CACHE); + break; + case Opt_clear_cache: + printk(KERN_INFO "btrfs: force clearing of disk cache\n"); + btrfs_set_opt(info->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); @@ -380,7 +397,7 @@ static struct dentry *get_default_root(struct super_block *sb, find_root: new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); if (IS_ERR(new_root)) - return ERR_PTR(PTR_ERR(new_root)); + return ERR_CAST(new_root); if (btrfs_root_refs(&new_root->root_item) == 0) return ERR_PTR(-ENOENT); @@ -436,7 +453,6 @@ static int btrfs_fill_super(struct super_block *sb, { struct inode *inode; struct dentry *root_dentry; - struct btrfs_super_block *disk_super; struct btrfs_root *tree_root; struct btrfs_key key; int err; @@ -458,7 +474,6 @@ static int btrfs_fill_super(struct super_block *sb, return PTR_ERR(tree_root); } sb->s_fs_info = tree_root; - disk_super = &tree_root->fs_info->super_copy; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -548,30 +563,45 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *test_root = data; struct btrfs_root *root = btrfs_sb(s); - return root->fs_info->fs_devices == test_fs_devices; + /* + * If this super block is going away, return false as it + * can't match as an existing super block. + */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; } +static int btrfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + + return set_anon_super(s, data); +} + + /* * Find a superblock for the given device / mount point. * * Note: This is based on get_sb_bdev from fs/super.c with a few additions * for multiple device setup. Make sure to keep it in sync. */ -static int btrfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct block_device *bdev = NULL; struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_root *tree_root = NULL; + struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; int error = 0; - int found = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; @@ -580,7 +610,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, &subvol_name, &subvol_objectid, &fs_devices); if (error) - return error; + return ERR_PTR(error); error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); if (error) @@ -595,8 +625,24 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, goto error_close_devices; } + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info || !tree_root) { + error = -ENOMEM; + goto error_close_devices; + } + fs_info->tree_root = tree_root; + fs_info->fs_devices = fs_devices; + tree_root->fs_info = fs_info; + bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) goto error_s; @@ -607,7 +653,6 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, goto error_close_devices; } - found = 1; btrfs_close_devices(fs_devices); } else { char b[BDEVNAME_SIZE]; @@ -629,7 +674,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(root)) { error = PTR_ERR(root); deactivate_locked_super(s); - goto error; + goto error_free_subvol_name; } /* if they gave us a subvolume name bind mount into that */ if (strcmp(subvol_name, ".")) { @@ -643,33 +688,31 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags, deactivate_locked_super(s); error = PTR_ERR(new_root); dput(root); - goto error_close_devices; + goto error_free_subvol_name; } if (!new_root->d_inode) { dput(root); dput(new_root); deactivate_locked_super(s); error = -ENXIO; - goto error_close_devices; + goto error_free_subvol_name; } dput(root); root = new_root; } - mnt->mnt_sb = s; - mnt->mnt_root = root; - kfree(subvol_name); - return 0; + return root; error_s: error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); error_free_subvol_name: kfree(subvol_name); -error: - return error; + return ERR_PTR(error); } static int btrfs_remount(struct super_block *sb, int *flags, char *data) @@ -716,18 +759,25 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; + u64 total_used_data = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry_rcu(found, head, list) { + if (found->flags & (BTRFS_BLOCK_GROUP_METADATA | + BTRFS_BLOCK_GROUP_SYSTEM)) + total_used_data += found->disk_total; + else + total_used_data += found->disk_used; total_used += found->disk_used; + } rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_bfree; + buf->f_bavail = buf->f_blocks - (total_used_data >> bits); buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; @@ -746,7 +796,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) static struct file_system_type btrfs_fs_type = { .owner = THIS_MODULE, .name = "btrfs", - .get_sb = btrfs_get_sb, + .mount = btrfs_mount, .kill_sb = kill_anon_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 66e4c66..f50e931 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,6 +163,7 @@ enum btrfs_trans_type { TRANS_START, TRANS_JOIN, TRANS_USERSPACE, + TRANS_JOIN_NOLOCK, }; static int may_wait_transaction(struct btrfs_root *root, int type) @@ -179,14 +180,14 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, { struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; - int retries = 0; int ret; again: h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) return ERR_PTR(-ENOMEM); - mutex_lock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_lock(&root->fs_info->trans_mutex); if (may_wait_transaction(root, type)) wait_current_trans(root); @@ -195,7 +196,8 @@ again: cur_trans = root->fs_info->running_transaction; cur_trans->use_count++; - mutex_unlock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_unlock(&root->fs_info->trans_mutex); h->transid = cur_trans->transid; h->transaction = cur_trans; @@ -212,8 +214,7 @@ again: } if (num_items > 0) { - ret = btrfs_trans_reserve_metadata(h, root, num_items, - &retries); + ret = btrfs_trans_reserve_metadata(h, root, num_items); if (ret == -EAGAIN) { btrfs_commit_transaction(h, root); goto again; @@ -224,9 +225,11 @@ again: } } - mutex_lock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); - mutex_unlock(&root->fs_info->trans_mutex); + if (type != TRANS_JOIN_NOLOCK) + mutex_unlock(&root->fs_info->trans_mutex); if (!current->journal_info && type != TRANS_USERSPACE) current->journal_info = h; @@ -244,6 +247,12 @@ struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, return start_transaction(root, 0, TRANS_JOIN); } +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, 0, TRANS_JOIN_NOLOCK); +} + struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { @@ -270,6 +279,58 @@ static noinline int wait_for_commit(struct btrfs_root *root, return 0; } +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +{ + struct btrfs_transaction *cur_trans = NULL, *t; + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + + ret = 0; + if (transid) { + if (transid <= root->fs_info->last_trans_committed) + goto out_unlock; + + /* find specified transaction */ + list_for_each_entry(t, &root->fs_info->trans_list, list) { + if (t->transid == transid) { + cur_trans = t; + break; + } + if (t->transid > transid) + break; + } + ret = -EINVAL; + if (!cur_trans) + goto out_unlock; /* bad transid */ + } else { + /* find newest transaction that is committing | committed */ + list_for_each_entry_reverse(t, &root->fs_info->trans_list, + list) { + if (t->in_commit) { + if (t->commit_done) + goto out_unlock; + cur_trans = t; + break; + } + } + if (!cur_trans) + goto out_unlock; /* nothing committing|committed */ + } + + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, cur_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + ret = 0; +out_unlock: + mutex_unlock(&root->fs_info->trans_mutex); + return ret; +} + #if 0 /* * rate limit against the drop_snapshot code. This helps to slow down new @@ -348,7 +409,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int throttle) + struct btrfs_root *root, int throttle, int lock) { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; @@ -376,26 +437,29 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, btrfs_trans_release_metadata(trans, root); - if (!root->fs_info->open_ioctl_trans && + if (lock && !root->fs_info->open_ioctl_trans && should_end_transaction(trans, root)) trans->transaction->blocked = 1; - if (cur_trans->blocked && !cur_trans->in_commit) { + if (lock && cur_trans->blocked && !cur_trans->in_commit) { if (throttle) return btrfs_commit_transaction(trans, root); else wake_up_process(info->transaction_kthread); } - mutex_lock(&info->trans_mutex); + if (lock) + mutex_lock(&info->trans_mutex); WARN_ON(cur_trans != info->running_transaction); WARN_ON(cur_trans->num_writers < 1); cur_trans->num_writers--; + smp_mb(); if (waitqueue_active(&cur_trans->writer_wait)) wake_up(&cur_trans->writer_wait); put_transaction(cur_trans); - mutex_unlock(&info->trans_mutex); + if (lock) + mutex_unlock(&info->trans_mutex); if (current->journal_info == trans) current->journal_info = NULL; @@ -411,13 +475,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return __btrfs_end_transaction(trans, root, 0); + return __btrfs_end_transaction(trans, root, 0, 1); } int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - return __btrfs_end_transaction(trans, root, 1); + return __btrfs_end_transaction(trans, root, 1, 1); +} + +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0, 0); } /* @@ -832,11 +902,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; int ret; - int retries = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; @@ -858,7 +928,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (to_reserve > 0) { ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, - to_reserve, &retries); + to_reserve); if (ret) { pending->error = ret; goto fail; @@ -872,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->block_rsv = &pending->block_rsv; dentry = pending->dentry; - parent_inode = dentry->d_parent->d_inode; + parent = dget_parent(dentry); + parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -920,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode->i_ino, index, dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + dput(parent); key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); @@ -966,6 +1038,8 @@ static void update_super_roots(struct btrfs_root *root) super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; + if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) + super->cache_generation = root_item->generation; } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) @@ -988,11 +1062,127 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) return ret; } +/* + * wait for the current transaction commit to start and block subsequent + * transaction joins + */ +static void wait_current_trans_commit_start(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->in_commit) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->in_commit) { + finish_wait(&root->fs_info->transaction_blocked_wait, + &wait); + break; + } + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_blocked_wait, &wait); + } +} + +/* + * wait for the current transaction to start and then become unblocked. + * caller holds ref. + */ +static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, + struct btrfs_transaction *trans) +{ + DEFINE_WAIT(wait); + + if (trans->commit_done || (trans->in_commit && !trans->blocked)) + return; + + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (trans->commit_done || + (trans->in_commit && !trans->blocked)) { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } +} + +/* + * commit transactions asynchronously. once btrfs_commit_transaction_async + * returns, any subsequent transaction will not be allowed to join. + */ +struct btrfs_async_commit { + struct btrfs_trans_handle *newtrans; + struct btrfs_root *root; + struct delayed_work work; +}; + +static void do_async_commit(struct work_struct *work) +{ + struct btrfs_async_commit *ac = + container_of(work, struct btrfs_async_commit, work.work); + + btrfs_commit_transaction(ac->newtrans, ac->root); + kfree(ac); +} + +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock) +{ + struct btrfs_async_commit *ac; + struct btrfs_transaction *cur_trans; + + ac = kmalloc(sizeof(*ac), GFP_NOFS); + BUG_ON(!ac); + + INIT_DELAYED_WORK(&ac->work, do_async_commit); + ac->root = root; + ac->newtrans = btrfs_join_transaction(root, 0); + + /* take transaction reference */ + mutex_lock(&root->fs_info->trans_mutex); + cur_trans = trans->transaction; + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + btrfs_end_transaction(trans, root); + schedule_delayed_work(&ac->work, 0); + + /* wait for transaction to start and unblock */ + mutex_lock(&root->fs_info->trans_mutex); + if (wait_for_unblock) + wait_current_trans_commit_start_and_unblock(root, cur_trans); + else + wait_current_trans_commit_start(root, cur_trans); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; +} + +/* + * btrfs_transaction state sequence: + * in_commit = 0, blocked = 0 (initial) + * in_commit = 1, blocked = 1 + * blocked = 0 + * commit_done = 1 + */ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { unsigned long joined = 0; - unsigned long timeout = 1; struct btrfs_transaction *cur_trans; struct btrfs_transaction *prev_trans = NULL; DEFINE_WAIT(wait); @@ -1039,6 +1229,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, trans->transaction->in_commit = 1; trans->transaction->blocked = 1; + wake_up(&root->fs_info->transaction_blocked_wait); + if (cur_trans->list.prev != &root->fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); @@ -1063,11 +1255,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - if (cur_trans->num_writers > 1) - timeout = MAX_SCHEDULE_TIMEOUT; - else if (should_grow) - timeout = 1; - mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit || snap_pending) { @@ -1089,8 +1276,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, TASK_UNINTERRUPTIBLE); smp_mb(); - if (cur_trans->num_writers > 1 || should_grow) - schedule_timeout(timeout); + if (cur_trans->num_writers > 1) + schedule_timeout(MAX_SCHEDULE_TIMEOUT); + else if (should_grow) + schedule_timeout(1); mutex_lock(&root->fs_info->trans_mutex); finish_wait(&cur_trans->writer_wait, &wait); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index e104986..f104b57 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -87,12 +87,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans, + struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, + int num_blocks); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks); +int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, @@ -104,6 +109,9 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); int btrfs_clean_old_snapshots(struct btrfs_root *root); int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int wait_for_unblock); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index f7ac8e0..992ab42 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, int ret = 0; int wret; int level; - int orig_level; int is_extent = 0; int next_key_ret = 0; u64 last_ret = 0; @@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, return -ENOMEM; level = btrfs_header_level(root->node); - orig_level = level; if (level == 0) goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fb102a9..054744a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -786,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, { struct inode *dir; int ret; - struct btrfs_key location; struct btrfs_inode_ref *ref; struct btrfs_dir_item *di; struct inode *inode; @@ -795,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, unsigned long ref_ptr; unsigned long ref_end; - location.objectid = key->objectid; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - /* * it is possible that we didn't log all the parent directories * for a given inode. If we don't find the dir, just don't @@ -1583,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; - u32 item_size; int level; int i; int ret; @@ -1601,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, nritems = btrfs_header_nritems(eb); for (i = 0; i < nritems; i++) { btrfs_item_key_to_cpu(eb, &key, i); - item_size = btrfs_item_size_nr(eb, i); /* inode keys are done during the first stage */ if (key.type == BTRFS_INODE_ITEM_KEY && @@ -1668,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { u64 root_owner; - u64 root_gen; u64 bytenr; u64 ptr_gen; struct extent_buffer *next; @@ -1698,7 +1690,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); next = btrfs_find_create_tree_block(root, bytenr, blocksize); @@ -1749,7 +1740,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct walk_control *wc) { u64 root_owner; - u64 root_gen; int i; int slot; int ret; @@ -1757,8 +1747,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { - struct extent_buffer *node; - node = path->nodes[i]; path->slots[i]++; *level = i; WARN_ON(*level == 0); @@ -1771,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, parent = path->nodes[*level + 1]; root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); wc->process_func(root, path->nodes[*level], wc, btrfs_header_generation(path->nodes[*level])); if (wc->free) { @@ -2273,7 +2260,7 @@ fail: } btrfs_end_log_trans(root); - return 0; + return err; } /* see comments for btrfs_del_dir_entries_in_log */ @@ -2729,7 +2716,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; - u32 size; int err = 0; int ret; int nritems; @@ -2793,7 +2779,6 @@ again: break; src = path->nodes[0]; - size = btrfs_item_size_nr(src, path->slots[0]); if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; goto next_slot; @@ -2884,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root; + struct dentry *old_parent = NULL; /* * for regular files, if its inode is already on disk, we don't @@ -2925,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; inode = parent->d_inode; } + dput(old_parent); out: return ret; } @@ -2960,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; + struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; @@ -3031,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; } ret = 0; end_trans: + dput(old_parent); if (ret < 0) { BUG_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; @@ -3054,8 +3047,13 @@ end_no_trans: int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + dput(parent); + + return ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e25e46a..cc04dc1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1898,7 +1898,6 @@ int btrfs_balance(struct btrfs_root *dev_root) u64 size_to_free; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_chunk *chunk; struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_key found_key; @@ -1962,9 +1961,6 @@ int btrfs_balance(struct btrfs_root *dev_root) if (found_key.objectid != key.objectid) break; - chunk = btrfs_item_ptr(path->nodes[0], - path->slots[0], - struct btrfs_chunk); /* chunk zero is special */ if (found_key.offset == 0) break; @@ -3031,8 +3027,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, } bio->bi_sector = multi->stripes[dev_nr].physical >> 9; dev = multi->stripes[dev_nr].dev; - BUG_ON(rw == WRITE && !dev->writeable); - if (dev && dev->bdev) { + if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { bio->bi_bdev = dev->bdev; if (async_submit) schedule_bio(root, dev, rw, bio); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 88ecbb2..698fdd2 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -178,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct inode *inode = dentry->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; - struct btrfs_item *item; struct extent_buffer *leaf; struct btrfs_dir_item *di; int ret = 0, slot, advance; @@ -234,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) } advance = 1; - item = btrfs_item_nr(leaf, slot); btrfs_item_key_to_cpu(leaf, &found_key, slot); /* check to make sure this item is what we want */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 3e2b90e..b9cd544 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -199,8 +199,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, int nr_pages = 0; struct page *in_page = NULL; struct page *out_page = NULL; - int out_written = 0; - int in_read = 0; unsigned long bytes_left; *out_pages = 0; @@ -233,9 +231,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, workspace->def_strm.avail_out = PAGE_CACHE_SIZE; workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); - out_written = 0; - in_read = 0; - while (workspace->def_strm.total_in < len) { ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); if (ret != Z_OK) { diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e9c874a..561438b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, page->index << PAGE_CACHE_SHIFT, &len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { @@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, offset, &len, ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages); + pages, nr_pages, 0); if (rc == -ENOENT) rc = 0; if (rc < 0) @@ -774,7 +774,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1); + &inode->i_mtime, true, 1, 0); max_pages = req->r_num_pages; alloc_page_vec(fsc, req); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e..60d27bc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); @@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - unsigned seq = le32_to_cpu(grant->seq); - unsigned issue_seq = le32_to_cpu(grant->issue_seq); + int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", - inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; - cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; @@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, NULL /* no caps context */); try_flush_caps(inode, session, NULL); up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e0a2dc6..7d447af 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -336,7 +336,10 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -355,18 +358,22 @@ more: u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); if (filldir(dirent, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - le64_to_cpu(in->ino), - ftype) < 0) { + pos, ino, ftype) < 0) { dout("filldir stopping us...\n"); return 0; } @@ -414,6 +421,7 @@ static void reset_readdir(struct ceph_file_info *fi) fi->last_readdir = NULL; } kfree(fi->last_name); + fi->last_name = NULL; fi->next_offset = 2; /* compensate for . and .. */ if (fi->dentry) { dput(fi->dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e77c28c..8d79b89 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file) } /* - * No need to block if we have any caps. Update wanted set + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set * asynchronously. */ spin_lock(&inode->i_lock); - if (__ceph_is_any_real_caps(ci)) { + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); @@ -280,11 +282,12 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof) + int *checkeof, bool align_to_pages) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; + int io_align, page_align; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; @@ -300,14 +303,19 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; + io_align = off & ~PAGE_MASK; more: + if (align_to_pages) + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, - page_pos, pages_left); + page_pos, pages_left, page_align); hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) @@ -374,26 +382,25 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, off, len); - - /* - * flush any page cache pages in this range. this - * will make concurrent normal and O_DIRECT io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ - } else { + if (file->f_flags & O_DIRECT) + pages = ceph_get_direct_page_vector(data, num_pages); + else pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ ret = filemap_write_and_wait(inode->i_mapping); if (ret < 0) goto done; - ret = striped_read(inode, off, len, pages, num_pages, checkeof); + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -448,6 +455,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int flags; int do_sync = 0; int check_caps = 0; + int page_align, io_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -462,6 +470,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; + io_align = pos & ~PAGE_MASK; + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -486,20 +496,26 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; + if (file->f_flags & O_DIRECT) + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2); + &mtime, false, 2, page_align); if (!req) return -ENOMEM; num_pages = calc_pages_for(pos, len); if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b..bf12865 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2,7 +2,6 @@ #include <linux/module.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -471,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; @@ -567,12 +568,17 @@ static int fill_inode(struct inode *inode, /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) + (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; issued = __ceph_caps_issued(ci, &implemented); @@ -606,7 +612,14 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - ci->i_max_size = le64_to_cpu(info->max_size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -1055,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - if (!dn->d_inode) { + in = dn->d_inode; + if (!in) { in = ceph_get_inode(sb, vino); if (IS_ERR(in)) { pr_err("fill_trace bad get_inode " @@ -1386,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work) spin_lock(&inode->i_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - ci->i_rdcache_revoking = 0; spin_unlock(&inode->i_lock); goto out; } @@ -1400,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work) ceph_invalidate_nondirty_pages(inode->i_mapping); spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); } spin_unlock(&inode->i_lock); @@ -1739,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15..098b185 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include "super.h" #include "mds_client.h" @@ -529,6 +528,9 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); __insert_request(mdsc, req); + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1588,8 +1590,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(current_fsuid()); - head->caller_gid = cpu_to_le32(current_fsgid()); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d66d63c..9341fd4 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -170,6 +170,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index d6e0e04..08b460a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -635,7 +635,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, /* * mount: join the ceph cluster, and open root directory. */ -static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, +static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, const char *path) { int err; @@ -678,16 +678,14 @@ static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt, } } - mnt->mnt_root = root; - mnt->mnt_sb = fsc->sb; - fsc->mount_state = CEPH_MOUNT_MOUNTED; dout("mount success\n"); - err = 0; + mutex_unlock(&fsc->client->mount_mutex); + return root; out: mutex_unlock(&fsc->client->mount_mutex); - return err; + return ERR_PTR(err); fail: if (first) { @@ -777,41 +775,45 @@ static int ceph_register_bdi(struct super_block *sb, return err; } -static int ceph_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *ceph_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct super_block *sb; struct ceph_fs_client *fsc; + struct dentry *res; int err; int (*compare_super)(struct super_block *, void *) = ceph_compare_super; const char *path = NULL; struct ceph_mount_options *fsopt = NULL; struct ceph_options *opt = NULL; - dout("ceph_get_sb\n"); + dout("ceph_mount\n"); err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path); - if (err < 0) + if (err < 0) { + res = ERR_PTR(err); goto out_final; + } /* create client (which we may/may not use) */ fsc = create_fs_client(fsopt, opt); if (IS_ERR(fsc)) { - err = PTR_ERR(fsc); + res = ERR_CAST(fsc); kfree(fsopt); kfree(opt); goto out_final; } err = ceph_mdsc_init(fsc); - if (err < 0) + if (err < 0) { + res = ERR_PTR(err); goto out; + } if (ceph_test_opt(fsc->client, NOSHARE)) compare_super = NULL; sb = sget(fs_type, compare_super, ceph_set_super, fsc); if (IS_ERR(sb)) { - err = PTR_ERR(sb); + res = ERR_CAST(sb); goto out; } @@ -823,16 +825,18 @@ static int ceph_get_sb(struct file_system_type *fs_type, } else { dout("get_sb using new client %p\n", fsc); err = ceph_register_bdi(sb, fsc); - if (err < 0) + if (err < 0) { + res = ERR_PTR(err); goto out_splat; + } } - err = ceph_mount(fsc, mnt, path); - if (err < 0) + res = ceph_real_mount(fsc, path); + if (IS_ERR(res)) goto out_splat; - dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, - mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode)); - return 0; + dout("root %p inode %p ino %llx.%llx\n", res, + res->d_inode, ceph_vinop(res->d_inode)); + return res; out_splat: ceph_mdsc_close_sessions(fsc->mdsc); @@ -843,8 +847,8 @@ out: ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: - dout("ceph_get_sb fail %d\n", err); - return err; + dout("ceph_mount fail %ld\n", PTR_ERR(res)); + return res; } static void ceph_kill_sb(struct super_block *s) @@ -860,7 +864,7 @@ static void ceph_kill_sb(struct super_block *s) static struct file_system_type ceph_fs_type = { .owner = THIS_MODULE, .name = "ceph", - .get_sb = ceph_get_sb, + .mount = ceph_mount, .kill_sb = ceph_kill_sb, .fs_flags = FS_RENAME_DOES_D_MOVE, }; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1886294..7f01728 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -293,9 +293,7 @@ struct ceph_inode_info { int i_rd_ref, i_rdcache_ref, i_wr_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ - u32 i_rdcache_gen; /* we increment this each time we get - FILE_CACHE. If it's non-zero, we - _may_ have cached pages. */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_writes; /* uncommitted sync writes */ diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 917b7d4..ee45648 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,6 +2,10 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS + select CRYPTO + select CRYPTO_MD5 + select CRYPTO_HMAC + select CRYPTO_ARC4 help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block @@ -140,6 +144,13 @@ config CIFS_FSCACHE to be cached locally on disk through the general filesystem cache manager. If unsure, say N. +config CIFS_ACL + bool "Provide CIFS ACL support (EXPERIMENTAL)" + depends on EXPERIMENTAL && CIFS_XATTR + help + Allows to fetch CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL diff --git a/fs/cifs/TODO b/fs/cifs/TODO index 5aff46c..355abcd 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for v) mount check for unmatched uids -w) Add support for new vfs entry points for setlease and fallocate +w) Add support for new vfs entry point for fallocate x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of processes can proceed better in parallel (on the server) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 525ba59..e9a393c 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -15,7 +15,7 @@ * the GNU Lesser General Public License for more details. * */ -#include <linux/radix-tree.h> +#include <linux/rbtree.h> #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H @@ -42,9 +42,9 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ struct cifs_sb_info { - struct radix_tree_root tlink_tree; -#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */ + struct rb_root tlink_tree; spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index c9b4792..c6ebea0 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -560,7 +560,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return NULL; + return ERR_CAST(tlink); xid = GetXid(); rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); @@ -568,7 +568,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); - cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); return pntsd; } @@ -583,7 +585,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return NULL; + return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = GetXid(); @@ -591,23 +593,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc) { - cERROR(1, "Unable to open file to get ACL"); - goto out; + if (!rc) { + rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid); } - rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); - cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); - - CIFSSMBClose(xid, tcon, fid); - out: cifs_put_tlink(tlink); FreeXid(xid); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); return pntsd; } /* Retrieve an ACL from the server */ -static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, +struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, struct inode *inode, const char *path, u32 *pacllen) { @@ -695,7 +696,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ -void +int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid) { @@ -711,17 +712,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ - if (pntsd) + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { rc = parse_sec_desc(pntsd, acllen, fattr); - if (rc) - cFYI(1, "parse sec desc failed rc = %d", rc); + kfree(pntsd); + if (rc) + cERROR(1, "parse sec desc failed rc = %d", rc); + } - kfree(pntsd); - return; + return rc; } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) +int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) { int rc = 0; __u32 secdesclen = 0; @@ -736,7 +741,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) /* Add three ACEs for owner, group, everyone getting rid of other ACEs as chmod disables ACEs and set the security descriptor */ - if (pntsd) { + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { /* allocate memory for the smb header, set security descriptor request security descriptor parameters, and secuirty descriptor itself */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7ac0056..f856732 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -43,18 +43,32 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - const struct session_key *key, char *signature) + struct TCP_Server_Info *server, char *signature) { - struct MD5Context context; + int rc; - if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) + if (cifs_pdu == NULL || signature == NULL || server == NULL) return -EINVAL; - cifs_MD5_init(&context); - cifs_MD5_update(&context, (char *)&key->data, key->len); - cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Oould not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + + crypto_shash_update(&server->secmech.sdescmd5->shash, + cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - cifs_MD5_final(signature, &context); return 0; } @@ -79,8 +93,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calculate_signature(cifs_pdu, &server->session_key, - smb_signature); + rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -90,16 +103,28 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, } static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - const struct session_key *key, char *signature) + struct TCP_Server_Info *server, char *signature) { - struct MD5Context context; int i; + int rc; - if ((iov == NULL) || (signature == NULL) || (key == NULL)) + if (iov == NULL || signature == NULL || server == NULL) return -EINVAL; - cifs_MD5_init(&context); - cifs_MD5_update(&context, (char *)&key->data, key->len); + if (!server->secmech.sdescmd5) { + cERROR(1, "%s: Can't generate signature\n", __func__); + return -1; + } + + rc = crypto_shash_init(&server->secmech.sdescmd5->shash); + if (rc) { + cERROR(1, "%s: Oould not init md5\n", __func__); + return rc; + } + + crypto_shash_update(&server->secmech.sdescmd5->shash, + server->session_key.response, server->session_key.len); + for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -112,18 +137,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - cifs_MD5_update(&context, iov[0].iov_base+4, - iov[0].iov_len-4); + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); } else - cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); + crypto_shash_update(&server->secmech.sdescmd5->shash, + iov[i].iov_base, iov[i].iov_len); } - cifs_MD5_final(signature, &context); + rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); - return 0; + return rc; } - int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -146,8 +171,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calc_signature2(iov, n_vec, &server->session_key, - smb_signature); + rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -157,14 +181,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, } int cifs_verify_signature(struct smb_hdr *cifs_pdu, - const struct session_key *session_key, + struct TCP_Server_Info *server, __u32 expected_sequence_number) { unsigned int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; - if (cifs_pdu == NULL || session_key == NULL) + if (cifs_pdu == NULL || server == NULL) return -EINVAL; if (cifs_pdu->Command == SMB_COM_NEGOTIATE) @@ -193,7 +217,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - rc = cifs_calculate_signature(cifs_pdu, session_key, + rc = cifs_calculate_signature(cifs_pdu, server, what_we_think_sig_should_be); if (rc) @@ -209,18 +233,28 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } -/* We fill in key by putting in 40 byte array which was allocated by caller */ -int cifs_calculate_session_key(struct session_key *key, const char *rn, - const char *password) +/* first calculate 24 bytes ntlm response and then 16 byte session key */ +int setup_ntlm_response(struct cifsSesInfo *ses) { - char temp_key[16]; - if ((key == NULL) || (rn == NULL)) + unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; + char temp_key[CIFS_SESS_KEY_SIZE]; + + if (!ses) return -EINVAL; - E_md4hash(password, temp_key); - mdfour(key->data.ntlm, temp_key, 16); - memcpy(key->data.ntlm+16, rn, CIFS_SESS_KEY_SIZE); - key->len = 40; + ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "NTLM can't allocate (%u bytes) memory", temp_len); + return -ENOMEM; + } + ses->auth_key.len = temp_len; + + SMBNTencrypt(ses->password, ses->server->cryptkey, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); + + E_md4hash(ses->password, temp_key); + mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); + return 0; } @@ -294,15 +328,15 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp) * two times the unicode length of a server name + * size of a timestamp (which is 8 bytes). */ - ses->tilen = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; - ses->tiblob = kzalloc(ses->tilen, GFP_KERNEL); - if (!ses->tiblob) { - ses->tilen = 0; + ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; + ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); + if (!ses->auth_key.response) { + ses->auth_key.len = 0; cERROR(1, "Challenge target info allocation failure"); return -ENOMEM; } - blobptr = ses->tiblob; + blobptr = ses->auth_key.response; attrptr = (struct ntlmssp2_name *) blobptr; attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); @@ -357,7 +391,7 @@ build_avpair_blob(struct cifsSesInfo *ses, const struct nls_table *nls_cp) * about target string i.e. for some, just user name might suffice. */ static int -find_domain_name(struct cifsSesInfo *ses) +find_domain_name(struct cifsSesInfo *ses, const struct nls_table *nls_cp) { unsigned int attrsize; unsigned int type; @@ -366,11 +400,11 @@ find_domain_name(struct cifsSesInfo *ses) unsigned char *blobend; struct ntlmssp2_name *attrptr; - if (!ses->tilen || !ses->tiblob) + if (!ses->auth_key.len || !ses->auth_key.response) return 0; - blobptr = ses->tiblob; - blobend = ses->tiblob + ses->tilen; + blobptr = ses->auth_key.response; + blobend = blobptr + ses->auth_key.len; while (blobptr + onesize < blobend) { attrptr = (struct ntlmssp2_name *) blobptr; @@ -386,16 +420,13 @@ find_domain_name(struct cifsSesInfo *ses) if (!attrsize) break; if (!ses->domainName) { - struct nls_table *default_nls; ses->domainName = kmalloc(attrsize + 1, GFP_KERNEL); if (!ses->domainName) return -ENOMEM; - default_nls = load_nls_default(); cifs_from_ucs2(ses->domainName, (__le16 *)blobptr, attrsize, attrsize, - default_nls, false); - unload_nls(default_nls); + nls_cp, false); break; } } @@ -405,82 +436,136 @@ find_domain_name(struct cifsSesInfo *ses) return 0; } -static int calc_ntlmv2_hash(struct cifsSesInfo *ses, +static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash, const struct nls_table *nls_cp) { int rc = 0; int len; - char nt_hash[16]; - struct HMACMD5Context *pctxt; + char nt_hash[CIFS_NTHASH_SIZE]; wchar_t *user; wchar_t *domain; + wchar_t *server; - pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); - - if (pctxt == NULL) - return -ENOMEM; + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - /* convert Domainname to unicode and uppercase */ - hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); + crypto_shash_setkey(ses->server->secmech.hmacmd5, nt_hash, + CIFS_NTHASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n"); + return rc; + } /* convert ses->userName to unicode and uppercase */ len = strlen(ses->userName); user = kmalloc(2 + (len * 2), GFP_KERNEL); - if (user == NULL) + if (user == NULL) { + cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); + rc = -ENOMEM; goto calc_exit_2; + } len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); UniStrupr(user); - hmac_md5_update((char *)user, 2*len, pctxt); + + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)user, 2 * len); /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { len = strlen(ses->domainName); domain = kmalloc(2 + (len * 2), GFP_KERNEL); - if (domain == NULL) + if (domain == NULL) { + cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); + rc = -ENOMEM; goto calc_exit_1; + } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); - /* the following line was removed since it didn't work well - with lower cased domain name that passed as an option. - Maybe converting the domain name earlier makes sense */ - /* UniStrupr(domain); */ - - hmac_md5_update((char *)domain, 2*len, pctxt); - + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)domain, 2 * len); kfree(domain); + } else if (ses->serverName) { + len = strlen(ses->serverName); + + server = kmalloc(2 + (len * 2), GFP_KERNEL); + if (server == NULL) { + cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); + rc = -ENOMEM; + goto calc_exit_1; + } + len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, + nls_cp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + (char *)server, 2 * len); + kfree(server); } + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ntlmv2_hash); + calc_exit_1: kfree(user); calc_exit_2: - /* BB FIXME what about bytes 24 through 40 of the signing key? - compare with the NTLM example */ - hmac_md5_final(ses->ntlmv2_hash, pctxt); + return rc; +} + +static int +CalcNTLMv2_response(const struct cifsSesInfo *ses, char *ntlmv2_hash) +{ + int rc; + unsigned int offset = CIFS_SESS_KEY_SIZE + 8; + + if (!ses->server->secmech.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + + crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); + return rc; + } + + if (ses->server->secType == RawNTLMSSP) + memcpy(ses->auth_key.response + offset, + ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + else + memcpy(ses->auth_key.response + offset, + ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + offset, ses->auth_key.len - offset); + + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE); - kfree(pctxt); return rc; } + int -setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, - const struct nls_table *nls_cp) +setup_ntlmv2_rsp(struct cifsSesInfo *ses, const struct nls_table *nls_cp) { int rc; - struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; - struct HMACMD5Context context; - - buf->blob_signature = cpu_to_le32(0x00000101); - buf->reserved = 0; - buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); - get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); - buf->reserved2 = 0; + int baselen; + unsigned int tilen; + struct ntlmv2_resp *buf; + char ntlmv2_hash[16]; + unsigned char *tiblob = NULL; /* target info blob */ if (ses->server->secType == RawNTLMSSP) { if (!ses->domainName) { - rc = find_domain_name(ses); + rc = find_domain_name(ses, nls_cp); if (rc) { cERROR(1, "error %d finding domain name", rc); goto setup_ntlmv2_rsp_ret; @@ -490,51 +575,179 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, rc = build_avpair_blob(ses, nls_cp); if (rc) { cERROR(1, "error %d building av pair blob", rc); - return rc; + goto setup_ntlmv2_rsp_ret; } } - /* calculate buf->ntlmv2_hash */ - rc = calc_ntlmv2_hash(ses, nls_cp); + baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp); + tilen = ses->auth_key.len; + tiblob = ses->auth_key.response; + + ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL); + if (!ses->auth_key.response) { + rc = ENOMEM; + ses->auth_key.len = 0; + cERROR(1, "%s: Can't allocate auth blob", __func__); + goto setup_ntlmv2_rsp_ret; + } + ses->auth_key.len += baselen; + + buf = (struct ntlmv2_resp *) + (ses->auth_key.response + CIFS_SESS_KEY_SIZE); + buf->blob_signature = cpu_to_le32(0x00000101); + buf->reserved = 0; + buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); + buf->reserved2 = 0; + + memcpy(ses->auth_key.response + baselen, tiblob, tilen); + + /* calculate ntlmv2_hash */ + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cERROR(1, "could not get v2 hash rc %d", rc); goto setup_ntlmv2_rsp_ret; } - CalcNTLMv2_response(ses, resp_buf); + + /* calculate first part of the client response (CR1) */ + rc = CalcNTLMv2_response(ses, ntlmv2_hash); + if (rc) { + cERROR(1, "Could not calculate CR1 rc: %d", rc); + goto setup_ntlmv2_rsp_ret; + } /* now calculate the session key for NTLMv2 */ - hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context); - hmac_md5_update(resp_buf, 16, &context); - hmac_md5_final(ses->auth_key.data.ntlmv2.key, &context); + crypto_shash_setkey(ses->server->secmech.hmacmd5, + ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash); + if (rc) { + cERROR(1, "%s: Could not init hmacmd5\n", __func__); + goto setup_ntlmv2_rsp_ret; + } - memcpy(&ses->auth_key.data.ntlmv2.resp, resp_buf, - sizeof(struct ntlmv2_resp)); - ses->auth_key.len = 16 + sizeof(struct ntlmv2_resp); + crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_HMAC_MD5_HASH_SIZE); - return 0; + rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash, + ses->auth_key.response); setup_ntlmv2_rsp_ret: - kfree(ses->tiblob); - ses->tiblob = NULL; - ses->tilen = 0; + kfree(tiblob); return rc; } -void CalcNTLMv2_response(const struct cifsSesInfo *ses, - char *v2_session_response) +int +calc_seckey(struct cifsSesInfo *ses) { - struct HMACMD5Context context; - /* rest of v2 struct already generated */ - memcpy(v2_session_response + 8, ses->cryptKey, 8); - hmac_md5_init_limK_to_64(ses->ntlmv2_hash, 16, &context); + int rc; + struct crypto_blkcipher *tfm_arc4; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + + get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); + + tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); + if (!tfm_arc4 || IS_ERR(tfm_arc4)) { + cERROR(1, "could not allocate crypto API arc4\n"); + return PTR_ERR(tfm_arc4); + } - hmac_md5_update(v2_session_response+8, - sizeof(struct ntlmv2_resp) - 8, &context); + desc.tfm = tfm_arc4; - if (ses->tilen) - hmac_md5_update(ses->tiblob, ses->tilen, &context); + crypto_blkcipher_setkey(tfm_arc4, ses->auth_key.response, + CIFS_SESS_KEY_SIZE); - hmac_md5_final(v2_session_response, &context); -/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ + sg_init_one(&sgin, sec_key, CIFS_SESS_KEY_SIZE); + sg_init_one(&sgout, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + if (rc) { + cERROR(1, "could not encrypt session key rc: %d\n", rc); + crypto_free_blkcipher(tfm_arc4); + return rc; + } + + /* make secondary_key/nonce as session key */ + memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); + /* and make len as that of session key only */ + ses->auth_key.len = CIFS_SESS_KEY_SIZE; + + crypto_free_blkcipher(tfm_arc4); + + return 0; +} + +void +cifs_crypto_shash_release(struct TCP_Server_Info *server) +{ + if (server->secmech.md5) + crypto_free_shash(server->secmech.md5); + + if (server->secmech.hmacmd5) + crypto_free_shash(server->secmech.hmacmd5); + + kfree(server->secmech.sdeschmacmd5); + + kfree(server->secmech.sdescmd5); +} + +int +cifs_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + int rc; + unsigned int size; + + server->secmech.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); + if (!server->secmech.hmacmd5 || + IS_ERR(server->secmech.hmacmd5)) { + cERROR(1, "could not allocate crypto hmacmd5\n"); + return PTR_ERR(server->secmech.hmacmd5); + } + + server->secmech.md5 = crypto_alloc_shash("md5", 0, 0); + if (!server->secmech.md5 || IS_ERR(server->secmech.md5)) { + cERROR(1, "could not allocate crypto md5\n"); + rc = PTR_ERR(server->secmech.md5); + goto crypto_allocate_md5_fail; + } + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.hmacmd5); + server->secmech.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdeschmacmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n"); + rc = -ENOMEM; + goto crypto_allocate_hmacmd5_sdesc_fail; + } + server->secmech.sdeschmacmd5->shash.tfm = server->secmech.hmacmd5; + server->secmech.sdeschmacmd5->shash.flags = 0x0; + + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->secmech.md5); + server->secmech.sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!server->secmech.sdescmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n"); + rc = -ENOMEM; + goto crypto_allocate_md5_sdesc_fail; + } + server->secmech.sdescmd5->shash.tfm = server->secmech.md5; + server->secmech.sdescmd5->shash.flags = 0x0; + + return 0; + +crypto_allocate_md5_sdesc_fail: + kfree(server->secmech.sdeschmacmd5); + +crypto_allocate_hmacmd5_sdesc_fail: + crypto_free_shash(server->secmech.md5); + +crypto_allocate_md5_fail: + crypto_free_shash(server->secmech.hmacmd5); + + return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 3437163..76c8a90 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data, return -ENOMEM; spin_lock_init(&cifs_sb->tlink_tree_lock); - INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); + cifs_sb->tlink_tree = RB_ROOT; rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); if (rc) { @@ -318,12 +318,10 @@ cifs_alloc_inode(struct super_block *sb) return NULL; cifs_inode->cifsAttrs = 0x20; /* default */ cifs_inode->time = 0; - cifs_inode->write_behind_rc = 0; /* Until the file is open and we have gotten oplock info back from the server, can not assume caching of file data or metadata */ - cifs_inode->clientCanCacheRead = false; - cifs_inode->clientCanCacheAll = false; + cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ @@ -460,6 +458,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_printf(s, ",mfsymlinks"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + seq_printf(s, ",fsc"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); @@ -545,9 +545,9 @@ static const struct super_operations cifs_super_ops = { #endif }; -static int -cifs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry * +cifs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int rc; struct super_block *sb; @@ -557,18 +557,17 @@ cifs_get_sb(struct file_system_type *fs_type, cFYI(1, "Devname: %s flags: %d ", dev_name, flags); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); sb->s_flags = flags; rc = cifs_read_super(sb, data, dev_name, flags & MS_SILENT ? 1 : 0); if (rc) { deactivate_locked_super(sb); - return rc; + return ERR_PTR(rc); } sb->s_flags |= MS_ACTIVE; - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, @@ -634,7 +633,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) struct file_system_type cifs_fs_type = { .owner = THIS_MODULE, .name = "cifs", - .get_sb = cifs_get_sb, + .mount = cifs_do_mount, .kill_sb = kill_anon_super, /* .fs_flags */ }; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index f35795a..897b2b2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -112,5 +112,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.67" +#define CIFS_VERSION "1.68" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 3365e77f..b577bf0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -25,6 +25,9 @@ #include <linux/workqueue.h> #include "cifs_fs_sb.h" #include "cifsacl.h" +#include <crypto/internal/hash.h> +#include <linux/scatterlist.h> + /* * The sizes of various internal tables and strings */ @@ -74,7 +77,7 @@ * CIFS vfs client Status information (based on what we know.) */ - /* associated with each tcp and smb session */ +/* associated with each tcp and smb session */ enum statusEnum { CifsNew = 0, CifsGood, @@ -99,14 +102,29 @@ enum protocolEnum { struct session_key { unsigned int len; - union { - char ntlm[CIFS_SESS_KEY_SIZE + 16]; - char krb5[CIFS_SESS_KEY_SIZE + 16]; /* BB: length correct? */ - struct { - char key[16]; - struct ntlmv2_resp resp; - } ntlmv2; - } data; + char *response; +}; + +/* crypto security descriptor definition */ +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +/* crypto hashing related structure/fields, not specific to a sec mech */ +struct cifs_secmech { + struct crypto_shash *hmacmd5; /* hmac-md5 hash function */ + struct crypto_shash *md5; /* md5 hash function */ + struct sdesc *sdeschmacmd5; /* ctxt to generate ntlmv2 hash, CR1 */ + struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */ +}; + +/* per smb session structure/fields */ +struct ntlmssp_auth { + __u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */ + __u32 server_flags; /* sent by server in type 2 ntlmssp exchange */ + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlmssp */ }; struct cifs_cred { @@ -179,12 +197,14 @@ struct TCP_Server_Info { int capabilities; /* allow selective disabling of caps by smb sess */ int timeAdj; /* Adjust for difference in server time zone in sec */ __u16 CurrentMid; /* multiplex id - rotating counter */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */ /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* needed for CIFS PDU signature */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ u16 dialect; /* dialect index that server chose */ + struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ /* extended security flavors that server supports */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ @@ -222,11 +242,8 @@ struct cifsSesInfo { char userName[MAX_USERNAME_SIZE + 1]; char *domainName; char *password; - char cryptKey[CIFS_CRYPTO_KEY_SIZE]; struct session_key auth_key; - char ntlmv2_hash[16]; - unsigned int tilen; /* length of the target info blob */ - unsigned char *tiblob; /* target info blob in challenge response */ + struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */ bool need_reconnect:1; /* connection reset, uid now invalid */ }; /* no more than one of the following three session flags may be set */ @@ -319,7 +336,8 @@ struct cifsTconInfo { * "get" on the container. */ struct tcon_link { - unsigned long tl_index; + struct rb_node tl_rbnode; + uid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 @@ -395,16 +413,19 @@ struct cifsFileInfo { struct list_head llist; /* list of byte range locks we have. */ bool invalidHandle:1; /* file closed via session abend */ bool oplock_break_cancelled:1; - atomic_t count; /* reference count */ + int count; /* refcount protected by cifs_file_list_lock */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ }; -/* Take a reference on the file private data */ +/* + * Take a reference on the file private data. Must be called with + * cifs_file_list_lock held. + */ static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) { - atomic_inc(&cifs_file->count); + ++cifs_file->count; } void cifsFileInfo_put(struct cifsFileInfo *cifs_file); @@ -417,7 +438,6 @@ struct cifsInodeInfo { struct list_head lockList; /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; - int write_behind_rc; __u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */ unsigned long time; /* jiffies of last update/check of inode */ bool clientCanCacheRead:1; /* read oplock */ @@ -668,7 +688,7 @@ require use of the stronger protocol */ * GlobalMid_Lock protects: * list operations on pending_mid_q and oplockQ * updates to XID counters, multiplex id and SMB sequence numbers - * GlobalSMBSesLock protects: + * cifs_file_list_lock protects: * list operations on tcp and SMB session lists and tCon lists * f_owner.lock protects certain per file struct operations * mapping->page_lock protects certain per page operations diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b0f4b56..de36b09 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -131,9 +131,20 @@ #define CIFS_CRYPTO_KEY_SIZE (8) /* + * Size of the ntlm client response + */ +#define CIFS_AUTH_RESP_SIZE (24) + +/* * Size of the session key (crypto key encrypted with the password */ -#define CIFS_SESS_KEY_SIZE (24) +#define CIFS_SESS_KEY_SIZE (16) + +#define CIFS_CLIENT_CHALLENGE_SIZE (8) +#define CIFS_SERVER_CHALLENGE_SIZE (8) +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) +#define CIFS_NTHASH_SIZE (16) /* * Maximum user name length diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e593c40..db961dc 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -104,6 +104,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, @@ -129,10 +130,12 @@ extern int cifs_get_file_info_unix(struct file *filp); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, int xid); -extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, +extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_acl(struct inode *inode, const char *path, __u64); +extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, + const char *, u32 *); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, const char *); @@ -362,13 +365,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); extern int cifs_verify_signature(struct smb_hdr *, - const struct session_key *session_key, + struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int cifs_calculate_session_key(struct session_key *key, const char *rn, - const char *pass); -extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); -extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *, - const struct nls_table *); +extern void SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *); +extern int setup_ntlm_response(struct cifsSesInfo *); +extern int setup_ntlmv2_rsp(struct cifsSesInfo *, const struct nls_table *); +extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); +extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern int calc_seckey(struct cifsSesInfo *); + #ifdef CONFIG_CIFS_WEAK_PW_HASH extern void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e98f1f3..2f2632b 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -503,7 +503,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) if (rsp->EncryptionKeyLength == cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { - memcpy(ses->cryptKey, rsp->EncryptionKey, + memcpy(ses->server->cryptkey, rsp->EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if (server->secMode & SECMODE_PW_ENCRYPT) { rc = -EIO; /* need cryptkey unless plain text */ @@ -574,7 +574,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { - memcpy(ses->cryptKey, pSMBr->u.EncryptionKey, + memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && (pSMBr->EncryptionKeyLength == 0)) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 7e73176..32fa4d9 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -116,6 +116,7 @@ struct smb_vol { static int ipv4_connect(struct TCP_Server_Info *server); static int ipv6_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); /* @@ -175,6 +176,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } server->sequence_number = 0; server->session_estab = false; + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; spin_lock(&GlobalMid_Lock); list_for_each(tmp, &server->pending_mid_q) { @@ -1064,7 +1068,7 @@ cifs_parse_mount_options(char *options, const char *devname, } i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, value, strlen(value)); - if (i < 0) { + if (i == 0) { printk(KERN_WARNING "CIFS: Could not parse" " srcaddr: %s\n", value); @@ -1348,6 +1352,11 @@ cifs_parse_mount_options(char *options, const char *devname, "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); } else if (strnicmp(data, "fsc", 3) == 0) { +#ifndef CONFIG_CIFS_FSCACHE + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" + "kernel config option set"); + return 1; +#endif vol->fsc = true; } else if (strnicmp(data, "mfsymlinks", 10) == 0) { vol->mfsymlinks = true; @@ -1560,8 +1569,13 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); + cifs_crypto_shash_release(server); cifs_fscache_release_client_cookie(server); + kfree(server->session_key.response); + server->session_key.response = NULL; + server->session_key.len = 0; + task = xchg(&server->tsk, NULL); if (task) force_sig(SIGKILL, task); @@ -1614,10 +1628,16 @@ cifs_get_tcp_session(struct smb_vol *volume_info) goto out_err; } + rc = cifs_crypto_shash_allocate(tcp_ses); + if (rc) { + cERROR(1, "could not setup hash structures rc %d", rc); + goto out_err; + } + tcp_ses->hostname = extract_hostname(volume_info->UNC); if (IS_ERR(tcp_ses->hostname)) { rc = PTR_ERR(tcp_ses->hostname); - goto out_err; + goto out_err_crypto_release; } tcp_ses->noblocksnd = volume_info->noblocksnd; @@ -1661,7 +1681,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } if (rc < 0) { cERROR(1, "Error connecting to socket. Aborting operation"); - goto out_err; + goto out_err_crypto_release; } /* @@ -1675,7 +1695,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) rc = PTR_ERR(tcp_ses->tsk); cERROR(1, "error %d create cifsd thread", rc); module_put(THIS_MODULE); - goto out_err; + goto out_err_crypto_release; } /* thread spawned, put it on the list */ @@ -1687,6 +1707,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info) return tcp_ses; +out_err_crypto_release: + cifs_crypto_shash_release(tcp_ses); + out_err: if (tcp_ses) { if (!IS_ERR(tcp_ses->hostname)) @@ -1801,8 +1824,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (ses == NULL) goto get_ses_fail; - ses->tilen = 0; - ses->tiblob = NULL; /* new SMB session uses our server ref */ ses->server = server; if (server->addr.sockAddr6.sin6_family == AF_INET6) @@ -1823,10 +1844,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) goto get_ses_fail; } if (volume_info->domainname) { - int len = strlen(volume_info->domainname); - ses->domainName = kmalloc(len + 1, GFP_KERNEL); - if (ses->domainName) - strcpy(ses->domainName, volume_info->domainname); + ses->domainName = kstrdup(volume_info->domainname, GFP_KERNEL); + if (!ses->domainName) + goto get_ses_fail; } ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; @@ -2886,24 +2906,16 @@ remote_path_check: goto mount_fail_check; } - tlink->tl_index = pSesInfo->linux_uid; + tlink->tl_uid = pSesInfo->linux_uid; tlink->tl_tcon = tcon; tlink->tl_time = jiffies; set_bit(TCON_LINK_MASTER, &tlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); - rc = radix_tree_preload(GFP_KERNEL); - if (rc == -ENOMEM) { - kfree(tlink); - goto mount_fail_check; - } - + cifs_sb->master_tlink = tlink; spin_lock(&cifs_sb->tlink_tree_lock); - radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); - radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid, - CIFS_TLINK_MASTER_TAG); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); @@ -2985,13 +2997,13 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, #ifdef CONFIG_CIFS_WEAK_PW_HASH if ((global_secflags & CIFSSEC_MAY_LANMAN) && (ses->server->secType == LANMAN)) - calc_lanman_hash(tcon->password, ses->cryptKey, + calc_lanman_hash(tcon->password, ses->server->cryptkey, ses->server->secMode & SECMODE_PW_ENCRYPT ? true : false, bcc_ptr); else #endif /* CIFS_WEAK_PW_HASH */ - SMBNTencrypt(tcon->password, ses->cryptKey, bcc_ptr); + SMBNTencrypt(tcon->password, ses->server->cryptkey, bcc_ptr); bcc_ptr += CIFS_SESS_KEY_SIZE; if (ses->capabilities & CAP_UNICODE) { @@ -3093,32 +3105,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, int cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) { - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; char *tmp; - struct tcon_link *tlink[8]; - unsigned long index = 0; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); - for (i = 0; i < ret; i++) - cifs_put_tlink(tlink[i]); - } while (ret != 0); + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); tmp = cifs_sb->prepath; cifs_sb->prepathlen = 0; @@ -3178,10 +3183,11 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses, } else { mutex_lock(&ses->server->srv_mutex); if (!server->session_estab) { - memcpy(&server->session_key.data, - &ses->auth_key.data, ses->auth_key.len); + server->session_key.response = ses->auth_key.response; server->session_key.len = ses->auth_key.len; - ses->server->session_estab = true; + server->sequence_number = 0x2; + server->session_estab = true; + ses->auth_key.response = NULL; } mutex_unlock(&server->srv_mutex); @@ -3192,6 +3198,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *ses, spin_unlock(&GlobalMid_Lock); } + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + kfree(ses->ntlmssp); + ses->ntlmssp = NULL; + return rc; } @@ -3250,22 +3262,10 @@ out: return tcon; } -static struct tcon_link * +static inline struct tcon_link * cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) { - struct tcon_link *tlink; - unsigned int ret; - - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink, - 0, 1, CIFS_TLINK_MASTER_TAG); - spin_unlock(&cifs_sb->tlink_tree_lock); - - /* the master tcon should always be present */ - if (ret == 0) - BUG(); - - return tlink; + return cifs_sb->master_tlink; } struct cifsTconInfo * @@ -3281,6 +3281,47 @@ cifs_sb_tcon_pending_wait(void *unused) return signal_pending(current) ? -ERESTARTSYS : 0; } +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + /* * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the * current task. @@ -3288,7 +3329,7 @@ cifs_sb_tcon_pending_wait(void *unused) * If the superblock doesn't refer to a multiuser mount, then just return * the master tcon for the mount. * - * First, search the radix tree for an existing tcon for this fsuid. If one + * First, search the rbtree for an existing tcon for this fsuid. If one * exists, then check to see if it's pending construction. If it is then wait * for construction to complete. Once it's no longer pending, check to see if * it failed and either return an error or retry construction, depending on @@ -3301,14 +3342,14 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - unsigned long fsuid = (unsigned long) current_fsuid(); + uid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); spin_lock(&cifs_sb->tlink_tree_lock); - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); @@ -3317,36 +3358,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); if (newtlink == NULL) return ERR_PTR(-ENOMEM); - newtlink->tl_index = fsuid; + newtlink->tl_uid = fsuid; newtlink->tl_tcon = ERR_PTR(-EACCES); set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); cifs_get_tlink(newtlink); - ret = radix_tree_preload(GFP_KERNEL); - if (ret != 0) { - kfree(newtlink); - return ERR_PTR(ret); - } - spin_lock(&cifs_sb->tlink_tree_lock); /* was one inserted after previous search? */ - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) { cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); kfree(newtlink); goto wait_for_construction; } - ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink); - spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); - if (ret) { - kfree(newtlink); - return ERR_PTR(ret); - } tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, @@ -3392,39 +3421,39 @@ cifs_prune_tlinks(struct work_struct *work) { struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); - struct tcon_link *tlink[8]; - unsigned long now = jiffies; - unsigned long index = 0; - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || - atomic_read(&tlink[i]->tl_count) != 0 || - time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, - now)) { - tlink[i] = NULL; - continue; - } - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. + */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; - for (i = 0; i < ret; i++) { - if (tlink[i] != NULL) - cifs_put_tlink(tlink[i]); - } - } while (ret != 0); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 0eb8702..548f062 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Search for server name delimiter */ sep = memchr(hostname, '\\', len); if (sep) - len = sep - unc; + len = sep - hostname; else cFYI(1, "%s: probably server name is whole unc: %s", __func__, unc); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 45af003..b857ce5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -131,8 +131,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, /* BB no need to lock inode until after invalidate since namei code should already have it locked? */ rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) - pCifsInode->write_behind_rc = rc; + mapping_set_error(inode->i_mapping, rc); } cFYI(1, "invalidating remote inode since open detected it " "changed"); @@ -147,12 +146,7 @@ client_can_cache: rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, NULL); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); return rc; } @@ -232,6 +226,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, if (pCifsFile == NULL) return pCifsFile; + pCifsFile->count = 1; pCifsFile->netfid = fileHandle; pCifsFile->pid = current->tgid; pCifsFile->uid = current_fsuid(); @@ -242,7 +237,6 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->count, 1); INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); spin_lock(&cifs_file_list_lock); @@ -254,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); spin_unlock(&cifs_file_list_lock); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); file->private_data = pCifsFile; return pCifsFile; @@ -267,16 +256,18 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, /* * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. + * the filehandle out on the server. Must be called without holding + * cifs_file_list_lock. */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); - struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); - if (!atomic_dec_and_test(&cifs_file->count)) { + if (--cifs_file->count > 0) { spin_unlock(&cifs_file_list_lock); return; } @@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - cifsi->clientCanCacheRead = false; - cifsi->clientCanCacheAll = false; + cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -605,11 +595,8 @@ reopen_success: if (can_flush) { rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) - CIFS_I(inode)->write_behind_rc = rc; + mapping_set_error(inode->i_mapping, rc); - pCifsInode->clientCanCacheAll = false; - pCifsInode->clientCanCacheRead = false; if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); @@ -623,18 +610,9 @@ reopen_success: invalidate the current end of file on the server we can not go to the server to get the new inod info */ - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - pCifsFile->dentry->d_inode); - } else if ((oplock & 0xF) == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = true; - pCifsInode->clientCanCacheAll = false; - } else { - pCifsInode->clientCanCacheRead = false; - pCifsInode->clientCanCacheAll = false; - } + + cifs_set_oplock_level(pCifsInode, oplock); + cifs_relock_file(pCifsFile); reopen_error_exit: @@ -776,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } netfid = ((struct cifsFileInfo *)file->private_data)->netfid; if ((tcon->ses->capabilities & CAP_UNIX) && @@ -957,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { + struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; @@ -964,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1030,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_stats_bytes_written(pTcon, total_written); - /* since the write may have blocked check these pointers again */ - if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { - struct inode *inode = file->f_path.dentry->d_inode; /* Do not update local mtime - server will set its actual value on write - * inode->i_ctime = inode->i_mtime = - * current_fs_time(inode->i_sb);*/ - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > file->f_path.dentry->d_inode->i_size) - i_size_write(file->f_path.dentry->d_inode, - *poffset); - spin_unlock(&inode->i_lock); - } - mark_inode_dirty_sync(file->f_path.dentry->d_inode); + * inode->i_ctime = inode->i_mtime = + * current_fs_time(inode->i_sb);*/ + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); } + mark_inode_dirty_sync(inode); + FreeXid(xid); return total_written; } @@ -1179,7 +1148,7 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb; bool any_available = false; int rc; @@ -1193,6 +1162,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, return NULL; } + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; @@ -1353,6 +1324,7 @@ static int cifs_writepages(struct address_space *mapping, if (!experimEnabled && tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { cifsFileInfo_put(open_file); + kfree(iov); return generic_writepages(mapping, wbc); } cifsFileInfo_put(open_file); @@ -1478,12 +1450,7 @@ retry: if (rc || bytes_written < bytes_to_write) { cERROR(1, "Write2 ret %d, wrote %d", rc, bytes_written); - /* BB what if continued retry is - requested via mount flags? */ - if (rc == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else - set_bit(AS_EIO, &mapping->flags); + mapping_set_error(mapping, rc); } else { cifs_stats_bytes_written(tcon, bytes_written); } @@ -1628,11 +1595,10 @@ int cifs_fsync(struct file *file, int datasync) rc = filemap_write_and_wait(inode->i_mapping); if (rc == 0) { - rc = CIFS_I(inode)->write_behind_rc; - CIFS_I(inode)->write_behind_rc = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + tcon = tlink_tcon(smbfile->tlink); - if (!rc && tcon && smbfile && - !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) rc = CIFSSMBFlush(xid, tcon, smbfile->netfid); } @@ -1677,21 +1643,8 @@ int cifs_flush(struct file *file, fl_owner_t id) struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; - /* Rather than do the steps manually: - lock the inode for writing - loop through pages looking for write behind data (dirty pages) - coalesce into contiguous 16K (or smaller) chunks to write to server - send to server (prefer in parallel) - deal with writebehind errors - unlock inode for writing - filemapfdatawrite appears easier for the time being */ - - rc = filemap_fdatawrite(inode->i_mapping); - /* reset wb rc if we were able to write out dirty pages */ - if (!rc) { - rc = CIFS_I(inode)->write_behind_rc; - CIFS_I(inode)->write_behind_rc = 0; - } + if (file->f_mode & FMODE_WRITE) + rc = filemap_write_and_wait(inode->i_mapping); cFYI(1, "Flush inode %p file %p rc %d", inode, file, rc); @@ -2270,7 +2223,7 @@ void cifs_oplock_break(struct work_struct *work) oplock_break); struct inode *inode = cfile->dentry->d_inode; struct cifsInodeInfo *cinode = CIFS_I(inode); - int rc, waitrc = 0; + int rc = 0; if (inode && S_ISREG(inode->i_mode)) { if (cinode->clientCanCacheRead) @@ -2279,13 +2232,10 @@ void cifs_oplock_break(struct work_struct *work) break_lease(inode, O_WRONLY); rc = filemap_fdatawrite(inode->i_mapping); if (cinode->clientCanCacheRead == 0) { - waitrc = filemap_fdatawait(inode->i_mapping); + rc = filemap_fdatawait(inode->i_mapping); + mapping_set_error(inode->i_mapping, rc); invalidate_remote_inode(inode); } - if (!rc) - rc = waitrc; - if (rc) - cinode->write_behind_rc = rc; cFYI(1, "Oplock flush inode %p rc %d", inode, rc); } @@ -2304,7 +2254,7 @@ void cifs_oplock_break(struct work_struct *work) /* * We might have kicked in before is_valid_oplock_break() * finished grabbing reference for us. Make sure it's done by - * waiting for GlobalSMSSeslock. + * waiting for cifs_file_list_lock. */ spin_lock(&cifs_file_list_lock); spin_unlock(&cifs_file_list_lock); @@ -2312,6 +2262,7 @@ void cifs_oplock_break(struct work_struct *work) cifs_oplock_break_put(cfile); } +/* must be called while holding cifs_file_list_lock */ void cifs_oplock_break_get(struct cifsFileInfo *cfile) { cifs_sb_active(cfile->dentry->d_sb); @@ -2320,8 +2271,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile) void cifs_oplock_break_put(struct cifsFileInfo *cfile) { + struct super_block *sb = cfile->dentry->d_sb; + cifsFileInfo_put(cfile); - cifs_sb_deactive(cfile->dentry->d_sb); + cifs_sb_deactive(sb); } const struct address_space_operations cifs_addr_ops = { diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a2ad94e..297a43d 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -2,7 +2,7 @@ * fs/cifs/fscache.c - CIFS filesystem cache interface * * Copyright (c) 2010 Novell, Inc. - * Author(s): Suresh Jayaraman (sjayaraman@suse.de> + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifsi->fscache) return; - cifsi->fscache = fscache_acquire_cookie(tcon->fscache, + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { + cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, + cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, cifsi->fscache); + } } void cifs_fscache_release_inode_cookie(struct inode *inode) @@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) { if ((filp->f_flags & O_ACCMODE) != O_RDONLY) cifs_fscache_disable_inode_cookie(inode); - else { + else cifs_fscache_enable_inode_cookie(inode); - cFYI(1, "CIFS: fscache inode cookie set"); - } } void cifs_fscache_reset_inode_cookie(struct inode *inode) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9497930..28cb6e7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -689,8 +689,13 @@ int cifs_get_inode_info(struct inode **pinode, #ifdef CONFIG_CIFS_EXPERIMENTAL /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - cFYI(1, "Getting mode bits from ACL"); - cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, + pfid); + if (rc) { + cFYI(1, "%s: Getting ACL failed with error: %d", + __func__, rc); + goto cgii_exit; + } } #endif @@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); - if (!inode) - return ERR_PTR(rc); + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } #ifdef CONFIG_CIFS_FSCACHE /* populate tcon->resource_id */ @@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; } else if (rc) { - kfree(full_path); - _FreeXid(xid); iget_failed(inode); - return ERR_PTR(rc); + inode = ERR_PTR(rc); } - +out: kfree(full_path); /* can not call macro FreeXid here since in a void func * TODO: This is no longer true @@ -1670,7 +1675,9 @@ cifs_inode_needs_reval(struct inode *inode) return false; } -/* check invalid_mapping flag and zap the cache if it's set */ +/* + * Zap the cache. Called when invalid_mapping flag is set. + */ static void cifs_invalidate_mapping(struct inode *inode) { @@ -1682,8 +1689,7 @@ cifs_invalidate_mapping(struct inode *inode) /* write back any cached data */ if (inode->i_mapping && inode->i_mapping->nrpages != 0) { rc = filemap_write_and_wait(inode->i_mapping); - if (rc) - cifs_i->write_behind_rc = rc; + mapping_set_error(inode->i_mapping, rc); } invalidate_remote_inode(inode); cifs_fscache_reset_inode_cookie(inode); @@ -1943,10 +1949,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) * the flush returns error? */ rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + mapping_set_error(inode->i_mapping, rc); + rc = 0; if (attrs->ia_valid & ATTR_SIZE) { rc = cifs_set_file_size(inode, attrs, xid, full_path); @@ -2087,10 +2091,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) * the flush returns error? */ rc = filemap_write_and_wait(inode->i_mapping); - if (rc != 0) { - cifsInode->write_behind_rc = rc; - rc = 0; - } + mapping_set_error(inode->i_mapping, rc); + rc = 0; if (attrs->ia_valid & ATTR_SIZE) { rc = cifs_set_file_size(inode, attrs, xid, full_path); @@ -2120,9 +2122,14 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { rc = 0; #ifdef CONFIG_CIFS_EXPERIMENTAL - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - rc = mode_to_acl(inode, full_path, mode); - else + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = mode_to_cifs_acl(inode, full_path, mode); + if (rc) { + cFYI(1, "%s: Setting ACL failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } else #endif if (((mode & S_IWUGO) == 0) && (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { @@ -2182,7 +2189,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) setattr_copy(inode, attrs); mark_inode_dirty(inode); - return 0; cifs_setattr_exit: kfree(full_path); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 077bf75..0c98672 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct cifs_sb_info *cifs_sb; #ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; - struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); + struct cifsTconInfo *tcon; __u64 ExtAttrBits = 0; __u64 ExtAttrMask = 0; - __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + __u64 caps; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); @@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (pSMBFile == NULL) - break; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, &ExtAttrBits, &ExtAttrMask); if (rc == 0) @@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { if (get_user(ExtAttrBits, (int __user *)arg)) { rc = -EFAULT; break; } - if (pSMBFile == NULL) - break; /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, extAttrBits, &ExtAttrMask);*/ } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1c681f6..43f1028 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,15 +569,14 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cFYI(1, "file id match, oplock break"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - pCifsInode->clientCanCacheAll = false; - if (pSMB->OplockLevel == 0) - pCifsInode->clientCanCacheRead = false; + cifs_set_oplock_level(pCifsInode, + pSMB->OplockLevel); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing * succeeded. cifs_oplock_break() will - * synchronize using GlobalSMSSeslock. + * synchronize using cifs_file_list_lock. */ if (queue_work(system_nrt_wq, &netfile->oplock_break)) @@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb_master_tcon(cifs_sb)->treeName); } } + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ef7bb7b..32d300e 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file) char *full_path = NULL; struct cifsFileInfo *cifsFile; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - struct tcon_link *tlink; + struct tcon_link *tlink = NULL; struct cifsTconInfo *pTcon; - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); - - if (file->private_data == NULL) - file->private_data = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (file->private_data == NULL) { - rc = -ENOMEM; - goto error_exit; + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cifsFile == NULL) { + rc = -ENOMEM; + goto error_exit; + } + file->private_data = cifsFile; + cifsFile->tlink = cifs_get_tlink(tlink); + pTcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + pTcon = tlink_tcon(cifsFile->tlink); } - cifsFile = file->private_data; cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - cifsFile->tlink = cifs_get_tlink(tlink); full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 2a11efd..7b01d3f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -32,9 +32,6 @@ #include <linux/slab.h> #include "cifs_spnego.h" -extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, - unsigned char *p24); - /* * Checks if this is the first smb session to be reconnected after * the socket has been reestablished (so we know whether to use vc 0). @@ -402,23 +399,22 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, return -EINVAL; } - memcpy(ses->cryptKey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); + memcpy(ses->ntlmssp->cryptkey, pblob->Challenge, CIFS_CRYPTO_KEY_SIZE); /* BB we could decode pblob->NegotiateFlags; some may be useful */ /* In particular we can examine sign flags */ /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ - + ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); tilen = cpu_to_le16(pblob->TargetInfoArray.Length); - ses->tilen = tilen; - if (ses->tilen) { - ses->tiblob = kmalloc(tilen, GFP_KERNEL); - if (!ses->tiblob) { + if (tilen) { + ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); + if (!ses->auth_key.response) { cERROR(1, "Challenge target info allocation failure"); - ses->tilen = 0; return -ENOMEM; } - memcpy(ses->tiblob, bcc_ptr + tioffset, ses->tilen); + memcpy(ses->auth_key.response, bcc_ptr + tioffset, tilen); + ses->auth_key.len = tilen; } return 0; @@ -443,10 +439,12 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_NTLM; if (ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { flags |= NTLMSSP_NEGOTIATE_SIGN; - if (ses->server->secMode & SECMODE_SIGN_REQUIRED) - flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + if (!ses->server->session_estab) + flags |= NTLMSSP_NEGOTIATE_KEY_XCH | + NTLMSSP_NEGOTIATE_EXTENDED_SEC; + } sec_blob->NegotiateFlags |= cpu_to_le32(flags); @@ -469,11 +467,9 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, const struct nls_table *nls_cp) { int rc; - unsigned int size; AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; __u32 flags; unsigned char *tmp; - struct ntlmv2_resp ntlmv2_response = {}; memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -497,25 +493,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.MaximumLength = 0; sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); - rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp); + rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLMSSP authentication", rc); goto setup_ntlmv2_ret; } - size = sizeof(struct ntlmv2_resp); - memcpy(tmp, (char *)&ntlmv2_response, size); - tmp += size; - if (ses->tilen > 0) { - memcpy(tmp, ses->tiblob, ses->tilen); - tmp += ses->tilen; - } + memcpy(tmp, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + tmp += ses->auth_key.len - CIFS_SESS_KEY_SIZE; - sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + ses->tilen); + sec_blob->NtChallengeResponse.Length = + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); sec_blob->NtChallengeResponse.MaximumLength = - cpu_to_le16(size + ses->tilen); - kfree(ses->tiblob); - ses->tiblob = NULL; - ses->tilen = 0; + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); if (ses->domainName == NULL) { sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); @@ -554,9 +544,19 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); - sec_blob->SessionKey.Length = 0; - sec_blob->SessionKey.MaximumLength = 0; + if ((ses->ntlmssp->server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && + !calc_seckey(ses)) { + memcpy(tmp, ses->ntlmssp->ciphertext, CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.MaximumLength = + cpu_to_le16(CIFS_CPHTXT_SIZE); + tmp += CIFS_CPHTXT_SIZE; + } else { + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + } setup_ntlmv2_ret: *buflen = tmp - pbuffer; @@ -600,8 +600,16 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, return -EINVAL; type = ses->server->secType; - cFYI(1, "sess setup type %d", type); + if (type == RawNTLMSSP) { + /* if memory allocation is successful, caller of this function + * frees it. + */ + ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL); + if (!ses->ntlmssp) + return -ENOMEM; + } + ssetup_ntlmssp_authenticate: if (phase == NtLmChallenge) phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ @@ -666,10 +674,14 @@ ssetup_ntlmssp_authenticate: /* no capabilities flags in old lanman negotiation */ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); - /* BB calculate hash with password */ - /* and copy into bcc */ - calc_lanman_hash(ses->password, ses->cryptKey, + /* Calculate hash with password and copy into bcc_ptr. + * Encryption Key (stored as in cryptkey) gets used if the + * security mode bit in Negottiate Protocol response states + * to use challenge/response method (i.e. Password bit is 1). + */ + + calc_lanman_hash(ses->password, ses->server->cryptkey, ses->server->secMode & SECMODE_PW_ENCRYPT ? true : false, lnm_session_key); @@ -687,24 +699,27 @@ ssetup_ntlmssp_authenticate: ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); #endif } else if (type == NTLM) { - char ntlm_session_key[CIFS_SESS_KEY_SIZE]; - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); + cpu_to_le16(CIFS_AUTH_RESP_SIZE); pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); + cpu_to_le16(CIFS_AUTH_RESP_SIZE); + + /* calculate ntlm response and session key */ + rc = setup_ntlm_response(ses); + if (rc) { + cERROR(1, "Error %d during NTLM authentication", rc); + goto ssetup_exit; + } - /* calculate session key */ - SMBNTencrypt(ses->password, ses->cryptKey, ntlm_session_key); + /* copy ntlm response */ + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + CIFS_AUTH_RESP_SIZE); + bcc_ptr += CIFS_AUTH_RESP_SIZE; - cifs_calculate_session_key(&ses->auth_key, - ntlm_session_key, ses->password); - /* copy session key */ - memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; - memcpy(bcc_ptr, (char *)ntlm_session_key, CIFS_SESS_KEY_SIZE); - bcc_ptr += CIFS_SESS_KEY_SIZE; if (ses->capabilities & CAP_UNICODE) { /* unicode strings must be word aligned */ if (iov[0].iov_len % 2) { @@ -715,47 +730,26 @@ ssetup_ntlmssp_authenticate: } else ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); } else if (type == NTLMv2) { - char *v2_sess_key = - kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL); - - /* BB FIXME change all users of v2_sess_key to - struct ntlmv2_resp */ - - if (v2_sess_key == NULL) { - rc = -ENOMEM; - goto ssetup_exit; - } - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); /* LM2 password would be here if we supported it */ pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - /* cpu_to_le16(LM2_SESS_KEY_SIZE); */ - /* calculate session key */ - rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); + /* calculate nlmv2 response and session key */ + rc = setup_ntlmv2_rsp(ses, nls_cp); if (rc) { cERROR(1, "Error %d during NTLMv2 authentication", rc); - kfree(v2_sess_key); goto ssetup_exit; } - memcpy(bcc_ptr, (char *)v2_sess_key, - sizeof(struct ntlmv2_resp)); - bcc_ptr += sizeof(struct ntlmv2_resp); - kfree(v2_sess_key); + memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, + ses->auth_key.len - CIFS_SESS_KEY_SIZE); + bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; + /* set case sensitive password length after tilen may get * assigned, tilen is 0 otherwise. */ pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(sizeof(struct ntlmv2_resp) + ses->tilen); - if (ses->tilen > 0) { - memcpy(bcc_ptr, ses->tiblob, ses->tilen); - bcc_ptr += ses->tilen; - /* we never did allocate ses->domainName to free */ - kfree(ses->tiblob); - ses->tiblob = NULL; - ses->tilen = 0; - } + cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE); if (ses->capabilities & CAP_UNICODE) { if (iov[0].iov_len % 2) { @@ -768,6 +762,7 @@ ssetup_ntlmssp_authenticate: } else if (type == Kerberos) { #ifdef CONFIG_CIFS_UPCALL struct cifs_spnego_msg *msg; + spnego_key = cifs_get_spnego_key(ses); if (IS_ERR(spnego_key)) { rc = PTR_ERR(spnego_key); @@ -785,16 +780,17 @@ ssetup_ntlmssp_authenticate: rc = -EKEYREJECTED; goto ssetup_exit; } - /* bail out if key is too long */ - if (msg->sesskey_len > - sizeof(ses->auth_key.data.krb5)) { - cERROR(1, "Kerberos signing key too long (%u bytes)", - msg->sesskey_len); - rc = -EOVERFLOW; + + ses->auth_key.response = kmalloc(msg->sesskey_len, GFP_KERNEL); + if (!ses->auth_key.response) { + cERROR(1, "Kerberos can't allocate (%u bytes) memory", + msg->sesskey_len); + rc = -ENOMEM; goto ssetup_exit; } + memcpy(ses->auth_key.response, msg->data, msg->sesskey_len); ses->auth_key.len = msg->sesskey_len; - memcpy(ses->auth_key.data.krb5, msg->data, msg->sesskey_len); + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities = cpu_to_le32(capabilities); @@ -897,8 +893,6 @@ ssetup_ntlmssp_authenticate: CIFS_STD_OP /* not long */ | CIFS_LOG_ERROR); /* SMB request buf freed in SendReceive2 */ - cFYI(1, "ssetup rc from sendrecv2 is %d", rc); - pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; smb_buf = (struct smb_hdr *)iov[0].iov_base; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index a66c91e..e0588cd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(midQ->resp_buf, - &ses->server->session_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - &ses->server->session_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - &ses->server->session_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index a264b74..eae2a14 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -30,10 +30,11 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_USER_PREFIX "user." #define CIFS_XATTR_SYSTEM_PREFIX "system." #define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX ".security" +#define CIFS_XATTR_SECURITY_PREFIX "security." #define CIFS_XATTR_TRUSTED_PREFIX "trusted." #define XATTR_TRUSTED_PREFIX_LEN 8 #define XATTR_SECURITY_PREFIX_LEN 9 @@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); -#ifdef CONFIG_CIFS_EXPERIMENTAL - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - __u16 fid; - int oplock = 0; - struct cifs_ntsd *pacl = NULL; - __u32 buflen = 0; - if (experimEnabled) - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, GENERIC_READ, 0, &fid, - &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - /* else rc is EOPNOTSUPP from above */ - - if (rc == 0) { - rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl, - &buflen); - CIFSSMBClose(xid, pTcon, fid); - } - } -#endif /* EXPERIMENTAL */ #else - cFYI(1, "query POSIX ACL not supported yet"); + cFYI(1, "Query POSIX ACL not supported yet"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "query POSIX default ACL not supported yet"); -#endif + cFYI(1, "Query POSIX default ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + pacl = get_cifs_acl(cifs_sb, direntry->d_inode, + full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cERROR(1, "%s: error %zd getting sec desc", + __func__, rc); + } else { + if (ea_value) { + if (acllen > buf_size) + acllen = -ERANGE; + else + memcpy(ea_value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#else + cFYI(1, "Query CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 7993b96..5ea57c8 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -306,16 +306,16 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) /* init_coda: used by filesystems.c to register coda */ -static int coda_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *coda_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); + return mount_nodev(fs_type, flags, data, coda_fill_super); } struct file_system_type coda_fs_type = { .owner = THIS_MODULE, .name = "coda", - .get_sb = coda_get_sb, + .mount = coda_mount, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; diff --git a/fs/compat.c b/fs/compat.c index 52cfeb6..eb1740a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -49,6 +49,7 @@ #include <linux/eventpoll.h> #include <linux/fs_struct.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -606,14 +607,14 @@ ssize_t compat_rw_copy_check_uvector(int type, /* * Single unix specification: * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. The total length is fitting an ssize_t + * ssize_t. * - * Be careful here because iov_len is a size_t not an ssize_t + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. */ tot_len = 0; ret = -EINVAL; for (seg = 0; seg < nr_segs; seg++) { - compat_ssize_t tmp = tot_len; compat_uptr_t buf; compat_ssize_t len; @@ -624,13 +625,13 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. */ goto out; - tot_len += len; - if (tot_len < tmp) /* maths overflow on the compat_ssize_t */ - goto out; if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; iov->iov_base = compat_ptr(buf); iov->iov_len = (compat_size_t) len; uvector++; @@ -1349,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); } } return i; @@ -1390,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -1406,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; -#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. */ - ret = -E2BIG; - goto out; - } -#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { - /* We've exceed the stack rlimit. */ + page = get_arg_page(bprm, pos, 1); + if (!page) { ret = -E2BIG; goto out; } @@ -1538,8 +1539,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) + if (bprm->mm) { + acct_arg_size(bprm, 0); mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 410ed18..a60579b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,7 +19,6 @@ #include <linux/compiler.h> #include <linux/sched.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/ioctl.h> #include <linux/if.h> #include <linux/if_bridge.h> diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 8c8d642..7d3607f 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -104,16 +104,16 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int configfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *configfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); + return mount_single(fs_type, flags, data, configfs_fill_super); } static struct file_system_type configfs_fs_type = { .owner = THIS_MODULE, .name = "configfs", - .get_sb = configfs_get_sb, + .mount = configfs_do_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 1e7a330..32fd5fe 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -533,17 +533,16 @@ static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static int cramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *cramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); } static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", - .get_sb = cramfs_get_sb, + .mount = cramfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index a4ed838..37a8ca7 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -135,17 +135,17 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); } -static int debug_get_sb(struct file_system_type *fs_type, +static struct dentry *debug_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); + return mount_single(fs_type, flags, data, debug_fill_super); } static struct file_system_type debug_fs_type = { .owner = THIS_MODULE, .name = "debugfs", - .get_sb = debug_get_sb, + .mount = debug_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 8b3ffd5..1bb547c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -331,7 +331,7 @@ static int compare_init_pts_sb(struct super_block *s, void *p) } /* - * devpts_get_sb() + * devpts_mount() * * If the '-o newinstance' mount option was specified, mount a new * (private) instance of devpts. PTYs created in this instance are @@ -345,20 +345,20 @@ static int compare_init_pts_sb(struct super_block *s, void *p) * semantics in devpts while preserving backward compatibility of the * current 'single-namespace' semantics. i.e all mounts of devpts * without the 'newinstance' mount option should bind to the initial - * kernel mount, like get_sb_single(). + * kernel mount, like mount_single(). * * Mounts with 'newinstance' option create a new, private namespace. * * NOTE: * - * For single-mount semantics, devpts cannot use get_sb_single(), - * because get_sb_single()/sget() find and use the super-block from + * For single-mount semantics, devpts cannot use mount_single(), + * because mount_single()/sget() find and use the super-block from * the most recent mount of devpts. But that recent mount may be a - * 'newinstance' mount and get_sb_single() would pick the newinstance + * 'newinstance' mount and mount_single() would pick the newinstance * super-block instead of the initial super-block. */ -static int devpts_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *devpts_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int error; struct pts_mount_opts opts; @@ -366,7 +366,7 @@ static int devpts_get_sb(struct file_system_type *fs_type, error = parse_mount_options(data, PARSE_MOUNT, &opts); if (error) - return error; + return ERR_PTR(error); if (opts.newinstance) s = sget(fs_type, NULL, set_anon_super, NULL); @@ -374,7 +374,7 @@ static int devpts_get_sb(struct file_system_type *fs_type, s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL); if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); if (!s->s_root) { s->s_flags = flags; @@ -390,13 +390,11 @@ static int devpts_get_sb(struct file_system_type *fs_type, if (error) goto out_undo_sget; - simple_set_mnt(mnt, s); - - return 0; + return dget(s->s_root); out_undo_sget: deactivate_locked_super(s); - return error; + return ERR_PTR(error); } #else @@ -404,10 +402,10 @@ out_undo_sget: * This supports only the legacy single-instance semantics (no * multiple-instance semantics) */ -static int devpts_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *devpts_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); + return mount_single(fs_type, flags, data, devpts_fill_super); } #endif @@ -421,7 +419,7 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", - .get_sb = devpts_get_sb, + .mount = devpts_mount, .kill_sb = devpts_kill_sb, }; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 40186b9..413a3c4 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -377,6 +377,7 @@ struct ecryptfs_mount_crypt_stat { #define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 +#define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY 0x00000080 u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 3fbc942..9d1a22d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -32,6 +32,7 @@ #include <linux/crypto.h> #include <linux/fs_stack.h> #include <linux/slab.h> +#include <linux/xattr.h> #include <asm/unaligned.h> #include "ecryptfs_kernel.h" @@ -70,15 +71,19 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode, struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); struct dentry *dentry_save; struct vfsmount *vfsmount_save; + unsigned int flags_save; int rc; dentry_save = nd->path.dentry; vfsmount_save = nd->path.mnt; + flags_save = nd->flags; nd->path.dentry = lower_dentry; nd->path.mnt = lower_mnt; + nd->flags &= ~LOOKUP_OPEN; rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); nd->path.dentry = dentry_save; nd->path.mnt = vfsmount_save; + nd->flags = flags_save; return rc; } @@ -1108,10 +1113,8 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->setxattr(lower_dentry, name, value, - size, flags); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + + rc = vfs_setxattr(lower_dentry, name, value, size, flags); out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 73811cf..b1f6858 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -446,6 +446,7 @@ out: */ static int ecryptfs_find_auth_tok_for_sig( + struct key **auth_tok_key, struct ecryptfs_auth_tok **auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) @@ -453,12 +454,21 @@ ecryptfs_find_auth_tok_for_sig( struct ecryptfs_global_auth_tok *global_auth_tok; int rc = 0; + (*auth_tok_key) = NULL; (*auth_tok) = NULL; if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, mount_crypt_stat, sig)) { - struct key *auth_tok_key; - rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, + /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the + * mount_crypt_stat structure, we prevent to use auth toks that + * are not inserted through the ecryptfs_add_global_auth_tok + * function. + */ + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + return -EINVAL; + + rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, sig); } else (*auth_tok) = global_auth_tok->global_auth_tok; @@ -509,6 +519,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, char *filename, size_t filename_size) { struct ecryptfs_write_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; int rc = 0; s = kmalloc(sizeof(*s), GFP_KERNEL); @@ -606,6 +617,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, } dest[s->i++] = s->cipher_code; rc = ecryptfs_find_auth_tok_for_sig( + &auth_tok_key, &s->auth_tok, mount_crypt_stat, mount_crypt_stat->global_default_fnek_sig); if (rc) { @@ -753,6 +765,8 @@ out_free_unlock: out_unlock: mutex_unlock(s->tfm_mutex); out: + if (auth_tok_key) + key_put(auth_tok_key); kfree(s); return rc; } @@ -798,6 +812,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, char *data, size_t max_packet_size) { struct ecryptfs_parse_tag_70_packet_silly_stack *s; + struct key *auth_tok_key = NULL; int rc = 0; (*packet_size) = 0; @@ -910,7 +925,8 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, * >= ECRYPTFS_MAX_IV_BYTES. */ memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); s->desc.info = s->iv; - rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &s->auth_tok, mount_crypt_stat, s->fnek_sig_hex); if (rc) { printk(KERN_ERR "%s: Error attempting to find auth tok for " @@ -986,6 +1002,8 @@ out: (*filename_size) = 0; (*filename) = NULL; } + if (auth_tok_key) + key_put(auth_tok_key); kfree(s); return rc; } @@ -1557,14 +1575,19 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR); rc = -EINVAL; - goto out; + goto out_release_key; } if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) { printk(KERN_ERR "Invalid auth_tok structure " "returned from key query\n"); rc = -EINVAL; - goto out; + goto out_release_key; + } +out_release_key: + if (rc) { + key_put(*auth_tok_key); + (*auth_tok_key) = NULL; } out: return rc; @@ -1688,6 +1711,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, struct ecryptfs_auth_tok_list_item *auth_tok_list_item; size_t tag_11_contents_size; size_t tag_11_packet_size; + struct key *auth_tok_key = NULL; int rc = 0; INIT_LIST_HEAD(&auth_tok_list); @@ -1784,6 +1808,10 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, * just one will be sufficient to decrypt to get the FEK. */ find_next_matching_auth_tok: found_auth_tok = 0; + if (auth_tok_key) { + key_put(auth_tok_key); + auth_tok_key = NULL; + } list_for_each_entry(auth_tok_list_item, &auth_tok_list, list) { candidate_auth_tok = &auth_tok_list_item->auth_tok; if (unlikely(ecryptfs_verbosity > 0)) { @@ -1800,10 +1828,11 @@ find_next_matching_auth_tok: rc = -EINVAL; goto out_wipe_list; } - ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, + rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key, + &matching_auth_tok, crypt_stat->mount_crypt_stat, candidate_auth_tok_sig); - if (matching_auth_tok) { + if (!rc) { found_auth_tok = 1; goto found_matching_auth_tok; } @@ -1866,6 +1895,8 @@ found_matching_auth_tok: out_wipe_list: wipe_auth_tok_list(&auth_tok_list); out: + if (auth_tok_key) + key_put(auth_tok_key); return rc; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index cbd4e18..a9dbd62 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -208,7 +208,8 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, - ecryptfs_opt_unlink_sigs, ecryptfs_opt_err }; + ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, + ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -223,6 +224,7 @@ static const match_table_t tokens = { {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, + {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, {ecryptfs_opt_err, NULL} }; @@ -406,6 +408,10 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options) case ecryptfs_opt_unlink_sigs: mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; break; + case ecryptfs_opt_mount_auth_tok_only: + mount_crypt_stat->flags |= + ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; + break; case ecryptfs_opt_err: default: printk(KERN_WARNING @@ -540,9 +546,8 @@ out: * ecryptfs_interpose to perform most of the linking * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ -static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct super_block *s; struct ecryptfs_sb_info *sbi; @@ -607,8 +612,7 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, err = "Reading sb failed"; goto out; } - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); out: if (sbi) { @@ -616,7 +620,7 @@ out: kmem_cache_free(ecryptfs_sb_info_cache, sbi); } printk(KERN_ERR "%s; rc = [%d]\n", err, rc); - return rc; + return ERR_PTR(rc); } /** @@ -639,7 +643,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb) static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", - .get_sb = ecryptfs_get_sb, + .mount = ecryptfs_mount, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index f7fc286..2720178 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -28,7 +28,6 @@ #include <linux/key.h> #include <linux/slab.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include <linux/file.h> #include <linux/crypto.h> #include "ecryptfs_kernel.h" @@ -180,6 +179,8 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",ecryptfs_encrypted_view"); if (mount_crypt_stat->flags & ECRYPTFS_UNLINK_SIGS) seq_printf(m, ",ecryptfs_unlink_sigs"); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY) + seq_printf(m, ",ecryptfs_mount_auth_tok_only"); return 0; } diff --git a/fs/efs/super.c b/fs/efs/super.c index f049428..5073a07 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -20,16 +20,16 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, void *d, int silent); -static int efs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *efs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); } static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", - .get_sb = efs_get_sb, + .mount = efs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -164,7 +164,26 @@ out: #ifdef CONFIG_MMU -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; struct rlimit *rlim; + acct_arg_size(bprm, size / PAGE_SIZE); + /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks @@ -276,7 +297,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -1003,6 +1028,7 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ + acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; @@ -1426,8 +1452,10 @@ int do_execve(const char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 047e92f..79c3ae6 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -659,19 +659,19 @@ free_bdi: /* * Set up the superblock (calls exofs_fill_super eventually) */ -static int exofs_get_sb(struct file_system_type *type, +static struct dentry *exofs_mount(struct file_system_type *type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { struct exofs_mountopt opts; int ret; ret = parse_options(data, &opts); if (ret) - return ret; + return ERR_PTR(ret); opts.dev_name = dev_name; - return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt); + return mount_nodev(type, flags, &opts, exofs_fill_super); } /* @@ -809,7 +809,7 @@ static const struct export_operations exofs_export_ops = { static struct file_system_type exofs_type = { .owner = THIS_MODULE, .name = "exofs", - .get_sb = exofs_get_sb, + .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0901320..d89e0b6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1356,10 +1356,10 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) return 0; } -static int ext2_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); } #ifdef CONFIG_QUOTA @@ -1473,7 +1473,7 @@ out: static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .get_sb = ext2_get_sb, + .mount = ext2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index db87413..acf8695 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,7 +27,6 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> @@ -3020,16 +3019,16 @@ out: #endif -static int ext3_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext3_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext3_fill_super); } static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", - .get_sb = ext3_get_sb, + .mount = ext3_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b5dd63..6a5edea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,7 +177,7 @@ struct mpage_da_data { struct ext4_io_page { struct page *p_page; - int p_count; + atomic_t p_count; }; #define MAX_IO_PAGES 128 @@ -858,6 +858,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* * Transactions that contain inode's metadata needed to complete @@ -2060,6 +2061,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern int ext4_end_io_nolock(ext4_io_end_t *io); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1916164..bdbe699 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -5410,9 +5412,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, * will return the blocks that include the delayed allocation * blocks for this file. */ - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; return 0; @@ -5649,6 +5649,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae88..eb3bc2f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,6 +331,30 @@ mext_out: return err; } + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c58eba3..5b4d4e3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4640,8 +4640,6 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 46a7d6a..beacce1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,8 +32,14 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + int __init ext4_init_pageio(void) { + int i; + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -42,6 +48,8 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); return 0; } @@ -52,24 +60,37 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + void ext4_free_io_end(ext4_io_end_t *io) { int i; + wait_queue_head_t *wq; BUG_ON(!io); if (io->page) put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) { - if (--io->pages[i]->p_count == 0) { - struct page *page = io->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io->pages[i]); - } - } + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); io->num_io_pages = 0; - iput(io->inode); + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); kmem_cache_free(io_end_cachep, io); } @@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) io = kmem_cache_alloc(io_end_cachep, flags); if (io) { memset(io, 0, sizeof(*io)); - io->inode = igrab(inode); - BUG_ON(!io->inode); + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error) struct workqueue_struct *wq; struct inode *inode; unsigned long flags; - ext4_fsblk_t err_block; int i; BUG_ON(!io_end); - inode = io_end->inode; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - err_block = bio->bi_sector >> (inode->i_blkbits - 9); bio_put(bio); - if (!(inode->i_sb->s_flags & MS_ACTIVE)) { - pr_err("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - return; - } - - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " - "(offset %llu size %ld starting block %llu)", - inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, - (unsigned long long) err_block); - } - for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; @@ -236,14 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - if (--io_end->pages[i]->p_count == 0) { - struct page *page = io_end->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io_end->pages[i]); - } - /* * If this is a partial write which happened to make * all buffers uptodate then we can optimize away a @@ -253,9 +246,22 @@ static void ext4_end_bio(struct bio *bio, int error) */ if (!partial_write) SetPageUptodate(page); - } + put_io_page(io_end->pages[i]); + } io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io, bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - io_end->inode = inode; io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; @@ -360,7 +365,7 @@ submit_and_retry: if ((io_end->num_io_pages == 0) || (io_end->pages[io_end->num_io_pages-1] != io_page)) { io_end->pages[io_end->num_io_pages++] = io_page; - io_page->p_count++; + atomic_inc(&io_page->p_count); } return 0; } @@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, return -ENOMEM; } io_page->p_page = page; - io_page->p_count = 0; + atomic_set(&io_page->p_count, 1); get_page(page); for (bh = head = page_buffers(page), block_start = 0; @@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ - if (io_page->p_count == 0) { - put_page(page); - end_page_writeback(page); - kmem_cache_free(io_page_cachep, io_page); - } + put_io_page(io_page); return ret; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0348ce0..e32195d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -73,8 +73,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); static void ext4_write_super(struct super_block *sb); static int ext4_freeze(struct super_block *sb); -static int ext4_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt); +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); @@ -82,7 +82,7 @@ static void ext4_unregister_li_request(struct super_block *sb); static struct file_system_type ext3_fs_type = { .owner = THIS_MODULE, .name = "ext3", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1173,6 +1183,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1186,7 +1197,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, #endif .bdev_try_to_free_page = bdev_try_to_free_page, - .trim_fs = ext4_trim_fs }; static const struct super_operations ext4_nojournal_sops = { @@ -1194,6 +1204,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, @@ -2699,7 +2710,6 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_li_request *elr; unsigned long next_wakeup; DEFINE_WAIT(wait); - int ret; BUG_ON(NULL == eli); @@ -2723,13 +2733,12 @@ cont_thread: elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) - ret = ext4_run_li_request(elr); - - if (ret) { - ret = 0; - ext4_remove_li_request(elr); - continue; + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } } if (time_before(elr->lr_next_sched, next_wakeup)) @@ -2740,7 +2749,8 @@ cont_thread: if (freezing(current)) refrigerator(); - if (time_after_eq(jiffies, next_wakeup)) { + if ((time_after_eq(jiffies, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } @@ -2788,9 +2798,6 @@ static void ext4_clear_request_list(void) struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); - if (list_empty(&ext4_li_info->li_request_list)) - return; - list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); @@ -3257,13 +3264,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - ret = generic_check_addressable(sb->s_blocksize_bits, + err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); - if (ret) { + if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -3348,6 +3356,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -3446,22 +3472,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3611,10 +3634,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3622,6 +3641,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3949,13 +3972,11 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -4556,12 +4577,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } @@ -4667,17 +4686,17 @@ out: #endif -static int ext4_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); + return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super); } #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -4722,7 +4741,7 @@ static inline void unregister_as_ext3(void) { } static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", - .get_sb = ext4_get_sb, + .mount = ext4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index bbca5c1..3345aab 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -675,18 +675,17 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int msdos_get_sb(struct file_system_type *fs_type, +static struct dentry *msdos_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, msdos_fill_super); } static struct file_system_type msdos_fs_type = { .owner = THIS_MODULE, .name = "msdos", - .get_sb = msdos_get_sb, + .mount = msdos_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6f0f6c9..b936703 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1071,18 +1071,17 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int vfat_get_sb(struct file_system_type *fs_type, +static struct dentry *vfat_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, vfat_fill_super); } static struct file_system_type vfat_fs_type = { .owner = THIS_MODULE, .name = "vfat", - .get_sb = vfat_get_sb, + .mount = vfat_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 71b0148..9d1c995 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -246,17 +246,16 @@ out: /* * The usual module blurb. */ -static int vxfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *vxfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); } static struct file_system_type vxfs_fs_type = { .owner = THIS_MODULE, .name = "vxfs", - .get_sb = vxfs_get_sb, + .mount = vxfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aed881a..3d06ccc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -707,6 +707,17 @@ get_next_work_item(struct backing_dev_info *bdi) return work; } +/* + * Add in the number of potentially dirty inodes, because each inode + * write can dirty pagecache in the underlying blockdev. + */ +static unsigned long get_nr_dirty_pages(void) +{ + return global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + get_nr_dirty_inodes(); +} + static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; @@ -724,13 +735,7 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) return 0; wb->last_old_flush = jiffies; - /* - * Add in the number of potentially dirty inodes, because each inode - * write can dirty pagecache in the underlying blockdev. - */ - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - get_nr_dirty_inodes(); + nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { @@ -1076,32 +1081,42 @@ static void wait_sb_inodes(struct super_block *sb) } /** - * writeback_inodes_sb - writeback dirty inodes from given super_block + * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock + * @nr: the number of pages to write * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait - * for IO completion of submitted IO. The number of pages submitted is - * returned. + * for IO completion of submitted IO. */ -void writeback_inodes_sb(struct super_block *sb) +void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); DECLARE_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .done = &done, + .nr_pages = nr, }; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - - work.nr_pages = nr_dirty + nr_unstable + get_nr_dirty_inodes(); - bdi_queue_work(sb->s_bdi, &work); wait_for_completion(&done); } +EXPORT_SYMBOL(writeback_inodes_sb_nr); + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. + */ +void writeback_inodes_sb(struct super_block *sb) +{ + return writeback_inodes_sb_nr(sb, get_nr_dirty_pages()); +} EXPORT_SYMBOL(writeback_inodes_sb); /** @@ -1124,6 +1139,27 @@ int writeback_inodes_sb_if_idle(struct super_block *sb) EXPORT_SYMBOL(writeback_inodes_sb_if_idle); /** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * @nr: the number of pages to write + * + * Invoke writeback_inodes_sb if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_nr_if_idle(struct super_block *sb, + unsigned long nr) +{ + if (!writeback_in_progress(sb->s_bdi)) { + down_read(&sb->s_umount); + writeback_inodes_sb_nr(sb, nr); + up_read(&sb->s_umount); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_nr_if_idle); + +/** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4eba076..85542a7 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -322,12 +322,10 @@ static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static int fuse_ctl_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data) { - return get_sb_single(fs_type, flags, raw_data, - fuse_ctl_fill_super, mnt); + return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super); } static void fuse_ctl_kill_sb(struct super_block *sb) @@ -346,7 +344,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb) static struct file_system_type fuse_ctl_fs_type = { .owner = THIS_MODULE, .name = "fusectl", - .get_sb = fuse_ctl_get_sb, + .mount = fuse_ctl_mount, .kill_sb = fuse_ctl_kill_sb, }; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c822458..9242d29 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -134,6 +134,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open); void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); if (ff->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; @@ -141,6 +142,15 @@ void fuse_finish_open(struct inode *inode, struct file *file) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index da9e6e1..cfce3ad 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1041,11 +1041,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static int fuse_get_sb(struct file_system_type *fs_type, +static struct dentry *fuse_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *raw_data, struct vfsmount *mnt) + void *raw_data) { - return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); + return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); } static void fuse_kill_sb_anon(struct super_block *sb) @@ -1065,17 +1065,16 @@ static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE, - .get_sb = fuse_get_sb, + .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; #ifdef CONFIG_BLOCK -static int fuse_get_sb_blk(struct file_system_type *fs_type, +static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, int flags, const char *dev_name, - void *raw_data, struct vfsmount *mnt) + void *raw_data) { - return get_sb_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); } static void fuse_kill_sb_blk(struct super_block *sb) @@ -1094,7 +1093,7 @@ static void fuse_kill_sb_blk(struct super_block *sb) static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .get_sb = fuse_get_sb_blk, + .mount = fuse_mount_blk, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 06d5827..5ab3839 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; - int error; inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { @@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, goto out_inode; } - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); - if (error) - goto fail; - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); out_inode: dentry = d_obtain_alias(inode); if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8777885..f92c177 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = NULL; + struct gfs2_inode *ip; struct inode *inode; - u64 no_addr = 0; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - spin_lock(&gl->gl_spin); - ip = (struct gfs2_inode *)gl->gl_object; if (ip) - no_addr = ip->i_no_addr; - spin_unlock(&gl->gl_spin); - if (ip) { inode = gfs2_ilookup(sdp->sd_vfs, no_addr); - if (inode) { - d_prune_aliases(inode); - iput(inode); - } + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); } gfs2_glock_put(gl); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 06370f8..e1213f7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); } -struct gfs2_skip_data { - u64 no_addr; - int skipped; -}; - -static int iget_skip_test(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (ip->i_no_addr == data->no_addr) { - if (inode->i_state & (I_FREEING|I_WILL_FREE)){ - data->skipped = 1; - return 0; - } - return 1; - } - return 0; -} - -static int iget_skip_set(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (data->skipped) - return 1; - inode->i_ino = (unsigned long)(data->no_addr); - ip->i_no_addr = data->no_addr; - return 0; -} - -static struct inode *gfs2_iget_skip(struct super_block *sb, - u64 no_addr) -{ - struct gfs2_skip_data data; - unsigned long hash = (unsigned long)no_addr; - - data.no_addr = no_addr; - data.skipped = 0; - return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data); -} - /** * GFS2 lookup code fills in vfs inode contents based on info obtained * from directory entry inside gfs2_inode_lookup(). This has caused issues @@ -243,93 +200,54 @@ fail: return ERR_PTR(error); } -/** - * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation - * and try to reclaim it by doing iput. - * - * This function assumes no rgrp locks are currently held. - * - * @sb: The super block - * no_addr: The inode number - * - */ - -void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) { - struct gfs2_sbd *sdp; - struct gfs2_inode *ip; - struct gfs2_glock *io_gl = NULL; - int error; - struct gfs2_holder gh; + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; struct inode *inode; + int error; - inode = gfs2_iget_skip(sb, no_addr); - - if (!inode) - return; - - /* If it's not a new inode, someone's using it, so leave it alone. */ - if (!(inode->i_state & I_NEW)) { - iput(inode); - return; - } - - ip = GFS2_I(inode); - sdp = GFS2_SB(inode); - ip->i_no_formal_ino = -1; + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return ERR_PTR(error); - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); - if (unlikely(error)) + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) goto fail; - ip->i_gl->gl_object = ip; - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail_put; - - set_bit(GIF_INVALID, &ip->i_flags); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, - &ip->i_iopen_gh); - if (unlikely(error)) - goto fail_iopen; + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + if (IS_ERR(inode)) + goto fail; - ip->i_iopen_gh.gh_gl->gl_object = ip; - gfs2_glock_put(io_gl); - io_gl = NULL; + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_iput; - inode->i_mode = DT2IF(DT_UNKNOWN); + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). - */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, - &gh); - if (unlikely(error)) - goto fail_glock; + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); - gfs2_set_iop(inode); + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; - /* The iput will cause it to be deleted. */ - iput(inode); - return; + error = 0; + } -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); -fail_iopen: - if (io_gl) - gfs2_glock_put(io_gl); -fail_put: - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); fail: - iget_failed(inode); - return; + gfs2_glock_dq_uninit(&i_gh); + return error ? ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6720d7d..d8499fa 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,9 @@ err: extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); -extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index cade1ac..3eb1393 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1250,12 +1250,11 @@ static int test_gfs2_super(struct super_block *s, void *ptr) } /** - * gfs2_get_sb - Get the GFS2 superblock + * gfs2_mount - Get the GFS2 superblock * @fs_type: The GFS2 filesystem type * @flags: Mount flags * @dev_name: The name of the device * @data: The mount arguments - * @mnt: The vfsmnt for this mount * * Q. Why not use get_sb_bdev() ? * A. We need to select one of two root directories to mount, independent @@ -1264,8 +1263,8 @@ static int test_gfs2_super(struct super_block *s, void *ptr) * Returns: 0 or -ve on error */ -static int gfs2_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct block_device *bdev; struct super_block *s; @@ -1279,7 +1278,7 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(bdev)) - return PTR_ERR(bdev); + return ERR_CAST(bdev); /* * once the super is inserted into the list by sget, s_umount @@ -1298,6 +1297,9 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; + if (s->s_root) + close_bdev_exclusive(bdev, mode); + memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; args.ar_data = GFS2_DATA_DEFAULT; @@ -1309,17 +1311,13 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, error = gfs2_mount_args(&args, data); if (error) { printk(KERN_WARNING "GFS2: can't parse mount arguments\n"); - if (s->s_root) - goto error_super; - deactivate_locked_super(s); - return error; + goto error_super; } if (s->s_root) { error = -EBUSY; if ((flags ^ s->s_flags) & MS_RDONLY) goto error_super; - close_bdev_exclusive(bdev, mode); } else { char b[BDEVNAME_SIZE]; @@ -1328,27 +1326,24 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags, strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); - if (error) { - deactivate_locked_super(s); - return error; - } + if (error) + goto error_super; s->s_flags |= MS_ACTIVE; bdev->bd_super = s; } sdp = s->s_fs_info; - mnt->mnt_sb = s; if (args.ar_meta) - mnt->mnt_root = dget(sdp->sd_master_dir); + return dget(sdp->sd_master_dir); else - mnt->mnt_root = dget(sdp->sd_root_dir); - return 0; + return dget(sdp->sd_root_dir); error_super: deactivate_locked_super(s); + return ERR_PTR(error); error_bdev: close_bdev_exclusive(bdev, mode); - return error; + return ERR_PTR(error); } static int set_meta_super(struct super_block *s, void *ptr) @@ -1356,8 +1351,8 @@ static int set_meta_super(struct super_block *s, void *ptr) return -EINVAL; } -static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct super_block *s; struct gfs2_sbd *sdp; @@ -1368,23 +1363,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, if (error) { printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", dev_name, error); - return error; + return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, path.dentry->d_inode->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - return PTR_ERR(s); + return ERR_CAST(s); } if ((flags ^ s->s_flags) & MS_RDONLY) { deactivate_locked_super(s); - return -EBUSY; + return ERR_PTR(-EBUSY); } sdp = s->s_fs_info; - mnt->mnt_sb = s; - mnt->mnt_root = dget(sdp->sd_master_dir); - return 0; + return dget(sdp->sd_master_dir); } static void gfs2_kill_sb(struct super_block *sb) @@ -1410,7 +1403,7 @@ static void gfs2_kill_sb(struct super_block *sb) struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .get_sb = gfs2_get_sb, + .mount = gfs2_mount, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; @@ -1418,7 +1411,7 @@ struct file_system_type gfs2_fs_type = { struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .get_sb = gfs2_get_sb_meta, + .mount = gfs2_mount_meta, .owner = THIS_MODULE, }; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58a9b99..f606baf 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct fs_disk_quota *fdq) { struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); @@ -658,11 +659,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qd->qd_qb.qb_value = qp->qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_warn = qp->qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } } @@ -1497,9 +1498,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); - fdq->d_bcount = be64_to_cpu(qlvb->qb_value); + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1566,10 +1567,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, /* If nothing has changed, this is a no-op */ if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; if ((fdq->d_fieldmask & FS_DQ_BHARD) && - (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; if (fdq->d_fieldmask == 0) goto out_i; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index bef3ab6..33c8407 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * The inode, if one has been found, in inode. */ -static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) { u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int n; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; - for(;;) { - if (goal >= rgd->rd_data) - break; + while (goal < rgd->rd_data) { down_write(&sdp->sd_log_flush_lock); n = 1; block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, @@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - return no_addr; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. + */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > 2*NR_CPUS) + return; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return 0; + return; } /** @@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno - * unlinked: the block address of an unlinked block to be reclaimed */ -static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, - u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; @@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, int loops = 0; int error, rg_locked; - *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - /* If the rg came in already locked, there's no - way we can recover from a failed try_rgrp_unlink - because that would require an iput which can only - happen after the rgrp is unlocked. */ - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); @@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; break; case GLR_TRYFAILED: @@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; int error = 0; - u64 last_unlinked = NO_BLOCK, unlinked; + u64 last_unlinked = NO_BLOCK; + int tries = 0; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; -try_again: if (hold_rindex) { /* We need to hold the rindex unless the inode we're using is the rindex itself, in which case it's already held. */ @@ -1218,31 +1227,23 @@ try_again: else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ error = gfs2_ri_update_special(ip); + if (error) + return error; } - if (error) - return error; + do { + error = get_local_rgrp(ip, &last_unlinked); + /* If there is no space, flushing the log may release some */ + if (error) + gfs2_log_flush(sdp, NULL); + } while (error && tries++ < 3); - /* Find an rgrp suitable for allocation. If it encounters any unlinked - dinodes along the way, error will equal -EAGAIN and unlinked will - contains it block address. We then need to look up that inode and - try to free it, and try the allocation again. */ - error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (error != -EAGAIN) - return error; - - gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); - /* regardless of whether or not gfs2_process_unlinked_inode - was successful, we don't want to repeat it again. */ - last_unlinked = unlinked; - gfs2_log_flush(sdp, NULL); - error = 0; - - goto try_again; + return error; } + /* no error, so we have the rgrp set in the inode's allocation. */ al->al_file = file; al->al_line = line; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 6ee1586..4824c27 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -441,17 +441,16 @@ bail: return res; } -static int hfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *hfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super); } static struct file_system_type hfs_fs_type = { .owner = THIS_MODULE, .name = "hfs", - .get_sb = hfs_get_sb, + .mount = hfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9a88d75..52cc746 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -495,18 +495,16 @@ static void hfsplus_destroy_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static int hfsplus_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *hfsplus_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); } static struct file_system_type hfsplus_fs_type = { .owner = THIS_MODULE, .name = "hfsplus", - .get_sb = hfsplus_get_sb, + .mount = hfsplus_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cd7c939..2c0f148 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -962,11 +962,11 @@ out: return err; } -static int hostfs_read_sb(struct file_system_type *type, +static struct dentry *hostfs_read_sb(struct file_system_type *type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); + return mount_nodev(type, flags, data, hostfs_fill_sb_common); } static void hostfs_kill_sb(struct super_block *s) @@ -978,7 +978,7 @@ static void hostfs_kill_sb(struct super_block *s) static struct file_system_type hostfs_type = { .owner = THIS_MODULE, .name = "hostfs", - .get_sb = hostfs_read_sb, + .mount = hostfs_read_sb, .kill_sb = hostfs_kill_sb, .fs_flags = 0, }; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index eac5f96..793cb9d 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -14,7 +14,7 @@ void hpfs_lock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("lock creation\n"); #endif - down(&hpfs_sb(s)->hpfs_creation_de); + mutex_lock(&hpfs_sb(s)->hpfs_creation_de); } void hpfs_unlock_creation(struct super_block *s) @@ -22,7 +22,7 @@ void hpfs_unlock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("unlock creation\n"); #endif - up(&hpfs_sb(s)->hpfs_creation_de); + mutex_unlock(&hpfs_sb(s)->hpfs_creation_de); } /* Map a sector into a buffer and return pointers to it and to the buffer. */ diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b59eac0..2fee17d 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -87,7 +87,7 @@ struct hpfs_sb_info { unsigned *sb_bmp_dir; /* main bitmap directory */ unsigned sb_c_bitmap; /* current bitmap */ unsigned sb_max_fwd_alloc; /* max forwad allocation */ - struct semaphore hpfs_creation_de; /* when creating dirents, nobody else + struct mutex hpfs_creation_de; /* when creating dirents, nobody else can alloc blocks */ /*unsigned sb_mounting : 1;*/ int sb_timeshift; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index c969a1a..6c5f015 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -491,7 +491,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) sbi->sb_bmp_dir = NULL; sbi->sb_cp_table = NULL; - init_MUTEX(&sbi->hpfs_creation_de); + mutex_init(&sbi->hpfs_creation_de); uid = current_uid(); gid = current_gid(); @@ -686,17 +686,16 @@ bail0: return -EINVAL; } -static int hpfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *hpfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); } static struct file_system_type hpfs_fs_type = { .owner = THIS_MODULE, .name = "hpfs", - .get_sb = hpfs_get_sb, + .mount = hpfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 4e2a45e..f702b5f 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -748,17 +748,17 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) return(err); } -static int hppfs_read_super(struct file_system_type *type, +static struct dentry *hppfs_read_super(struct file_system_type *type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); + return mount_nodev(type, flags, data, hppfs_fill_super); } static struct file_system_type hppfs_type = { .owner = THIS_MODULE, .name = "hppfs", - .get_sb = hppfs_read_super, + .mount = hppfs_read_super, .kill_sb = kill_anon_super, .fs_flags = 0, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b14be3f..a5fe681 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -896,15 +896,15 @@ void hugetlb_put_quota(struct address_space *mapping, long delta) } } -static int hugetlbfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *hugetlbfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); + return mount_nodev(fs_type, flags, data, hugetlbfs_fill_super); } static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", - .get_sb = hugetlbfs_get_sb, + .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; @@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { - WARN_ONCE(1, - "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); } else { *user = NULL; return ERR_PTR(-EPERM); diff --git a/fs/internal.h b/fs/internal.h index ebad3b9..e43b9a4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,5 +106,5 @@ extern void release_open_intent(struct nameidata *); * inode.c */ extern int get_nr_dirty_inodes(void); -extern int evict_inodes(struct super_block *); +extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *); @@ -6,7 +6,6 @@ #include <linux/syscalls.h> #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fs.h> @@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static int ioctl_fstrim(struct file *filp, void __user *argp) -{ - struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* If filesystem doesn't support trim feature, return. */ - if (sb->s_op->trim_fs == NULL) - return -EOPNOTSUPP; - - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - - if (argp == NULL) { - range.start = 0; - range.len = ULLONG_MAX; - range.minlen = 0; - } else if (copy_from_user(&range, argp, sizeof(range))) - return -EFAULT; - - ret = sb->s_op->trim_fs(sb, &range); - if (ret < 0) - return ret; - - if ((argp != NULL) && - (copy_to_user(argp, &range, sizeof(range)))) - return -EFAULT; - - return 0; -} - /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; - case FITRIM: - error = ioctl_fstrim(filp, argp); - break; - case FS_IOC_FIEMAP: return ioctl_fiemap(filp, arg); diff --git a/fs/ioprio.c b/fs/ioprio.c index 748cfb9..7da2a06 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) } ret = -ESRCH; - /* - * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic", - * so we can't use rcu_read_lock(). See re-copy of ->ioprio - * in copy_process(). - */ - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) @@ -153,7 +148,7 @@ free_uid: ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) int ret = -ESRCH; int tmpio; - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) @@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 79cf7f61..bfdeb82 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1507,17 +1507,16 @@ struct inode *isofs_iget(struct super_block *sb, return inode; } -static int isofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *isofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, isofs_fill_super); } static struct file_system_type iso9660_fs_type = { .owner = THIS_MODULE, .name = "iso9660", - .get_sb = isofs_get_sb, + .mount = isofs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 538417c..f837ba9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; @@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, __func__); goto out_err; } - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; bh = __getblk(journal->j_dev, start, journal->j_blocksize); if (!bh) { @@ -1838,7 +1838,6 @@ size_t journal_tag_bytes(journal_t *journal) */ #define JBD2_MAX_SLABS 8 static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS]; -static DECLARE_MUTEX(jbd2_slab_create_sem); static const char *jbd2_slab_names[JBD2_MAX_SLABS] = { "jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k", @@ -1859,6 +1858,7 @@ static void jbd2_journal_destroy_slabs(void) static int jbd2_journal_create_slab(size_t size) { + static DEFINE_MUTEX(jbd2_slab_create_mutex); int i = order_base_2(size) - 10; size_t slab_size; @@ -1870,16 +1870,16 @@ static int jbd2_journal_create_slab(size_t size) if (unlikely(i < 0)) i = 0; - down(&jbd2_slab_create_sem); + mutex_lock(&jbd2_slab_create_mutex); if (jbd2_slab[i]) { - up(&jbd2_slab_create_sem); + mutex_unlock(&jbd2_slab_create_mutex); return 0; /* Already created */ } slab_size = 1 << (i+10); jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size, slab_size, 0, NULL); - up(&jbd2_slab_create_sem); + mutex_unlock(&jbd2_slab_create_mutex); if (!jbd2_slab[i]) { printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n"); return -ENOMEM; diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a906f53..85c6be2 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -23,7 +23,7 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, static inline struct jffs2_inode_cache * first_inode_chain(int *i, struct jffs2_sb_info *c) { - for (; *i < INOCACHE_HASHSIZE; (*i)++) { + for (; *i < c->inocache_hashsize; (*i)++) { if (c->inocache_list[*i]) return c->inocache_list[*i]; } diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c index 617a1e5..de42470 100644 --- a/fs/jffs2/compr.c +++ b/fs/jffs2/compr.c @@ -103,7 +103,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, spin_unlock(&jffs2_compressor_list_lock); *datalen = orig_slen; *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, output_buf, datalen, cdatalen, NULL); + compr_ret = this->compress(data_in, output_buf, datalen, cdatalen); spin_lock(&jffs2_compressor_list_lock); this->usecount--; if (!compr_ret) { @@ -152,7 +152,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, spin_unlock(&jffs2_compressor_list_lock); *datalen = orig_slen; *cdatalen = orig_dlen; - compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen, NULL); + compr_ret = this->compress(data_in, this->compr_buf, datalen, cdatalen); spin_lock(&jffs2_compressor_list_lock); this->usecount--; if (!compr_ret) { @@ -220,7 +220,7 @@ int jffs2_decompress(struct jffs2_sb_info *c, struct jffs2_inode_info *f, if (comprtype == this->compr) { this->usecount++; spin_unlock(&jffs2_compressor_list_lock); - ret = this->decompress(cdata_in, data_out, cdatalen, datalen, NULL); + ret = this->decompress(cdata_in, data_out, cdatalen, datalen); spin_lock(&jffs2_compressor_list_lock); if (ret) { printk(KERN_WARNING "Decompressor \"%s\" returned %d\n", this->name, ret); diff --git a/fs/jffs2/compr.h b/fs/jffs2/compr.h index e471a91..13bb759 100644 --- a/fs/jffs2/compr.h +++ b/fs/jffs2/compr.h @@ -49,9 +49,9 @@ struct jffs2_compressor { char *name; char compr; /* JFFS2_COMPR_XXX */ int (*compress)(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *srclen, uint32_t *destlen, void *model); + uint32_t *srclen, uint32_t *destlen); int (*decompress)(unsigned char *cdata_in, unsigned char *data_out, - uint32_t cdatalen, uint32_t datalen, void *model); + uint32_t cdatalen, uint32_t datalen); int usecount; int disabled; /* if set the compressor won't compress */ unsigned char *compr_buf; /* used by size compr. mode */ diff --git a/fs/jffs2/compr_lzo.c b/fs/jffs2/compr_lzo.c index ed25ae7c..af186ee 100644 --- a/fs/jffs2/compr_lzo.c +++ b/fs/jffs2/compr_lzo.c @@ -42,7 +42,7 @@ static int __init alloc_workspace(void) } static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, void *model) + uint32_t *sourcelen, uint32_t *dstlen) { size_t compress_size; int ret; @@ -67,7 +67,7 @@ static int jffs2_lzo_compress(unsigned char *data_in, unsigned char *cpage_out, } static int jffs2_lzo_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t srclen, uint32_t destlen, void *model) + uint32_t srclen, uint32_t destlen) { size_t dl = destlen; int ret; diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c index 9696ad9..16a5047 100644 --- a/fs/jffs2/compr_rtime.c +++ b/fs/jffs2/compr_rtime.c @@ -31,8 +31,7 @@ /* _compress returns the compressed size, -1 if bigger */ static int jffs2_rtime_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, - void *model) + uint32_t *sourcelen, uint32_t *dstlen) { short positions[256]; int outpos = 0; @@ -73,8 +72,7 @@ static int jffs2_rtime_compress(unsigned char *data_in, static int jffs2_rtime_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t srclen, uint32_t destlen, - void *model) + uint32_t srclen, uint32_t destlen) { short positions[256]; int outpos = 0; diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index a12b4f7..9e7cec8 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -298,7 +298,7 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, #if 0 /* _compress returns the compressed size, -1 if bigger */ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, void *model) + uint32_t *sourcelen, uint32_t *dstlen) { return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); @@ -306,8 +306,7 @@ int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, #endif static int jffs2_dynrubin_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, - void *model) + uint32_t *sourcelen, uint32_t *dstlen) { int bits[8]; unsigned char histo[256]; @@ -387,8 +386,7 @@ static void rubin_do_decompress(int bit_divider, int *bits, static int jffs2_rubinmips_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t sourcelen, uint32_t dstlen, - void *model) + uint32_t sourcelen, uint32_t dstlen) { rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); @@ -397,8 +395,7 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, static int jffs2_dynrubin_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t sourcelen, uint32_t dstlen, - void *model) + uint32_t sourcelen, uint32_t dstlen) { int bits[8]; int c; diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 97fc45d..fd05a0b 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -68,8 +68,7 @@ static void free_workspaces(void) static int jffs2_zlib_compress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t *sourcelen, uint32_t *dstlen, - void *model) + uint32_t *sourcelen, uint32_t *dstlen) { int ret; @@ -136,8 +135,7 @@ static int jffs2_zlib_compress(unsigned char *data_in, static int jffs2_zlib_decompress(unsigned char *data_in, unsigned char *cpage_out, - uint32_t srclen, uint32_t destlen, - void *model) + uint32_t srclen, uint32_t destlen) { int ret; int wbits = MAX_WBITS; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 79121aa..9297865 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -367,7 +367,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } /* We use f->target field to store the target path. */ - f->target = kmalloc(targetlen + 1, GFP_KERNEL); + f->target = kmemdup(target, targetlen + 1, GFP_KERNEL); if (!f->target) { printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); mutex_unlock(&f->sem); @@ -376,7 +376,6 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char goto fail; } - memcpy(f->target, target, targetlen + 1); D1(printk(KERN_DEBUG "jffs2_symlink: symlink's target '%s' cached\n", (char *)f->target)); /* No data here. Only a metadata node, which will be diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index abac961..e513f19 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -151,7 +151,7 @@ int jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) } /* Be nice */ - yield(); + cond_resched(); mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index d9beb06..e896e67 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -474,6 +474,25 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i return inode; } +static int calculate_inocache_hashsize(uint32_t flash_size) +{ + /* + * Pick a inocache hash size based on the size of the medium. + * Count how many megabytes we're dealing with, apply a hashsize twice + * that size, but rounding down to the usual big powers of 2. And keep + * to sensible bounds. + */ + + int size_mb = flash_size / 1024 / 1024; + int hashsize = (size_mb * 2) & ~0x3f; + + if (hashsize < INOCACHE_HASHSIZE_MIN) + return INOCACHE_HASHSIZE_MIN; + if (hashsize > INOCACHE_HASHSIZE_MAX) + return INOCACHE_HASHSIZE_MAX; + + return hashsize; +} int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) { @@ -520,7 +539,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) if (ret) return ret; - c->inocache_list = kcalloc(INOCACHE_HASHSIZE, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); + c->inocache_hashsize = calculate_inocache_hashsize(c->flash_size); + c->inocache_list = kcalloc(c->inocache_hashsize, sizeof(struct jffs2_inode_cache *), GFP_KERNEL); if (!c->inocache_list) { ret = -ENOMEM; goto out_wbuf; diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 846a794..31dce61 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -219,13 +219,14 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) if (!list_empty(&c->erase_complete_list) || !list_empty(&c->erase_pending_list)) { spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() erasing pending blocks\n")); - if (jffs2_erase_pending_blocks(c, 1)) { - mutex_unlock(&c->alloc_sem); + if (jffs2_erase_pending_blocks(c, 1)) return 0; - } + D1(printk(KERN_DEBUG "No progress from erasing blocks; doing GC anyway\n")); spin_lock(&c->erase_completion_lock); + mutex_lock(&c->alloc_sem); } /* First, work out which block we're garbage-collecting */ diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 6784bc8..f864005 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -100,6 +100,7 @@ struct jffs2_sb_info { wait_queue_head_t erase_wait; /* For waiting for erases to complete */ wait_queue_head_t inocache_wq; + int inocache_hashsize; struct jffs2_inode_cache **inocache_list; spinlock_t inocache_lock; diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index af02bd1..5e03233 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -420,7 +420,7 @@ struct jffs2_inode_cache *jffs2_get_ino_cache(struct jffs2_sb_info *c, uint32_t { struct jffs2_inode_cache *ret; - ret = c->inocache_list[ino % INOCACHE_HASHSIZE]; + ret = c->inocache_list[ino % c->inocache_hashsize]; while (ret && ret->ino < ino) { ret = ret->next; } @@ -441,7 +441,7 @@ void jffs2_add_ino_cache (struct jffs2_sb_info *c, struct jffs2_inode_cache *new dbg_inocache("add %p (ino #%u)\n", new, new->ino); - prev = &c->inocache_list[new->ino % INOCACHE_HASHSIZE]; + prev = &c->inocache_list[new->ino % c->inocache_hashsize]; while ((*prev) && (*prev)->ino < new->ino) { prev = &(*prev)->next; @@ -462,7 +462,7 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) dbg_inocache("del %p (ino #%u)\n", old, old->ino); spin_lock(&c->inocache_lock); - prev = &c->inocache_list[old->ino % INOCACHE_HASHSIZE]; + prev = &c->inocache_list[old->ino % c->inocache_hashsize]; while ((*prev) && (*prev)->ino < old->ino) { prev = &(*prev)->next; @@ -487,7 +487,7 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c) int i; struct jffs2_inode_cache *this, *next; - for (i=0; i<INOCACHE_HASHSIZE; i++) { + for (i=0; i < c->inocache_hashsize; i++) { this = c->inocache_list[i]; while (this) { next = this->next; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 523a916..5a53d9b 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -199,7 +199,8 @@ struct jffs2_inode_cache { #define RAWNODE_CLASS_XATTR_DATUM 1 #define RAWNODE_CLASS_XATTR_REF 2 -#define INOCACHE_HASHSIZE 128 +#define INOCACHE_HASHSIZE_MIN 128 +#define INOCACHE_HASHSIZE_MAX 1024 #define write_ofs(c) ((c)->nextblock->offset + (c)->sector_size - (c)->nextblock->free_size) diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 46f870d..b632ddd 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -20,7 +20,7 @@ #include "summary.h" #include "debug.h" -#define DEFAULT_EMPTY_SCAN_SIZE 1024 +#define DEFAULT_EMPTY_SCAN_SIZE 256 #define noisy_printk(noise, args...) do { \ if (*(noise)) { \ @@ -435,7 +435,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo unsigned char *buf, uint32_t buf_size, struct jffs2_summary *s) { struct jffs2_unknown_node *node; struct jffs2_unknown_node crcnode; - uint32_t ofs, prevofs; + uint32_t ofs, prevofs, max_ofs; uint32_t hdr_crc, buf_ofs, buf_len; int err; int noise = 0; @@ -550,12 +550,12 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo /* We temporarily use 'ofs' as a pointer into the buffer/jeb */ ofs = 0; - - /* Scan only 4KiB of 0xFF before declaring it's empty */ - while(ofs < EMPTY_SCAN_SIZE(c->sector_size) && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) + max_ofs = EMPTY_SCAN_SIZE(c->sector_size); + /* Scan only EMPTY_SCAN_SIZE of 0xFF before declaring it's empty */ + while(ofs < max_ofs && *(uint32_t *)(&buf[ofs]) == 0xFFFFFFFF) ofs += 4; - if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) { + if (ofs == max_ofs) { #ifdef CONFIG_JFFS2_FS_WRITEBUFFER if (jffs2_cleanmarker_oob(c)) { /* scan oob, take care of cleanmarker */ diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d1ae5df..c86041b 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -179,12 +179,11 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) return ret; } -static int jffs2_get_sb(struct file_system_type *fs_type, +static struct dentry *jffs2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_mtd(fs_type, flags, dev_name, data, jffs2_fill_super, - mnt); + return mount_mtd(fs_type, flags, dev_name, data, jffs2_fill_super); } static void jffs2_put_super (struct super_block *sb) @@ -229,7 +228,7 @@ static void jffs2_kill_sb(struct super_block *sb) static struct file_system_type jffs2_fs_type = { .owner = THIS_MODULE, .name = "jffs2", - .get_sb = jffs2_get_sb, + .mount = jffs2_mount, .kill_sb = jffs2_kill_sb, }; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 68eee2b..0669fc1 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -583,11 +583,10 @@ static int jfs_unfreeze(struct super_block *sb) return 0; } -static int jfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *jfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); } static int jfs_sync_fs(struct super_block *sb, int wait) @@ -770,7 +769,7 @@ static const struct export_operations jfs_export_operations = { static struct file_system_type jfs_fs_type = { .owner = THIS_MODULE, .name = "jfs", - .get_sb = jfs_get_sb, + .mount = jfs_do_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -201,9 +201,8 @@ static const struct super_operations simple_super_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -int get_sb_pseudo(struct file_system_type *fs_type, char *name, - const struct super_operations *ops, unsigned long magic, - struct vfsmount *mnt) +struct dentry *mount_pseudo(struct file_system_type *fs_type, char *name, + const struct super_operations *ops, unsigned long magic) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); struct dentry *dentry; @@ -211,7 +210,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, struct qstr d_name = {.name = name, .len = strlen(name)}; if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); s->s_flags = MS_NOUSER; s->s_maxbytes = MAX_LFS_FILESIZE; @@ -241,12 +240,11 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); Enomem: deactivate_locked_super(s); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) @@ -951,7 +949,7 @@ EXPORT_SYMBOL(dcache_dir_lseek); EXPORT_SYMBOL(dcache_dir_open); EXPORT_SYMBOL(dcache_readdir); EXPORT_SYMBOL(generic_read_dir); -EXPORT_SYMBOL(get_sb_pseudo); +EXPORT_SYMBOL(mount_pseudo); EXPORT_SYMBOL(simple_write_begin); EXPORT_SYMBOL(simple_write_end); EXPORT_SYMBOL(simple_dir_inode_operations); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index d5bb868..25509eb 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,7 +14,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/smp_lock.h> #include <linux/kthread.h> #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 47ea1e1..332c54c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,7 +7,6 @@ */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 25e21e4..ed0c59f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) continue; if (host->h_server != ni->server) continue; - if (ni->server && + if (ni->server && ni->src_len != 0 && !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; @@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrlen = ni->salen; rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_srcaddrlen = ni->src_len; host->h_version = ni->version; host->h_proto = ni->protocol; host->h_rpcclnt = NULL; @@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const char *hostname, int noresvport) { - const struct sockaddr source = { - .sa_family = AF_UNSPEC, - }; struct nlm_lookup_host_info ni = { .server = 0, .sap = sap, @@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .version = version, .hostname = hostname, .hostname_len = strlen(hostname), - .src_sap = &source, - .src_len = sizeof(source), .noresvport = noresvport, }; @@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host) .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, - .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host) args.flags |= RPC_CLNT_CREATE_HARDRTRY; if (host->h_noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); clnt = rpc_create(&args); if (!IS_ERR(clnt)) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index a336e83..38d2611 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c462d34..ef5659b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -25,7 +25,6 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/nlm.h> diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index c3069f3..0caea53 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> @@ -122,7 +122,6 @@ #include <linux/module.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/rcupdate.h> @@ -186,7 +185,7 @@ void locks_release_private(struct file_lock *fl) EXPORT_SYMBOL_GPL(locks_release_private); /* Free a lock which is not in use. */ -static void locks_free_lock(struct file_lock *fl) +void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); BUG_ON(!list_empty(&fl->fl_block)); @@ -195,6 +194,7 @@ static void locks_free_lock(struct file_lock *fl) locks_release_private(fl); kmem_cache_free(filelock_cache, fl); } +EXPORT_SYMBOL(locks_free_lock); void locks_init_lock(struct file_lock *fl) { @@ -234,11 +234,8 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl) fl->fl_ops->fl_copy_lock(new, fl); new->fl_ops = fl->fl_ops; } - if (fl->fl_lmops) { - if (fl->fl_lmops->fl_copy_lock) - fl->fl_lmops->fl_copy_lock(new, fl); + if (fl->fl_lmops) new->fl_lmops = fl->fl_lmops; - } } /* @@ -1371,20 +1368,22 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) struct inode *inode = dentry->d_inode; int error, rdlease_count = 0, wrlease_count = 0; + lease = *flp; + + error = -EACCES; if ((current_fsuid() != inode->i_uid) && !capable(CAP_LEASE)) - return -EACCES; + goto out; + error = -EINVAL; if (!S_ISREG(inode->i_mode)) - return -EINVAL; + goto out; error = security_file_lock(filp, arg); if (error) - return error; + goto out; time_out_leases(inode); BUG_ON(!(*flp)->fl_lmops->fl_break); - lease = *flp; - if (arg != F_UNLCK) { error = -EAGAIN; if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) @@ -1425,8 +1424,9 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) goto out; if (my_before != NULL) { - *flp = *my_before; error = lease->fl_lmops->fl_change(my_before, arg); + if (!error) + *flp = *my_before; goto out; } @@ -1441,7 +1441,6 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) return 0; out: - locks_free_lock(lease); return error; } EXPORT_SYMBOL(generic_setlease); @@ -1493,21 +1492,19 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) } EXPORT_SYMBOL_GPL(vfs_setlease); -/** - * fcntl_setlease - sets a lease on an open file - * @fd: open file descriptor - * @filp: file pointer - * @arg: type of lease to obtain - * - * Call this fcntl to establish a lease on the file. - * Note that you also need to call %F_SETSIG to - * receive a signal when the lease is broken. - */ -int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +static int do_fcntl_delete_lease(struct file *filp) { - struct file_lock *fl; + struct file_lock fl, *flp = &fl; + + lease_init(filp, F_UNLCK, flp); + + return vfs_setlease(filp, F_UNLCK, &flp); +} + +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +{ + struct file_lock *fl, *ret; struct fasync_struct *new; - struct inode *inode = filp->f_path.dentry->d_inode; int error; fl = lease_alloc(filp, arg); @@ -1519,10 +1516,16 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) locks_free_lock(fl); return -ENOMEM; } + ret = fl; lock_flocks(); - error = __vfs_setlease(filp, arg, &fl); - if (error || arg == F_UNLCK) - goto out_unlock; + error = __vfs_setlease(filp, arg, &ret); + if (error) { + unlock_flocks(); + locks_free_lock(fl); + goto out_free_fasync; + } + if (ret != fl) + locks_free_lock(fl); /* * fasync_insert_entry() returns the old entry if any. @@ -1530,26 +1533,36 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) * inserted it into the fasync list. Clear new so that * we don't release it here. */ - if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) new = NULL; - if (error < 0) { - /* remove lease just inserted by setlease */ - fl->fl_type = F_UNLCK | F_INPROGRESS; - fl->fl_break_time = jiffies - 10; - time_out_leases(inode); - goto out_unlock; - } - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); -out_unlock: unlock_flocks(); + +out_free_fasync: if (new) fasync_free(new); return error; } /** + * fcntl_setlease - sets a lease on an open file + * @fd: open file descriptor + * @filp: file pointer + * @arg: type of lease to obtain + * + * Call this fcntl to establish a lease on the file. + * Note that you also need to call %F_SETSIG to + * receive a signal when the lease is broken. + */ +int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + if (arg == F_UNLCK) + return do_fcntl_delete_lease(filp); + return do_fcntl_add_lease(fd, filp, arg); +} + +/** * flock_lock_file_wait - Apply a FLOCK-style lock to a file * @filp: The file to apply the lock to * @fl: The lock to be applied diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 9bd2ce2..92ca6fb 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -298,9 +298,9 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) return sync_request(page, bdev, WRITE); } -static void bdev_put_device(struct super_block *sb) +static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE); + close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -320,8 +320,8 @@ static const struct logfs_device_ops bd_devops = { .put_device = bdev_put_device, }; -int logfs_get_sb_bdev(struct file_system_type *type, int flags, - const char *devname, struct vfsmount *mnt) +int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, + const char *devname) { struct block_device *bdev; @@ -332,8 +332,11 @@ int logfs_get_sb_bdev(struct file_system_type *type, int flags, if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); - return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + return logfs_get_sb_mtd(p, mtdnr); } - return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt); + p->s_bdev = bdev; + p->s_mtd = NULL; + p->s_devops = &bd_devops; + return 0; } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index a85d47d..7466e9d 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -230,9 +230,9 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); } -static void mtd_put_device(struct super_block *sb) +static void mtd_put_device(struct logfs_super *s) { - put_mtd_device(logfs_super(sb)->s_mtd); + put_mtd_device(s->s_mtd); } static int mtd_can_write_buf(struct super_block *sb, u64 ofs) @@ -265,14 +265,14 @@ static const struct logfs_device_ops mtd_devops = { .put_device = mtd_put_device, }; -int logfs_get_sb_mtd(struct file_system_type *type, int flags, - int mtdnr, struct vfsmount *mnt) +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) { - struct mtd_info *mtd; - const struct logfs_device_ops *devops = &mtd_devops; - - mtd = get_mtd_device(NULL, mtdnr); + struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); if (IS_ERR(mtd)) return PTR_ERR(mtd); - return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); + + s->s_bdev = NULL; + s->s_mtd = mtd; + s->s_devops = &mtd_devops; + return 0; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index b878626..57afd4a 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -136,6 +136,7 @@ struct logfs_area_ops { int (*erase_segment)(struct logfs_area *area); }; +struct logfs_super; /* forward */ /** * struct logfs_device_ops - device access operations * @@ -156,7 +157,7 @@ struct logfs_device_ops { int ensure_write); int (*can_write_buf)(struct super_block *sb, u64 ofs); void (*sync)(struct super_block *sb); - void (*put_device)(struct super_block *sb); + void (*put_device)(struct logfs_super *s); }; /** @@ -471,11 +472,13 @@ void logfs_compr_exit(void); /* dev_bdev.c */ #ifdef CONFIG_BLOCK -int logfs_get_sb_bdev(struct file_system_type *type, int flags, - const char *devname, struct vfsmount *mnt); +int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname); #else -static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, - const char *devname, struct vfsmount *mnt) +static inline int logfs_get_sb_bdev(struct logfs_super *s, + struct file_system_type *type, + const char *devname) { return -ENODEV; } @@ -483,11 +486,9 @@ static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags, /* dev_mtd.c */ #ifdef CONFIG_MTD -int logfs_get_sb_mtd(struct file_system_type *type, int flags, - int mtdnr, struct vfsmount *mnt); +int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); #else -static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags, - int mtdnr, struct vfsmount *mnt) +static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) { return -ENODEV; } @@ -619,9 +620,6 @@ void emergency_read_end(struct page *page); void logfs_crash_dump(struct super_block *sb); void *memchr_inv(const void *s, int c, size_t n); int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); -int logfs_get_sb_device(struct file_system_type *type, int flags, - struct mtd_info *mtd, struct block_device *bdev, - const struct logfs_device_ops *devops, struct vfsmount *mnt); int logfs_check_ds(struct logfs_disk_super *ds); int logfs_write_sb(struct super_block *sb); diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 5336155..33435e4 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -325,7 +325,7 @@ static int logfs_make_writeable(struct super_block *sb) return 0; } -static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) +static int logfs_get_sb_final(struct super_block *sb) { struct logfs_super *super = logfs_super(sb); struct inode *rootdir; @@ -356,7 +356,6 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) } log_super("LogFS: Finished mounting\n"); - simple_set_mnt(mnt, sb); return 0; fail: @@ -529,43 +528,37 @@ static void logfs_kill_sb(struct super_block *sb) logfs_cleanup_rw(sb); if (super->s_erase_page) __free_page(super->s_erase_page); - super->s_devops->put_device(sb); + super->s_devops->put_device(super); logfs_mempool_destroy(super->s_btree_pool); logfs_mempool_destroy(super->s_alias_pool); kfree(super); log_super("LogFS: Finished unmounting\n"); } -int logfs_get_sb_device(struct file_system_type *type, int flags, - struct mtd_info *mtd, struct block_device *bdev, - const struct logfs_device_ops *devops, struct vfsmount *mnt) +static struct dentry *logfs_get_sb_device(struct logfs_super *super, + struct file_system_type *type, int flags) { - struct logfs_super *super; struct super_block *sb; int err = -ENOMEM; static int mount_count; log_super("LogFS: Start mount %x\n", mount_count++); - super = kzalloc(sizeof(*super), GFP_KERNEL); - if (!super) - goto err0; - super->s_mtd = mtd; - super->s_bdev = bdev; err = -EINVAL; sb = sget(type, logfs_sb_test, logfs_sb_set, super); - if (IS_ERR(sb)) - goto err0; + if (IS_ERR(sb)) { + super->s_devops->put_device(super); + kfree(super); + return ERR_CAST(sb); + } if (sb->s_root) { /* Device is already in use */ - err = 0; - simple_set_mnt(mnt, sb); - goto err0; + super->s_devops->put_device(super); + kfree(super); + return dget(sb->s_root); } - super->s_devops = devops; - /* * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache * only covers 16TB and the upper 8TB are used for indirect blocks. @@ -581,10 +574,12 @@ int logfs_get_sb_device(struct file_system_type *type, int flags, goto err1; sb->s_flags |= MS_ACTIVE; - err = logfs_get_sb_final(sb, mnt); - if (err) + err = logfs_get_sb_final(sb); + if (err) { deactivate_locked_super(sb); - return err; + return ERR_PTR(err); + } + return dget(sb->s_root); err1: /* no ->s_root, no ->put_super() */ @@ -592,37 +587,45 @@ err1: iput(super->s_segfile_inode); iput(super->s_mapping_inode); deactivate_locked_super(sb); - return err; -err0: - kfree(super); - //devops->put_device(sb); - return err; + return ERR_PTR(err); } -static int logfs_get_sb(struct file_system_type *type, int flags, - const char *devname, void *data, struct vfsmount *mnt) +static struct dentry *logfs_mount(struct file_system_type *type, int flags, + const char *devname, void *data) { ulong mtdnr; + struct logfs_super *super; + int err; - if (!devname) - return logfs_get_sb_bdev(type, flags, devname, mnt); - if (strncmp(devname, "mtd", 3)) - return logfs_get_sb_bdev(type, flags, devname, mnt); + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return ERR_PTR(-ENOMEM); - { + if (!devname) + err = logfs_get_sb_bdev(super, type, devname); + else if (strncmp(devname, "mtd", 3)) + err = logfs_get_sb_bdev(super, type, devname); + else { char *garbage; mtdnr = simple_strtoul(devname+3, &garbage, 0); if (*garbage) - return -EINVAL; + err = -EINVAL; + else + err = logfs_get_sb_mtd(super, mtdnr); + } + + if (err) { + kfree(super); + return ERR_PTR(err); } - return logfs_get_sb_mtd(type, flags, mtdnr, mnt); + return logfs_get_sb_device(super, type, flags); } static struct file_system_type logfs_fs_type = { .owner = THIS_MODULE, .name = "logfs", - .get_sb = logfs_get_sb, + .mount = logfs_mount, .kill_sb = logfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e39d6bf..fb20208 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -614,17 +614,16 @@ void minix_truncate(struct inode * inode) V2_minix_truncate(inode); } -static int minix_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *minix_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, minix_fill_super); } static struct file_system_type minix_fs_type = { .owner = THIS_MODULE, .name = "minix", - .get_sb = minix_get_sb, + .mount = minix_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -1574,6 +1574,7 @@ static struct file *finish_open(struct nameidata *nd, */ if (will_truncate) mnt_drop_write(nd->path.mnt); + path_put(&nd->path); return filp; exit: @@ -1675,6 +1676,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path, } filp = nameidata_to_filp(nd); mnt_drop_write(nd->path.mnt); + path_put(&nd->path); if (!IS_ERR(filp)) { error = ima_file_check(filp, acc_mode); if (error) { diff --git a/fs/namespace.c b/fs/namespace.c index 8a415c9..3dbfc07 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -13,7 +13,6 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/percpu.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/acct.h> diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index aac8832..f22b12e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -19,7 +19,6 @@ #include <linux/mm.h> #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 6c754f7..cb50aaf 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -17,7 +17,6 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> #include "ncplib_kernel.h" diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 985fabb..8fb93b6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/seq_file.h> @@ -1020,16 +1019,16 @@ out: return result; } -static int ncp_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ncp_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt); + return mount_nodev(fs_type, flags, data, ncp_fill_super); } static struct file_system_type ncp_fs_type = { .owner = THIS_MODULE, .name = "ncpfs", - .get_sb = ncp_get_sb, + .mount = ncp_mount, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index c2a1f9a..d40a547 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -17,7 +17,6 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/highuid.h> -#include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index aeec017..93a8b3b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,7 +9,6 @@ #include <linux/completion.h> #include <linux/ip.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 232a7ee..1fd62fc 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/nfs4.h> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 07ac384..f0a384e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -34,6 +34,7 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/vmalloc.h> +#include <linux/kmemleak.h> #include "delegation.h" #include "iostat.h" @@ -161,6 +162,7 @@ struct nfs_cache_array_entry { u64 cookie; u64 ino; struct qstr string; + unsigned char d_type; }; struct nfs_cache_array { @@ -170,8 +172,6 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) - typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); typedef struct { struct file *file; @@ -194,9 +194,13 @@ typedef struct { static struct nfs_cache_array *nfs_readdir_get_array(struct page *page) { + void *ptr; if (page == NULL) return ERR_PTR(-EIO); - return (struct nfs_cache_array *)kmap(page); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; } static @@ -213,6 +217,9 @@ int nfs_readdir_clear_array(struct page *page, gfp_t mask) { struct nfs_cache_array *array = nfs_readdir_get_array(page); int i; + + if (IS_ERR(array)) + return PTR_ERR(array); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); nfs_readdir_release_array(page); @@ -231,6 +238,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le string->name = kmemdup(name, len, GFP_KERNEL); if (string->name == NULL) return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); string->hash = full_name_hash(name, len); return 0; } @@ -244,20 +256,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (IS_ERR(array)) return PTR_ERR(array); - ret = -EIO; - if (array->size >= MAX_READDIR_ARRAY) - goto out; cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); if (ret) goto out; array->last_cookie = entry->cookie; + array->size++; if (entry->eof == 1) array->eof_index = array->size; - array->size++; out: nfs_readdir_release_array(page); return ret; @@ -272,7 +288,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index > 0) + if (array->eof_index >= 0) goto out_eof; desc->current_index += array->size; return -EAGAIN; @@ -281,8 +297,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; - if (index == array->eof_index) - desc->eof = 1; return 0; out_eof: desc->eof = 1; @@ -296,17 +310,17 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int status = -EAGAIN; for (i = 0; i < array->size; i++) { - if (i == array->eof_index) { - desc->eof = 1; - status = -EBADCOOKIE; - } if (array->array[i].cookie == *desc->dir_cookie) { desc->cache_entry_index = i; status = 0; - break; + goto out; } } - + if (i == array->eof_index) { + desc->eof = 1; + status = -EBADCOOKIE; + } +out: return status; } @@ -381,13 +395,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { - struct nfs_inode *node; if (dentry->d_inode == NULL) goto different; - node = NFS_I(dentry->d_inode); - if (node->fh.size != entry->fh->size) - goto different; - if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) goto different; return 1; different: @@ -449,14 +459,15 @@ out: /* Perform conversion from xdr to cache array */ static -void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, void *xdr_page, struct page *page, unsigned int buflen) { struct xdr_stream stream; struct xdr_buf buf; __be32 *ptr = xdr_page; - int status; struct nfs_cache_array *array; + unsigned int count = 0; + int status; buf.head->iov_base = xdr_page; buf.head->iov_len = buflen; @@ -471,21 +482,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e do { status = xdr_decode(desc, entry, &stream); - if (status != 0) + if (status != 0) { + if (status == -EAGAIN) + status = 0; break; + } + + count++; - if (nfs_readdir_add_to_array(entry, page) == -1) - break; if (desc->plus == 1) nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; } while (!entry->eof); - if (status == -EBADCOOKIE && entry->eof) { + if (count == 0 || (status == -EBADCOOKIE && entry->eof == 1)) { array = nfs_readdir_get_array(page); - array->eof_index = array->size - 1; - status = 0; - nfs_readdir_release_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); } + return status; } static @@ -537,7 +559,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; - int status = 0; + int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; @@ -549,6 +571,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; @@ -556,12 +582,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (!pages_ptr) goto out_release_array; do { + unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); if (status < 0) break; - nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); - } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: @@ -582,8 +615,10 @@ static int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; - if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) goto error; SetPageUptodate(page); @@ -595,7 +630,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) return 0; error: unlock_page(page); - return -EIO; + return ret; } static @@ -608,12 +643,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) - desc->eof = 1; - return page; } /* @@ -639,8 +670,10 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { - int res = -EAGAIN; + int res; + if (desc->page_index == 0) + desc->current_index = 0; while (1) { res = find_cache_page(desc); if (res != -EAGAIN) @@ -666,35 +699,36 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; - unsigned int d_type = DT_UNKNOWN; - struct dentry *dentry = NULL; array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } for (i = desc->cache_entry_index; i < array->size; i++) { - d_type = DT_UNKNOWN; + struct nfs_cache_array_entry *ent; - res = filldir(dirent, array->array[i].string.name, - array->array[i].string.len, file->f_pos, - nfs_compat_user_ino64(array->array[i].ino), d_type); - if (res < 0) + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { + desc->eof = 1; break; + } file->f_pos++; desc->cache_entry_index = i; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; - if (i == array->eof_index) { - desc->eof = 1; - break; - } } + if (i == array->eof_index) + desc->eof = 1; nfs_readdir_release_array(desc->page); +out: cache_page_release(desc); - if (dentry != NULL) - dput(dentry); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); return res; @@ -729,13 +763,13 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } - if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { - status = -EIO; - goto out_release; - } - desc->page_index = 0; desc->page = page; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + status = nfs_do_filldir(desc, dirent, filldir); out: @@ -786,14 +820,14 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { + res = 0; /* This means either end of directory */ if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); - if (res >= 0) + if (res == 0) continue; } - res = 0; break; } if (res == -ETOOSMALL && desc->plus) { @@ -808,10 +842,8 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; res = nfs_do_filldir(desc, dirent, filldir); - if (res < 0) { - res = 0; + if (res < 0) break; - } } out: nfs_unblock_sillyrename(dentry); @@ -1345,12 +1377,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 84d3c8b..e6ace0d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, goto out; nfs_alloc_commit_data(dreq); - if (dreq->commit_data == NULL || count < wsize) + if (dreq->commit_data == NULL || count <= wsize) sync = NFS_FILE_SYNC; dreq->inode = inode; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e756075..60677f9 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -884,6 +884,5 @@ static int nfs_setlease(struct file *file, long arg, struct file_lock **fl) dprintk("NFS: setlease(%s/%s, arg=%ld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, arg); - return -EINVAL; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index db08ff3..e6356b7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page) } /* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index e6bf457..5914a19 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) struct page **page; size_t hdrlen; unsigned int pglen, recvd; - int status, nr = 0; + int status; if ((status = ntohl(*p++))) return nfs_stat_to_errno(status); @@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se entry->prev_cookie = entry->cookie; entry->cookie = ntohl(*p++); + entry->d_type = DT_UNKNOWN; + p = xdr_inline_peek(xdr, 8); if (p != NULL) entry->eof = !p[0] && p[1]; @@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d9a5e83..f6cc60f 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct page **page; size_t hdrlen; u32 recvd, pglen; - int status, nr = 0; + int status; status = ntohl(*p++); /* Decode post_op_attrs */ @@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } __be32 * @@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); + entry->d_type = DT_UNKNOWN; if (plus) { entry->fattr->valid = 0; p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); if (IS_ERR(p)) goto out_overflow_exit; + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s out_overflow: print_overflow_msg(__func__, xdr); out_overflow_exit: - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f24cdf..6a653ff 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); - if (status == 0) + if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } nfs_invalidate_atime(dir); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f313c4c..9f1826b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n xdr_read_pages(xdr, pglen); - return 0; + return pglen; } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) @@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; @@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 3600ec7..3c04504 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -39,7 +39,6 @@ #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/mnt_namespace.h> @@ -67,6 +66,12 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#ifdef CONFIG_NFS_V3 +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, @@ -260,8 +265,8 @@ static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); -static int nfs_xdev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static void nfs_put_super(struct super_block *); static void nfs_kill_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); @@ -277,7 +282,7 @@ static struct file_system_type nfs_fs_type = { struct file_system_type nfs_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs", - .get_sb = nfs_xdev_get_sb, + .mount = nfs_xdev_mount, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -302,14 +307,14 @@ static int nfs4_try_mount(int flags, const char *dev_name, struct nfs_parsed_mount_data *data, struct vfsmount *mnt); static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_remote_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_xdev_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); +static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); -static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data); static void nfs4_kill_super(struct super_block *sb); static struct file_system_type nfs4_fs_type = { @@ -323,7 +328,7 @@ static struct file_system_type nfs4_fs_type = { static struct file_system_type nfs4_remote_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_remote_get_sb, + .mount = nfs4_remote_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -331,7 +336,7 @@ static struct file_system_type nfs4_remote_fs_type = { struct file_system_type nfs4_xdev_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_xdev_get_sb, + .mount = nfs4_xdev_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -339,7 +344,7 @@ struct file_system_type nfs4_xdev_fs_type = { static struct file_system_type nfs4_remote_referral_fs_type = { .owner = THIS_MODULE, .name = "nfs4", - .get_sb = nfs4_remote_referral_get_sb, + .mount = nfs4_remote_referral_mount, .kill_sb = nfs4_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, }; @@ -2277,7 +2282,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(3); + data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) goto out_free_fh; @@ -2397,9 +2402,9 @@ static void nfs_kill_super(struct super_block *s) /* * Clone an NFS2/3 server record on xdev traversal (FSID-change) */ -static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry * +nfs_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; struct super_block *s; @@ -2411,7 +2416,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, }; int error; - dprintk("--> nfs_xdev_get_sb()\n"); + dprintk("--> nfs_xdev_mount()\n"); /* create a new volume representation */ server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); @@ -2458,28 +2463,26 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, } s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; /* clone any lsm security options from the parent to the new sb */ security_sb_clone_mnt_opts(data->sb, s); - dprintk("<-- nfs_xdev_get_sb() = 0\n"); - return 0; + dprintk("<-- nfs_xdev_mount() = 0\n"); + return mntroot; out_err_nosb: nfs_free_server(server); out_err_noserver: - dprintk("<-- nfs_xdev_get_sb() = %d [error]\n", error); - return error; + dprintk("<-- nfs_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); error_splat_super: if (server && !s->s_root) bdi_unregister(&server->backing_dev_info); error_splat_bdi: deactivate_locked_super(s); - dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); - return error; + dprintk("<-- nfs_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); } #ifdef CONFIG_NFS_V4 @@ -2649,8 +2652,9 @@ out_no_address: /* * Get the superblock for the NFS4 root partition */ -static int nfs4_remote_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) +static struct dentry * +nfs4_remote_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_parsed_mount_data *data = raw_data; struct super_block *s; @@ -2714,15 +2718,16 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type, goto error_splat_root; s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; - error = 0; + + security_free_mnt_opts(&data->lsm_opts); + nfs_free_fhandle(mntfh); + return mntroot; out: security_free_mnt_opts(&data->lsm_opts); out_free_fh: nfs_free_fhandle(mntfh); - return error; + return ERR_PTR(error); out_free: nfs_free_server(server); @@ -2968,9 +2973,9 @@ static void nfs4_kill_super(struct super_block *sb) /* * Clone an NFS4 server record on xdev traversal (FSID-change) */ -static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry * +nfs4_xdev_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; struct super_block *s; @@ -2982,7 +2987,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, }; int error; - dprintk("--> nfs4_xdev_get_sb()\n"); + dprintk("--> nfs4_xdev_mount()\n"); /* create a new volume representation */ server = nfs_clone_server(NFS_SB(data->sb), data->fh, data->fattr); @@ -3029,32 +3034,30 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, } s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; security_sb_clone_mnt_opts(data->sb, s); - dprintk("<-- nfs4_xdev_get_sb() = 0\n"); - return 0; + dprintk("<-- nfs4_xdev_mount() = 0\n"); + return mntroot; out_err_nosb: nfs_free_server(server); out_err_noserver: - dprintk("<-- nfs4_xdev_get_sb() = %d [error]\n", error); - return error; + dprintk("<-- nfs4_xdev_mount() = %d [error]\n", error); + return ERR_PTR(error); error_splat_super: if (server && !s->s_root) bdi_unregister(&server->backing_dev_info); error_splat_bdi: deactivate_locked_super(s); - dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); - return error; + dprintk("<-- nfs4_xdev_mount() = %d [splat]\n", error); + return ERR_PTR(error); } -static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data, - struct vfsmount *mnt) +static struct dentry * +nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) { struct nfs_clone_mount *data = raw_data; struct super_block *s; @@ -3118,14 +3121,12 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; - mnt->mnt_sb = s; - mnt->mnt_root = mntroot; security_sb_clone_mnt_opts(data->sb, s); nfs_free_fhandle(mntfh); dprintk("<-- nfs4_referral_get_sb() = 0\n"); - return 0; + return mntroot; out_err_nosb: nfs_free_server(server); @@ -3133,7 +3134,7 @@ out_err_noserver: nfs_free_fhandle(mntfh); out_err_nofh: dprintk("<-- nfs4_referral_get_sb() = %d [error]\n", error); - return error; + return ERR_PTR(error); error_splat_super: if (server && !s->s_root) @@ -3142,7 +3143,7 @@ error_splat_bdi: deactivate_locked_super(s); nfs_free_fhandle(mntfh); dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); - return error; + return ERR_PTR(error); } /* diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 9a16bad..7bdec85 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -444,9 +444,9 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir, /* set up nfs_renamedata */ data->old_dir = old_dir; - atomic_inc(&old_dir->i_count); + ihold(old_dir); data->new_dir = new_dir; - atomic_inc(&new_dir->i_count); + ihold(new_dir); data->old_dentry = dget(old_dentry); data->new_dentry = dget(new_dentry); nfs_fattr_init(&data->old_fattr); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 56347e0..116cab9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) spin_unlock(&clp->cl_lock); } -static void nfsd4_register_conn(struct nfsd4_conn *conn) +static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; - register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) { struct nfsd4_conn *conn; u32 flags = NFS4_CDFC4_FORE; + int ret; if (ses->se_flags & SESSION4_BACK_CHAN) flags |= NFS4_CDFC4_BACK; @@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); - nfsd4_register_conn(conn); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); return nfs_ok; } @@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; + int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); @@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi } __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); - nfsd4_register_conn(new); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); return; } @@ -2254,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) * Spawn a thread to perform a recall on the delegation represented * by the lease (file_lock) * - * Called from break_lease() with lock_kernel() held. + * Called from break_lease() with lock_flocks() held. * Note: we assume break_lease will only call this *once* for any given * lease. */ @@ -2278,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) list_add_tail(&dp->dl_recall_lru, &del_recall_lru); spin_unlock(&recall_lock); - /* only place dl_time is set. protected by lock_kernel*/ + /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); /* @@ -2295,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) /* * The file_lock is being reapd. * - * Called by locks_free_lock() with lock_kernel() held. + * Called by locks_free_lock() with lock_flocks() held. */ static void nfsd_release_deleg_cb(struct file_lock *fl) @@ -2310,23 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl) } /* - * Set the delegation file_lock back pointer. - * - * Called from setlease() with lock_kernel() held. - */ -static -void nfsd_copy_lock_deleg_cb(struct file_lock *new, struct file_lock *fl) -{ - struct nfs4_delegation *dp = (struct nfs4_delegation *)new->fl_owner; - - dprintk("NFSD: nfsd_copy_lock_deleg_cb: new fl %p dp %p\n", new, dp); - if (!dp) - return; - dp->dl_flock = new; -} - -/* - * Called from setlease() with lock_kernel() held + * Called from setlease() with lock_flocks() held */ static int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) @@ -2355,7 +2347,6 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) static const struct lock_manager_operations nfsd_lease_mng_ops = { .fl_break = nfsd_break_deleg_cb, .fl_release_private = nfsd_release_deleg_cb, - .fl_copy_lock = nfsd_copy_lock_deleg_cb, .fl_mylease = nfsd_same_client_deleg_cb, .fl_change = nfsd_change_deleg_cb, }; @@ -2661,12 +2652,15 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta fl->fl_file = find_readable_file(stp->st_file); BUG_ON(!fl->fl_file); fl->fl_pid = current->tgid; + dp->dl_flock = fl; /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callbacks fl_mylease and fl_change are used */ if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); + dp->dl_flock = NULL; + locks_free_lock(fl); unhash_delegation(dp); flag = NFS4_OPEN_DELEGATE_NONE; goto out; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d6dc3f6..4514ebb 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1405,16 +1405,16 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) return simple_fill_super(sb, 0x6e667364, nfsd_files); } -static int nfsd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *nfsd_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); + return mount_single(fs_type, flags, data, nfsd_fill_super); } static struct file_system_type nfsd_fs_type = { .owner = THIS_MODULE, .name = "nfsd", - .get_sb = nfsd_get_sb, + .mount = nfsd_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 49c844d..59e5fe7 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) * the device at this point. * * To prevent nilfs_dat_translate() from returning the - * uncommited block number, this makes a copy of the entry + * uncommitted block number, this makes a copy of the entry * buffer and redirects nilfs_dat_translate() to the copy. */ if (!buffer_nilfs_redirected(entry_bh)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3e90f86..e00d945 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -349,8 +349,8 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, ino = vdesc->vd_ino; cno = vdesc->vd_cno; inode = nilfs_iget_for_gc(sb, ino, cno); - if (unlikely(inode == NULL)) { - ret = -ENOMEM; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); goto failed; } do { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 35ae03c..f804d41 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1141,9 +1141,9 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } -static int -nilfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry * +nilfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { struct nilfs_super_data sd; struct super_block *s; @@ -1156,7 +1156,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) - return PTR_ERR(sd.bdev); + return ERR_CAST(sd.bdev); sd.cno = 0; sd.flags = flags; @@ -1235,9 +1235,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, if (!s_new) close_bdev_exclusive(sd.bdev, mode); - mnt->mnt_sb = s; - mnt->mnt_root = root_dentry; - return 0; + return root_dentry; failed_super: deactivate_locked_super(s); @@ -1245,13 +1243,13 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, failed: if (!s_new) close_bdev_exclusive(sd.bdev, mode); - return err; + return ERR_PTR(err); } struct file_system_type nilfs_fs_type = { .owner = THIS_MODULE, .name = "nilfs2", - .get_sb = nilfs_get_sb, + .mount = nilfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index b388443..22c629e 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -3,4 +3,4 @@ config FSNOTIFY source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" -#source "fs/notify/fanotify/Kconfig" +source "fs/notify/fanotify/Kconfig" diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 85366c7..b04f88e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -131,6 +131,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM); + BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); pr_debug("%s: group=%p event=%p\n", __func__, group, event); @@ -160,20 +161,21 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, __u32 event_mask, void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; + struct path *path = data; pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p " "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, inode_mark, vfsmnt_mark, event_mask, data, data_type); - /* sorry, fanotify only gives a damn about files and dirs */ - if (!S_ISREG(to_tell->i_mode) && - !S_ISDIR(to_tell->i_mode)) - return false; - /* if we don't have enough info to send an event to userspace say no */ if (data_type != FSNOTIFY_EVENT_PATH) return false; + /* sorry, fanotify only gives a damn about files and dirs */ + if (!S_ISREG(path->dentry->d_inode->i_mode) && + !S_ISDIR(path->dentry->d_inode->i_mode)) + return false; + if (inode_mark && vfsmnt_mark) { marks_mask = (vfsmnt_mark->mask | inode_mark->mask); marks_ignored_mask = (vfsmnt_mark->ignored_mask | inode_mark->ignored_mask); @@ -194,16 +196,29 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, BUG(); } + if (S_ISDIR(path->dentry->d_inode->i_mode) && + (marks_ignored_mask & FS_ISDIR)) + return false; + if (event_mask & marks_mask & ~marks_ignored_mask) return true; return false; } +static void fanotify_free_group_priv(struct fsnotify_group *group) +{ + struct user_struct *user; + + user = group->fanotify_data.user; + atomic_dec(&user->fanotify_listeners); + free_uid(user); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .should_send_event = fanotify_should_send_event, - .free_group_priv = NULL, + .free_group_priv = fanotify_free_group_priv, .free_event_priv = NULL, .freeing_mark = NULL, }; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index bbcb98e..0632248 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -16,6 +16,10 @@ #include <asm/ioctls.h> +#define FANOTIFY_DEFAULT_MAX_EVENTS 16384 +#define FANOTIFY_DEFAULT_MAX_MARKS 8192 +#define FANOTIFY_DEFAULT_MAX_LISTENERS 128 + extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; @@ -326,7 +330,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break; - ret = -EINTR; + ret = -ERESTARTSYS; if (signal_pending(current)) break; @@ -372,11 +376,10 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; - struct fanotify_response_event *re, *lre; - - pr_debug("%s: file=%p group=%p\n", __func__, file, group); #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + struct fanotify_response_event *re, *lre; + mutex_lock(&group->fanotify_data.access_mutex); group->fanotify_data.bypass_perm = true; @@ -554,18 +557,24 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, __u32 mask, unsigned int flags) { - __u32 oldmask; + __u32 oldmask = -1; spin_lock(&fsn_mark->lock); if (!(flags & FAN_MARK_IGNORED_MASK)) { oldmask = fsn_mark->mask; fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask)); } else { - oldmask = fsn_mark->ignored_mask; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask | mask)); + __u32 tmask = fsn_mark->ignored_mask | mask; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } + + if (!(flags & FAN_MARK_ONDIR)) { + __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR; + fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + } + spin_unlock(&fsn_mark->lock); return mask & ~oldmask; @@ -582,6 +591,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, if (!fsn_mark) { int ret; + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!fsn_mark) return -ENOMEM; @@ -610,10 +622,23 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); + /* + * If some other task has this inode open for write we should not add + * an ignored mark, unless that ignored mark is supposed to survive + * modification changes anyway. + */ + if ((flags & FAN_MARK_IGNORED_MASK) && + !(flags & FAN_MARK_IGNORED_SURV_MODIFY) && + (atomic_read(&inode->i_writecount) > 0)) + return 0; + fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { int ret; + if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) + return -ENOSPC; + fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!fsn_mark) return -ENOMEM; @@ -637,6 +662,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { struct fsnotify_group *group; int f_flags, fd; + struct user_struct *user; pr_debug("%s: flags=%d event_f_flags=%d\n", __func__, flags, event_f_flags); @@ -647,6 +673,12 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; + user = get_current_user(); + if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { + free_uid(user); + return -EMFILE; + } + f_flags = O_RDWR | FMODE_NONOTIFY; if (flags & FAN_CLOEXEC) f_flags |= O_CLOEXEC; @@ -658,12 +690,47 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (IS_ERR(group)) return PTR_ERR(group); + group->fanotify_data.user = user; + atomic_inc(&user->fanotify_listeners); + group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); #endif + switch (flags & FAN_ALL_CLASS_BITS) { + case FAN_CLASS_NOTIF: + group->priority = FS_PRIO_0; + break; + case FAN_CLASS_CONTENT: + group->priority = FS_PRIO_1; + break; + case FAN_CLASS_PRE_CONTENT: + group->priority = FS_PRIO_2; + break; + default: + fd = -EINVAL; + goto out_put_group; + } + + if (flags & FAN_UNLIMITED_QUEUE) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->max_events = UINT_MAX; + } else { + group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; + } + + if (flags & FAN_UNLIMITED_MARKS) { + fd = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out_put_group; + group->fanotify_data.max_marks = UINT_MAX; + } else { + group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; + } fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags); if (fd < 0) @@ -704,6 +771,12 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, default: return -EINVAL; } + + if (mask & FAN_ONDIR) { + flags |= FAN_MARK_ONDIR; + mask &= ~FAN_ONDIR; + } + #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & ~(FAN_ALL_EVENTS | FAN_ALL_PERM_EVENTS | FAN_EVENT_ON_CHILD)) #else @@ -719,6 +792,16 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, ret = -EINVAL; if (unlikely(filp->f_op != &fanotify_fops)) goto fput_and_out; + group = filp->private_data; + + /* + * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not + * allowed to set permissions events. + */ + ret = -EINVAL; + if (mask & FAN_ALL_PERM_EVENTS && + group->priority == FS_PRIO_0) + goto fput_and_out; ret = fanotify_find_path(dfd, pathname, &path, flags); if (ret) @@ -729,7 +812,6 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, inode = path.dentry->d_inode; else mnt = path.mnt; - group = filp->private_data; /* create/update an inode mark */ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 4498a20..20dc218 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -84,16 +84,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. */ -void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; + int ret = 0; if (!dentry) dentry = path->dentry; if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED)) - return; + return 0; parent = dget_parent(dentry); p_inode = parent->d_inode; @@ -106,14 +107,16 @@ void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) mask |= FS_EVENT_ON_CHILD; if (path) - fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, - dentry->d_name.name, 0); + ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH, + dentry->d_name.name, 0); else - fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, - dentry->d_name.name, 0); + ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE, + dentry->d_name.name, 0); } dput(parent); + + return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); @@ -252,20 +255,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, if (inode_group > vfsmount_group) { /* handle inode */ - send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, + data_is, cookie, file_name, &event); /* we didn't use the vfsmount_mark */ vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { - send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, - data_is, cookie, file_name, &event); + ret = send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, + data_is, cookie, file_name, &event); inode_group = NULL; } else { - send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, - mask, data, data_is, cookie, file_name, - &event); + ret = send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, + mask, data, data_is, cookie, file_name, + &event); } + if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) + goto out; + if (inode_group) inode_node = srcu_dereference(inode_node->next, &fsnotify_mark_srcu); @@ -273,7 +279,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, vfsmount_node = srcu_dereference(vfsmount_node->next, &fsnotify_mark_srcu); } - + ret = 0; +out: srcu_read_unlock(&fsnotify_mark_srcu, idx); /* * fsnotify_create_event() took a reference so the event can't be cleaned diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 21ed106..4c29fcf 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -177,7 +177,8 @@ void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, * Attach an initialized mark to a given inode. * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group and for which inodes. These - * marks are ordered according to the group's location in memory. + * marks are ordered according to priority, highest number first, and then by + * the group's location in memory. */ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, @@ -211,7 +212,11 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, goto out; } - if (mark->group < lmark->group) + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) continue; hlist_add_before_rcu(&mark->i.i_list, &lmark->i.i_list); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 24edc11..444c305 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -862,7 +862,7 @@ static int __init inotify_user_setup(void) BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED); BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK); - BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR); + BUILD_BUG_ON(IN_ISDIR != FS_ISDIR); BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT); BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 56772b5..85eebff 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -169,7 +169,11 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, goto out; } - if (mark->group < lmark->group) + if (mark->group->priority < lmark->group->priority) + continue; + + if ((mark->group->priority == lmark->group->priority) && + (mark->group < lmark->group)) continue; hlist_add_before_rcu(&mark->m.m_list, &lmark->m.m_list); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d3fbe57..a30ecacc 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3059,17 +3059,16 @@ struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide mutex. */ DEFINE_MUTEX(ntfs_lock); -static int ntfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); } static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs", - .get_sb = ntfs_get_sb, + .mount = ntfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 75e115f..b2df490 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -643,16 +643,16 @@ static const struct inode_operations dlmfs_file_inode_operations = { .setattr = dlmfs_file_setattr, }; -static int dlmfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *dlmfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); + return mount_nodev(fs_type, flags, data, dlmfs_fill_super); } static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .get_sb = dlmfs_get_sb, + .mount = dlmfs_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d840821..1efea36 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -159,7 +159,9 @@ struct ocfs2_lock_res { char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; - unsigned char l_level; + char l_level; + char l_requested; + char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; @@ -169,8 +171,6 @@ struct ocfs2_lock_res { unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; - unsigned char l_requested; - unsigned char l_blocking; unsigned int l_pending_gen; spinlock_t l_lock; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 56f0cb3..cfeab7c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,7 +41,6 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> -#include <linux/smp_lock.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> @@ -1236,14 +1235,12 @@ read_super_error: return status; } -static int ocfs2_get_sb(struct file_system_type *fs_type, +static struct dentry *ocfs2_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, - struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } static void ocfs2_kill_sb(struct super_block *sb) @@ -1267,8 +1264,7 @@ out: static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", - .get_sb = ocfs2_get_sb, /* is this called when we mount - * the fs? */ + .mount = ocfs2_mount, .kill_sb = ocfs2_kill_sb, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 14a2286..e043c4c 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -557,17 +557,16 @@ end: return ret; } -static int omfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct vfsmount *m) +static struct dentry *omfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m); + return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); } static struct file_system_type omfs_fs_type = { .owner = THIS_MODULE, .name = "omfs", - .get_sb = omfs_get_sb, + .mount = omfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -786,11 +786,11 @@ struct file *nameidata_to_filp(struct nameidata *nd) /* Pick up the filp from the open intent */ filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? */ - if (filp->f_path.dentry == NULL) + if (filp->f_path.dentry == NULL) { + path_get(&nd->path); filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); - else - path_put(&nd->path); + } return filp; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ffcd04f..911e61f 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -415,16 +415,16 @@ out_no_root: return ret; } -static int openprom_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *openprom_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); + return mount_single(fs_type, flags, data, openprom_fill_super); } static struct file_system_type openprom_fs_type = { .owner = THIS_MODULE, .name = "openpromfs", - .get_sb = openprom_get_sb, + .mount = openprom_mount, .kill_sb = kill_anon_super, }; @@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, return ret; } +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; long ret; - pipe = file->f_path.dentry->d_inode->i_pipe; + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -1247,16 +1259,15 @@ out: * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static int pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *pipefs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); + return mount_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } static struct file_system_type pipe_fs_type = { .name = "pipefs", - .get_sb = pipefs_get_sb, + .mount = pipefs_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9c2b5f4..3ddb606 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -16,7 +16,6 @@ #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sysctl.h> #include <linux/slab.h> diff --git a/fs/proc/root.c b/fs/proc/root.c index 93d99b3..ef9fa8e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -35,8 +35,8 @@ static int proc_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -static int proc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *proc_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { int err; struct super_block *sb; @@ -61,14 +61,14 @@ static int proc_get_sb(struct file_system_type *fs_type, sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); if (!sb->s_root) { sb->s_flags = flags; err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); - return err; + return ERR_PTR(err); } ei = PROC_I(sb->s_root->d_inode); @@ -79,11 +79,9 @@ static int proc_get_sb(struct file_system_type *fs_type, } sb->s_flags |= MS_ACTIVE; - ns->proc_mnt = mnt; } - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } static void proc_kill_sb(struct super_block *sb) @@ -97,7 +95,7 @@ static void proc_kill_sb(struct super_block *sb) static struct file_system_type proc_fs_type = { .name = "proc", - .get_sb = proc_get_sb, + .mount = proc_mount, .kill_sb = proc_kill_sb, }; @@ -115,6 +113,7 @@ void __init proc_root_init(void) return; } + init_pid_ns.proc_mnt = proc_mnt; proc_symlink("mounts", NULL, "self/mounts"); proc_net_init(); @@ -213,6 +212,7 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) if (IS_ERR(mnt)) return PTR_ERR(mnt); + ns->proc_mnt = mnt; return 0; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index da6b01d..c126c83 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * skip over unmapped regions. */ #define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = start_vaddr + PAGEMAP_WALK_SIZE; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 01bad30..fcada42 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -454,17 +454,16 @@ static void destroy_inodecache(void) kmem_cache_destroy(qnx4_inode_cachep); } -static int qnx4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *qnx4_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); } static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", - .get_sb = qnx4_get_sb, + .mount = qnx4_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 67fadb1..eacb166 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -255,17 +255,16 @@ fail: return err; } -int ramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +struct dentry *ramfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt); + return mount_nodev(fs_type, flags, data, ramfs_fill_super); } -static int rootfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super, - mnt); + return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); } static void ramfs_kill_sb(struct super_block *sb) @@ -276,12 +275,12 @@ static void ramfs_kill_sb(struct super_block *sb) static struct file_system_type ramfs_fs_type = { .name = "ramfs", - .get_sb = ramfs_get_sb, + .mount = ramfs_mount, .kill_sb = ramfs_kill_sb, }; static struct file_system_type rootfs_fs_type = { .name = "rootfs", - .get_sb = rootfs_get_sb, + .mount = rootfs_mount, .kill_sb = kill_litter_super, }; diff --git a/fs/read_write.c b/fs/read_write.c index 9cd9d14..5d431ba 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> -#include <linux/smp_lock.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/module.h> @@ -243,8 +242,6 @@ bad: * them to something that fits in "int" so that others * won't have to do range checks all the time. */ -#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK) - int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count) { struct inode *inode; @@ -584,65 +581,71 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, struct iovec **ret_pointer) - { +{ unsigned long seg; - ssize_t ret; + ssize_t ret; struct iovec *iov = fast_pointer; - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ if (nr_segs == 0) { ret = 0; - goto out; + goto out; } - /* - * First get the "struct iovec" from user memory and - * verify all the pointers - */ + /* + * First get the "struct iovec" from user memory and + * verify all the pointers + */ if (nr_segs > UIO_MAXIOV) { ret = -EINVAL; - goto out; + goto out; } if (nr_segs > fast_segs) { - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); if (iov == NULL) { ret = -ENOMEM; - goto out; + goto out; } - } + } if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { ret = -EFAULT; - goto out; + goto out; } - /* + /* * According to the Single Unix Specification we should return EINVAL * if an element length is < 0 when cast to ssize_t or if the * total length would overflow the ssize_t return value of the * system call. - */ + * + * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the + * overflow case. + */ ret = 0; - for (seg = 0; seg < nr_segs; seg++) { - void __user *buf = iov[seg].iov_base; - ssize_t len = (ssize_t)iov[seg].iov_len; + for (seg = 0; seg < nr_segs; seg++) { + void __user *buf = iov[seg].iov_base; + ssize_t len = (ssize_t)iov[seg].iov_len; /* see if we we're about to use an invalid len or if * it's about to overflow ssize_t */ - if (len < 0 || (ret + len < ret)) { + if (len < 0) { ret = -EINVAL; - goto out; + goto out; } if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; - goto out; + goto out; + } + if (len > MAX_RW_COUNT - ret) { + len = MAX_RW_COUNT - ret; + iov[seg].iov_len = len; } - ret += len; - } + } out: *ret_pointer = iov; return ret; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 41656d4..0bae036 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -8,7 +8,6 @@ #include <linux/reiserfs_acl.h> #include <linux/reiserfs_xattr.h> #include <linux/exportfs.h> -#include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index adf22b4..79265fd 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -9,7 +9,6 @@ #include <linux/time.h> #include <asm/uaccess.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/compat.h> /* @@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - /* we need to make sure nobody is changing the file size beneath - ** us - */ - reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); depth = reiserfs_write_lock_once(inode->i_sb); + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ if (write_from == 0) { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 076c8b1..d31bce1 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -43,7 +43,6 @@ #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/writeback.h> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e15ff61..b243117 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,7 +28,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/crc32.h> -#include <linux/smp_lock.h> struct file_system_type reiserfs_fs_type; @@ -2213,12 +2212,11 @@ out: #endif -static int get_super_block(struct file_system_type *fs_type, +static struct dentry *get_super_block(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); } static int __init init_reiserfs_fs(void) @@ -2253,7 +2251,7 @@ static void __exit exit_reiserfs_fs(void) struct file_system_type reiserfs_fs_type = { .owner = THIS_MODULE, .name = "reiserfs", - .get_sb = get_super_block, + .mount = get_super_block, .kill_sb = reiserfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 536d697..90d2fcb 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode) struct reiserfs_transaction_handle th; size_t size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(clone->a_count)); - reiserfs_write_lock(inode->i_sb); + int depth; + + depth = reiserfs_write_lock_once(inode->i_sb); error = journal_begin(&th, inode->i_sb, size * 2); if (!error) { int error2; @@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode) if (error2) error = error2; } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } posix_acl_release(clone); return error; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 2685805..6647f90 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -552,20 +552,19 @@ error_rsb: /* * get a superblock for mounting */ -static int romfs_get_sb(struct file_system_type *fs_type, +static struct dentry *romfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, struct vfsmount *mnt) + void *data) { - int ret = -EINVAL; + struct dentry *ret = ERR_PTR(-EINVAL); #ifdef CONFIG_ROMFS_ON_MTD - ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super, - mnt); + ret = mount_mtd(fs_type, flags, dev_name, data, romfs_fill_super); #endif #ifdef CONFIG_ROMFS_ON_BLOCK - if (ret == -EINVAL) - ret = get_sb_bdev(fs_type, flags, dev_name, data, - romfs_fill_super, mnt); + if (ret == ERR_PTR(-EINVAL)) + ret = mount_bdev(fs_type, flags, dev_name, data, + romfs_fill_super); #endif return ret; } @@ -592,7 +591,7 @@ static void romfs_kill_sb(struct super_block *sb) static struct file_system_type romfs_fs_type = { .owner = THIS_MODULE, .name = "romfs", - .get_sb = romfs_get_sb, + .mount = romfs_mount, .kill_sb = romfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/splice.c b/fs/splice.c index 8f1dfae..ce2f025 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -/* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. - */ -static inline struct pipe_inode_info *pipe_info(struct inode *inode) -{ - if (S_ISFIFO(inode->i_mode)) - return inode->i_pipe; - - return NULL; -} /* * Determine where to splice to/from. @@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - ipipe = pipe_info(in->f_path.dentry->d_inode); - opipe = pipe_info(out->f_path.dentry->d_inode); + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); if (ipipe && opipe) { if (off_in || off_out) @@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, int error; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, }; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); - struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; /* diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 07a4f11..24de30b 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -370,12 +370,10 @@ static void squashfs_put_super(struct super_block *sb) } -static int squashfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); } @@ -451,7 +449,7 @@ static void squashfs_destroy_inode(struct inode *inode) static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", - .get_sb = squashfs_get_sb, + .mount = squashfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV }; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 652b854..3876c36 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -158,17 +158,18 @@ static int squashfs_xattr_get(struct inode *inode, int name_index, strncmp(target, name, name_size) == 0) { /* found xattr */ if (type & SQUASHFS_XATTR_VALUE_OOL) { - __le64 xattr; + __le64 xattr_val; + u64 xattr; /* val is a reference to the real location */ err = squashfs_read_metadata(sb, &val, &start, &offset, sizeof(val)); if (err < 0) goto failed; - err = squashfs_read_metadata(sb, &xattr, &start, - &offset, sizeof(xattr)); + err = squashfs_read_metadata(sb, &xattr_val, + &start, &offset, sizeof(xattr_val)); if (err < 0) goto failed; - xattr = le64_to_cpu(xattr); + xattr = le64_to_cpu(xattr_val); start = SQUASHFS_XATTR_BLK(xattr) + msblk->xattr_table; offset = SQUASHFS_XATTR_OFFSET(xattr); diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h index 49fe0d7..b634efc 100644 --- a/fs/squashfs/xattr.h +++ b/fs/squashfs/xattr.h @@ -25,7 +25,7 @@ extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, u64 *, int *); extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, - int *, unsigned long long *); + unsigned int *, unsigned long long *); #else static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start, u64 *xattr_table_start, int *xattr_ids) @@ -35,7 +35,7 @@ static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, } static inline int squashfs_xattr_lookup(struct super_block *sb, - unsigned int index, int *count, int *size, + unsigned int index, int *count, unsigned int *size, unsigned long long *xattr) { return 0; diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c index cfb4110..d33be5d 100644 --- a/fs/squashfs/xattr_id.c +++ b/fs/squashfs/xattr_id.c @@ -34,6 +34,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "xattr.h" /* * Map xattr id using the xattr id look up table @@ -715,15 +715,14 @@ static int ns_set_super(struct super_block *sb, void *data) return set_anon_super(sb, NULL); } -int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) +struct dentry *mount_ns(struct file_system_type *fs_type, int flags, + void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *sb; sb = sget(fs_type, ns_test_super, ns_set_super, data); if (IS_ERR(sb)) - return PTR_ERR(sb); + return ERR_CAST(sb); if (!sb->s_root) { int err; @@ -731,17 +730,16 @@ int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) { deactivate_locked_super(sb); - return err; + return ERR_PTR(err); } sb->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); } -EXPORT_SYMBOL(get_sb_ns); +EXPORT_SYMBOL(mount_ns); #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) @@ -762,10 +760,9 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } -int get_sb_bdev(struct file_system_type *fs_type, +struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) + int (*fill_super)(struct super_block *, void *, int)) { struct block_device *bdev; struct super_block *s; @@ -777,7 +774,7 @@ int get_sb_bdev(struct file_system_type *fs_type, bdev = open_bdev_exclusive(dev_name, mode, fs_type); if (IS_ERR(bdev)) - return PTR_ERR(bdev); + return ERR_CAST(bdev); /* * once the super is inserted into the list by sget, s_umount @@ -829,15 +826,30 @@ int get_sb_bdev(struct file_system_type *fs_type, bdev->bd_super = s; } - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); error_s: error = PTR_ERR(s); error_bdev: close_bdev_exclusive(bdev, mode); error: - return error; + return ERR_PTR(error); +} +EXPORT_SYMBOL(mount_bdev); + +int get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct dentry *root; + + root = mount_bdev(fs_type, flags, dev_name, data, fill_super); + if (IS_ERR(root)) + return PTR_ERR(root); + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + return 0; } EXPORT_SYMBOL(get_sb_bdev); @@ -856,29 +868,42 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); #endif -int get_sb_nodev(struct file_system_type *fs_type, +struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) + int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - return error; + return ERR_PTR(error); } s->s_flags |= MS_ACTIVE; - simple_set_mnt(mnt, s); - return 0; + return dget(s->s_root); } +EXPORT_SYMBOL(mount_nodev); +int get_sb_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct dentry *root; + + root = mount_nodev(fs_type, flags, data, fill_super); + if (IS_ERR(root)) + return PTR_ERR(root); + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + return 0; +} EXPORT_SYMBOL(get_sb_nodev); static int compare_single(struct super_block *s, void *p) @@ -886,29 +911,42 @@ static int compare_single(struct super_block *s, void *p) return 1; } -int get_sb_single(struct file_system_type *fs_type, +struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) + int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, NULL); if (IS_ERR(s)) - return PTR_ERR(s); + return ERR_CAST(s); if (!s->s_root) { s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - return error; + return ERR_PTR(error); } s->s_flags |= MS_ACTIVE; } else { do_remount_sb(s, flags, data, 0); } - simple_set_mnt(mnt, s); + return dget(s->s_root); +} +EXPORT_SYMBOL(mount_single); + +int get_sb_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct dentry *root; + root = mount_single(fs_type, flags, data, fill_super); + if (IS_ERR(root)) + return PTR_ERR(root); + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; return 0; } @@ -918,6 +956,7 @@ struct vfsmount * vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct vfsmount *mnt; + struct dentry *root; char *secdata = NULL; int error; @@ -942,9 +981,19 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void goto out_free_secdata; } - error = type->get_sb(type, flags, name, data, mnt); - if (error < 0) - goto out_free_secdata; + if (type->mount) { + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; + } + mnt->mnt_root = root; + mnt->mnt_sb = root->d_sb; + } else { + error = type->get_sb(type, flags, name, data, mnt); + if (error < 0) + goto out_free_secdata; + } BUG_ON(!mnt->mnt_sb); WARN_ON(!mnt->mnt_sb->s_bdi); mnt->mnt_sb->s_flags |= MS_BORN; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f2af225..2668957 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -23,7 +23,7 @@ #include "sysfs.h" -static struct vfsmount *sysfs_mount; +static struct vfsmount *sysfs_mnt; struct kmem_cache *sysfs_dir_cachep; static const struct super_operations sysfs_ops = { @@ -95,18 +95,17 @@ static int sysfs_set_super(struct super_block *sb, void *data) return error; } -static int sysfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *sysfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { struct sysfs_super_info *info; enum kobj_ns_type type; struct super_block *sb; int error; - error = -ENOMEM; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) - goto out; + return ERR_PTR(-ENOMEM); for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) info->ns[type] = kobj_ns_current(type); @@ -114,24 +113,19 @@ static int sysfs_get_sb(struct file_system_type *fs_type, sb = sget(fs_type, sysfs_test_super, sysfs_set_super, info); if (IS_ERR(sb) || sb->s_fs_info != info) kfree(info); - if (IS_ERR(sb)) { - error = PTR_ERR(sb); - goto out; - } + if (IS_ERR(sb)) + return ERR_CAST(sb); if (!sb->s_root) { sb->s_flags = flags; error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(sb); - goto out; + return ERR_PTR(error); } sb->s_flags |= MS_ACTIVE; } - simple_set_mnt(mnt, sb); - error = 0; -out: - return error; + return dget(sb->s_root); } static void sysfs_kill_sb(struct super_block *sb) @@ -147,7 +141,7 @@ static void sysfs_kill_sb(struct super_block *sb) static struct file_system_type sysfs_fs_type = { .name = "sysfs", - .get_sb = sysfs_get_sb, + .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, }; @@ -189,11 +183,11 @@ int __init sysfs_init(void) err = register_filesystem(&sysfs_fs_type); if (!err) { - sysfs_mount = kern_mount(&sysfs_fs_type); - if (IS_ERR(sysfs_mount)) { + sysfs_mnt = kern_mount(&sysfs_fs_type); + if (IS_ERR(sysfs_mnt)) { printk(KERN_ERR "sysfs: could not mount!\n"); - err = PTR_ERR(sysfs_mount); - sysfs_mount = NULL; + err = PTR_ERR(sysfs_mnt); + sysfs_mnt = NULL; unregister_filesystem(&sysfs_fs_type); goto out_err; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index a0b0cda..3d9c62b 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -526,23 +526,22 @@ failed: /* Every kernel module contains stuff like this. */ -static int sysv_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *sysv_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); } -static int v7_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *v7_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); } static struct file_system_type sysv_fs_type = { .owner = THIS_MODULE, .name = "sysv", - .get_sb = sysv_get_sb, + .mount = sysv_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; @@ -550,7 +549,7 @@ static struct file_system_type sysv_fs_type = { static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, .name = "v7", - .get_sb = v7_get_sb, + .mount = v7_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 9a47c9f..91fac54 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2038,8 +2038,8 @@ static int sb_test(struct super_block *sb, void *data) return c->vi.cdev == *dev; } -static int ubifs_get_sb(struct file_system_type *fs_type, int flags, - const char *name, void *data, struct vfsmount *mnt) +static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, + const char *name, void *data) { struct ubi_volume_desc *ubi; struct ubi_volume_info vi; @@ -2057,7 +2057,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, if (IS_ERR(ubi)) { dbg_err("cannot open \"%s\", error %d", name, (int)PTR_ERR(ubi)); - return PTR_ERR(ubi); + return ERR_CAST(ubi); } ubi_get_volume_info(ubi, &vi); @@ -2095,20 +2095,19 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - simple_set_mnt(mnt, sb); - return 0; + return dget(sb->s_root); out_deact: deactivate_locked_super(sb); out_close: ubi_close_volume(ubi); - return err; + return ERR_PTR(err); } static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, - .get_sb = ubifs_get_sb, + .mount = ubifs_mount, .kill_sb = kill_anon_super, }; diff --git a/fs/udf/super.c b/fs/udf/super.c index 76f3d6d..4a5c7c6 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -107,17 +107,16 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) } /* UDF filesystem type */ -static int udf_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) +static struct dentry *udf_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super); } static struct file_system_type udf_fstype = { .owner = THIS_MODULE, .name = "udf", - .get_sb = udf_get_sb, + .mount = udf_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6b9be90..2c47dae 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1454,16 +1454,16 @@ static const struct super_operations ufs_super_ops = { .show_options = ufs_show_options, }; -static int ufs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *ufs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); + return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); } static struct file_system_type ufs_fs_type = { .owner = THIS_MODULE, .name = "ufs", - .get_sb = ufs_get_sb, + .mount = ufs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c9af48f..691f612 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -934,7 +934,6 @@ xfs_aops_discard_page( struct xfs_inode *ip = XFS_I(inode); struct buffer_head *bh, *head; loff_t offset = page_offset(page); - ssize_t len = 1 << inode->i_blkbits; if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; @@ -949,58 +948,14 @@ xfs_aops_discard_page( xfs_ilock(ip, XFS_ILOCK_EXCL); bh = head = page_buffers(page); do { - int done; - xfs_fileoff_t offset_fsb; - xfs_bmbt_irec_t imap; - int nimaps = 1; int error; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + xfs_fileoff_t start_fsb; if (!buffer_delay(bh)) goto next_buffer; - offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi(NULL, ip, offset_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, - "page discard failed delalloc mapping lookup."); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_buffer; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_buffer; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/flist pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_bmap_init(&flist, &firstblock); - error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, - &flist, &done); - - ASSERT(!flist.xbf_count && !flist.xbf_first); + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); if (error) { /* something screwed, just bail */ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { @@ -1010,7 +965,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += len; + offset += 1 << inode->i_blkbits; } while ((bh = bh->b_this_page) != head); @@ -1111,11 +1066,12 @@ xfs_vm_writepage( uptodate = 0; /* - * A hole may still be marked uptodate because discard_buffer - * leaves the flag set. + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. */ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } @@ -1504,11 +1460,42 @@ xfs_vm_write_failed( struct inode *inode = mapping->host; if (to > inode->i_size) { - struct iattr ia = { - .ia_valid = ATTR_SIZE | ATTR_FORCE, - .ia_size = inode->i_size, - }; - xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); + /* + * punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we jus truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. + */ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + truncate_pagecache(inode, to, inode->i_size); + + /* + * Check if there are any blocks that are outside of i_size + * that need to be trimmed back. + */ + start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; + end_fsb = XFS_B_TO_FSB(ip->i_mount, to); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); } } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 63fd2c0..4c5deb6 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -488,29 +488,16 @@ found: spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); - /* Attempt to get the semaphore without sleeping, - * if this does not work then we need to drop the - * spinlock and do a hard attempt on the semaphore. - */ - if (down_trylock(&bp->b_sema)) { + if (xfs_buf_cond_lock(bp)) { + /* failed, so wait for the lock if requested. */ if (!(flags & XBF_TRYLOCK)) { - /* wait for buffer ownership */ xfs_buf_lock(bp); XFS_STATS_INC(xb_get_locked_waited); } else { - /* We asked for a trylock and failed, no need - * to look at file offset and length here, we - * know that this buffer at least overlaps our - * buffer and is locked, therefore our buffer - * either does not exist, or is this buffer. - */ xfs_buf_rele(bp); XFS_STATS_INC(xb_busy_locked); return NULL; } - } else { - /* trylock worked */ - XB_SET_OWNER(bp); } if (bp->b_flags & XBF_STALE) { @@ -876,10 +863,18 @@ xfs_buf_rele( */ /* - * Locks a buffer object, if it is not already locked. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. + * Locks a buffer object, if it is not already locked. Note that this in + * no way locks the underlying pages, so it is only useful for + * synchronizing concurrent use of buffer objects, not for synchronizing + * independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. */ int xfs_buf_cond_lock( @@ -890,6 +885,8 @@ xfs_buf_cond_lock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); + else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_cond_lock(bp, _RET_IP_); return locked ? 0 : -EBUSY; @@ -1781,7 +1778,6 @@ xfs_buf_delwri_split( INIT_LIST_HEAD(list); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { - trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { @@ -1795,6 +1791,7 @@ xfs_buf_delwri_split( _XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); } else skipped++; } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2ea238f6..ad442d9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -416,7 +416,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) goto out_dput; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 96107ef..94d5fd6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -762,7 +762,8 @@ xfs_setup_inode( inode->i_state = I_NEW; inode_sb_list_add(inode); - insert_inode_hash(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index cf80878..064f964 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -353,9 +353,6 @@ xfs_parseargs( mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; - cmn_err(CE_WARN, - "Enabling EXPERIMENTAL delayed logging feature " - "- use at your own risk.\n"); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { @@ -1609,16 +1606,14 @@ xfs_fs_fill_super( goto out_free_sb; } -STATIC int -xfs_fs_get_sb( +STATIC struct dentry * +xfs_fs_mount( struct file_system_type *fs_type, int flags, const char *dev_name, - void *data, - struct vfsmount *mnt) + void *data) { - return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, - mnt); + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); } static const struct super_operations xfs_super_operations = { @@ -1639,7 +1634,7 @@ static const struct super_operations xfs_super_operations = { static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", - .get_sb = xfs_fs_get_sb, + .mount = xfs_fs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 37d3325..afb0d7c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -853,6 +853,7 @@ restart: if (trylock) { if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { skipped++; + xfs_perag_put(pag); continue; } first_index = pag->pag_ici_reclaim_cursor; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8abd12e..4111cd3 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5471,8 +5471,13 @@ xfs_getbmap( if (error) goto out_unlock_iolock; } - - ASSERT(ip->i_delayed_blks == 0); + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ } lock = xfs_ilock_map_shared(ip); @@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves( *count += xfs_bmbt_disk_get_blockcount(frp); } } + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will alays punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, start_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 71ec9b6..3651191 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -394,6 +394,11 @@ xfs_bmap_count_blocks( int whichfork, int *count); +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c..e60490b 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -377,6 +377,19 @@ xfs_swap_extents( ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + ilf_fields = XFS_ILOG_CORE; switch(ip->i_d.di_format) { diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed99902..c78cc6a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,6 +58,7 @@ xfs_error_trap(int e) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; int xfs_error_test(int error_tag, int *fsidp, char *expression, @@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) len = strlen(mp->m_fsname); xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; return 0; } } @@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) xfs_etest_fsid[i] = 0LL; kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; } } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c2c1a07..f338847 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level, #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #ifdef DEBUG +extern int xfs_error_test_active; extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); #define XFS_NUM_INJECT_ERROR 10 #define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ + ((expr) || (xfs_error_test_active && \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf))) + (rf)))) extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9b715dc..9124425 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -744,9 +744,15 @@ xfs_filestream_new_ag( * If the file's parent directory is known, take its iolock in exclusive * mode to prevent two sibling files from racing each other to migrate * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. */ if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL); + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); /* * A new AG needs to be found for the file. If the file's parent diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c7ac020..7c8d30c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -657,18 +657,37 @@ xfs_inode_item_unlock( } /* - * This is called to find out where the oldest active copy of the - * inode log item in the on disk log resides now that the last log - * write of it completed at the given lsn. Since we always re-log - * all dirty data in an inode, the latest copy in the on disk log - * is the only one that matters. Therefore, simply return the - * given lsn. + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completions before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, return a lower LSN than the one passed in so that the + * transaction committed code will not move the inode forward in the AIL but + * will still unpin it properly. */ STATIC xfs_lsn_t xfs_inode_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) + return lsn - 1; return lsn; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b1498ab..19e9dfa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -275,6 +275,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0e64b1..9bb6eda 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) -#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) -#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} #define xfs_qm_vop_create_dqattach(tp, ip, u, g) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -#define xfs_qm_sync(mp, fl) (0) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) -#define xfs_qm_unmount_quotas(mp) (0) +#define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ |