diff options
Diffstat (limited to 'fs')
160 files changed, 2523 insertions, 1523 deletions
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d5c1401..d34896c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -980,19 +980,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, } } -static DEFINE_MUTEX(autofs4_ioctl_mutex); - static long autofs4_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { - long ret; struct inode *inode = filp->f_dentry->d_inode; - - mutex_lock(&autofs4_ioctl_mutex); - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); - mutex_unlock(&autofs4_ioctl_mutex); - - return ret; + return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT @@ -1002,13 +994,11 @@ static long autofs4_root_compat_ioctl(struct file *filp, struct inode *inode = filp->f_path.dentry->d_inode; int ret; - mutex_lock(&autofs4_ioctl_mutex); if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); else ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, (unsigned long)compat_ptr(arg)); - mutex_unlock(&autofs4_ioctl_mutex); return ret; } @@ -370,6 +370,9 @@ struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) { struct bio *bio; + if (nr_iovecs > UIO_MAXIOV) + return NULL; + bio = kmalloc(sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec), gfp_mask); if (unlikely(!bio)) @@ -697,8 +700,12 @@ static void bio_free_map_data(struct bio_map_data *bmd) static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); + struct bio_map_data *bmd; + if (iov_count > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; @@ -827,6 +834,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; len += iov[i].iov_len; } @@ -955,6 +968,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = uaddr >> PAGE_SHIFT; + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + nr_pages += end - start; /* * buffer must be aligned to at least hardsector size for now @@ -982,7 +1001,7 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, unsigned long start = uaddr >> PAGE_SHIFT; const int local_nr_pages = end - start; const int page_limit = cur_page + local_nr_pages; - + ret = get_user_pages_fast(uaddr, local_nr_pages, write_to_vm, &pages[cur_page]); if (ret < local_nr_pages) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 06e8ff1..4230252 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> -#include <linux/smp_lock.h> #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7845d1f..b50bc4b 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -91,23 +91,10 @@ static inline int compressed_bio_size(struct btrfs_root *root, static struct bio *compressed_bio_alloc(struct block_device *bdev, u64 first_byte, gfp_t gfp_flags) { - struct bio *bio; int nr_vecs; nr_vecs = bio_get_nr_vecs(bdev); - bio = bio_alloc(gfp_flags, nr_vecs); - - if (bio == NULL && (current->flags & PF_MEMALLOC)) { - while (!bio && (nr_vecs /= 2)) - bio = bio_alloc(gfp_flags, nr_vecs); - } - - if (bio) { - bio->bi_size = 0; - bio->bi_bdev = bdev; - bio->bi_sector = first_byte >> 9; - } - return bio; + return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags); } static int check_compressed_csum(struct inode *inode, diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8db9234..af52f6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -808,9 +808,9 @@ struct btrfs_block_group_cache { int extents_thresh; int free_extents; int total_bitmaps; - int ro:1; - int dirty:1; - int iref:1; + unsigned int ro:1; + unsigned int dirty:1; + unsigned int iref:1; int disk_cache_state; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index fb827d0..51d2e4d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -28,6 +28,7 @@ #include <linux/freezer.h> #include <linux/crc32c.h> #include <linux/slab.h> +#include <linux/migrate.h> #include "compat.h" #include "ctree.h" #include "disk-io.h" @@ -355,6 +356,8 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, btrfs_header_generation(eb)); BUG_ON(ret); + WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)); + found_start = btrfs_header_bytenr(eb); if (found_start != start) { WARN_ON(1); @@ -693,6 +696,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, __btree_submit_bio_done); } +#ifdef CONFIG_MIGRATION +static int btree_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + /* + * we can't safely write a btree page from here, + * we haven't done the locking hook + */ + if (PageDirty(page)) + return -EAGAIN; + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + return migrate_page(mapping, newpage, page); +} +#endif + static int btree_writepage(struct page *page, struct writeback_control *wbc) { struct extent_io_tree *tree; @@ -707,8 +731,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc) } redirty_page_for_writepage(wbc, page); - eb = btrfs_find_tree_block(root, page_offset(page), - PAGE_CACHE_SIZE); + eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE); WARN_ON(!eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); @@ -799,6 +822,9 @@ static const struct address_space_operations btree_aops = { .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, .sync_page = block_sync_page, +#ifdef CONFIG_MIGRATION + .migratepage = btree_migratepage, +#endif }; int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -981,7 +1007,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root, blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), blocksize, generation); - BUG_ON(!root->node); + if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { + free_extent_buffer(root->node); + return -EIO; + } root->commit_root = btrfs_root_node(root); return 0; } @@ -1538,10 +1567,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, GFP_NOFS); struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); - struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), - GFP_NOFS); - struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), - GFP_NOFS); + struct btrfs_root *tree_root = btrfs_sb(sb); + struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 951ef09..659f532 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -166,7 +166,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { struct inode *dir = child->d_inode; - static struct dentry *dentry; + struct dentry *dentry; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -232,9 +232,85 @@ fail: return ERR_PTR(ret); } +static int btrfs_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *inode = child->d_inode; + struct inode *dir = parent->d_inode; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode_ref *iref; + struct btrfs_root_ref *rref; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + int name_len; + int ret; + + if (!dir || !inode) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode)) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = BTRFS_I(inode)->root->root_key.objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + root = root->fs_info->tree_root; + } else { + key.objectid = inode->i_ino; + key.offset = dir->i_ino; + key.type = BTRFS_INODE_REF_KEY; + } + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } else if (ret > 0) { + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + path->slots[0]--; + } else { + btrfs_free_path(path); + return -ENOENT; + } + } + leaf = path->nodes[0]; + + if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + rref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); + name_ptr = (unsigned long)(rref + 1); + name_len = btrfs_root_ref_name_len(leaf, rref); + } else { + iref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_ref); + name_ptr = (unsigned long)(iref + 1); + name_len = btrfs_inode_ref_name_len(leaf, iref); + } + + read_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_free_path(path); + + /* + * have to add the null termination to make sure that reconnect_path + * gets the right len for strlen + */ + name[name_len] = '\0'; + + return 0; +} + const struct export_operations btrfs_export_ops = { .encode_fh = btrfs_encode_fh, .fh_to_dentry = btrfs_fh_to_dentry, .fh_to_parent = btrfs_fh_to_parent, .get_parent = btrfs_get_parent, + .get_name = btrfs_get_name, }; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0c097f3..227e581 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -429,6 +429,7 @@ err: static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_trans_handle *trans, + struct btrfs_root *root, int load_cache_only) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -442,9 +443,12 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, /* * We can't do the read from on-disk cache during a commit since we need - * to have the normal tree locking. + * to have the normal tree locking. Also if we are currently trying to + * allocate blocks for the tree root we can't do the fast caching since + * we likely hold important locks. */ - if (!trans->transaction->in_commit) { + if (!trans->transaction->in_commit && + (root && root != root->fs_info->tree_root)) { spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { spin_unlock(&cache->lock); @@ -2741,6 +2745,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct btrfs_root *root = block_group->fs_info->tree_root; struct inode *inode = NULL; u64 alloc_hint = 0; + int dcs = BTRFS_DC_ERROR; int num_pages = 0; int retries = 0; int ret = 0; @@ -2795,6 +2800,8 @@ again: spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED) { + /* We're not cached, don't bother trying to write stuff out */ + dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); goto out_put; } @@ -2821,6 +2828,8 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, num_pages, num_pages, &alloc_hint); + if (!ret) + dcs = BTRFS_DC_SETUP; btrfs_free_reserved_data_space(inode, num_pages); out_put: iput(inode); @@ -2828,10 +2837,7 @@ out_free: btrfs_release_path(root, path); out: spin_lock(&block_group->lock); - if (ret) - block_group->disk_cache_state = BTRFS_DC_ERROR; - else - block_group->disk_cache_state = BTRFS_DC_SETUP; + block_group->disk_cache_state = dcs; spin_unlock(&block_group->lock); return ret; @@ -3037,7 +3043,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + u64 num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; if (num_devices == 1) flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); @@ -3412,7 +3424,7 @@ again: * our reservation. */ if (unused <= space_info->total_bytes) { - unused -= space_info->total_bytes; + unused = space_info->total_bytes - unused; if (unused >= num_bytes) { if (!reserved) space_info->bytes_reserved += orig_bytes; @@ -4080,7 +4092,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, * space back to the block group, otherwise we will leak space. */ if (!alloc && cache->cached == BTRFS_CACHE_NO) - cache_block_group(cache, trans, 1); + cache_block_group(cache, trans, NULL, 1); byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -4930,11 +4942,31 @@ search: btrfs_get_block_group(block_group); search_start = block_group->key.objectid; + /* + * this can happen if we end up cycling through all the + * raid types, but we want to make sure we only allocate + * for the proper type. + */ + if (!block_group_bits(block_group, data)) { + u64 extra = BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10; + + /* + * if they asked for extra copies and this block group + * doesn't provide them, bail. This does allow us to + * fill raid0 from raid1. + */ + if ((data & extra) && !(block_group->flags & extra)) + goto loop; + } + have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { u64 free_percent; - ret = cache_block_group(block_group, trans, 1); + ret = cache_block_group(block_group, trans, + orig_root, 1); if (block_group->cached == BTRFS_CACHE_FINISHED) goto have_block_group; @@ -4958,7 +4990,8 @@ have_block_group: if (loop > LOOP_CACHING_NOWAIT || (loop > LOOP_FIND_IDEAL && atomic_read(&space_info->caching_threads) < 2)) { - ret = cache_block_group(block_group, trans, 0); + ret = cache_block_group(block_group, trans, + orig_root, 0); BUG_ON(ret); } found_uncached_bg = true; @@ -5515,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, u64 num_bytes = ins->offset; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - cache_block_group(block_group, trans, 0); + cache_block_group(block_group, trans, NULL, 0); caching_ctl = get_caching_control(block_group); if (!caching_ctl) { @@ -6300,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, NULL, NULL); BUG_ON(ret < 0); if (ret > 0) { - ret = btrfs_del_orphan_item(trans, tree_root, - root->root_key.objectid); - BUG_ON(ret); + /* if we fail to delete the orphan item this time + * around, it'll get picked up the next time. + * + * The most common failure here is just -ENOENT. + */ + btrfs_del_orphan_item(trans, tree_root, + root->root_key.objectid); } } @@ -7878,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; - num_devices = root->fs_info->fs_devices->rw_devices; + /* + * we add in the count of missing devices because we want + * to make sure that any RAID levels on a degraded FS + * continue to be honored. + */ + num_devices = root->fs_info->fs_devices->rw_devices + + root->fs_info->fs_devices->missing_devices; + if (num_devices == 1) { stripped |= BTRFS_BLOCK_GROUP_DUP; stripped = flags & ~stripped; @@ -8247,7 +8291,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; - leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); cache = kzalloc(sizeof(*cache), GFP_NOFS); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eac10e3..3e86b9f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1828,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err) bio_put(bio); } -static struct bio * -extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, - gfp_t gfp_flags) +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) { struct bio *bio; @@ -1919,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, else nr = bio_get_nr_vecs(bdev); - bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; @@ -2901,21 +2901,53 @@ out: int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { - int ret; + int ret = 0; u64 off = start; u64 max = start + len; u32 flags = 0; + u32 found_type; + u64 last; u64 disko = 0; + struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; + struct btrfs_path *path; + struct btrfs_file_extent_item *item; int end = 0; u64 em_start = 0, em_len = 0; unsigned long emflags; - ret = 0; + int hole = 0; if (len == 0) return -EINVAL; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->leave_spinning = 1; + + ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, + path, inode->i_ino, -1, 0); + if (ret < 0) { + btrfs_free_path(path); + return ret; + } + WARN_ON(!ret); + path->slots[0]--; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + + /* No extents, just return */ + if (found_key.objectid != inode->i_ino || + found_type != BTRFS_EXTENT_DATA_KEY) { + btrfs_free_path(path); + return 0; + } + last = found_key.offset; + btrfs_free_path(path); + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); em = get_extent(inode, NULL, 0, off, max - off, 0); @@ -2925,11 +2957,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = PTR_ERR(em); goto out; } + while (!end) { + hole = 0; off = em->start + em->len; if (off >= max) end = 1; + if (em->block_start == EXTENT_MAP_HOLE) { + hole = 1; + goto next; + } + em_start = em->start; em_len = em->len; @@ -2939,8 +2978,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - } else if (em->block_start == EXTENT_MAP_HOLE) { - flags |= FIEMAP_EXTENT_UNWRITTEN; } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); @@ -2953,10 +2990,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; +next: emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { em = get_extent(inode, NULL, 0, off, max - off, 0); if (!em) @@ -2967,15 +3004,23 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } emflags = em->flags; } + if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; + if (em_start == last) { + flags |= FIEMAP_EXTENT_LAST; + end = 1; + } + + if (!hole) { + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; + } } out_free: free_extent_map(em); @@ -3836,8 +3881,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) spin_lock(&tree->buffer_lock); eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); - if (!eb) - goto out; + if (!eb) { + spin_unlock(&tree->buffer_lock); + return ret; + } if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { ret = 0; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 1c6d4f3..4183c81 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -310,4 +310,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, struct extent_io_tree *tree, u64 start, u64 end, struct page *locked_page, unsigned long op); +struct bio * +btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags); #endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e354c33..66836d8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -48,30 +48,34 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page **prepared_pages, struct iov_iter *i) { - size_t copied; + size_t copied = 0; int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); + int total_copied = 0; while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); struct page *page = prepared_pages[pg]; -again: - if (unlikely(iov_iter_fault_in_readable(i, count))) - return -EFAULT; - - /* Copy data from userspace to the current page */ - copied = iov_iter_copy_from_user(page, i, offset, count); + /* + * Copy data from userspace to the current page + * + * Disable pagefault to avoid recursive lock since + * the pages are already locked + */ + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, count); + pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); iov_iter_advance(i, copied); write_bytes -= copied; + total_copied += copied; + /* Return to btrfs_file_aio_write to fault page */ if (unlikely(copied == 0)) { - count = min_t(size_t, PAGE_CACHE_SIZE - offset, - iov_iter_single_seg_count(i)); - goto again; + break; } if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { @@ -81,7 +85,7 @@ again: offset = 0; } } - return 0; + return total_copied; } /* @@ -854,6 +858,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, unsigned long last_index; int will_write; int buffered = 0; + int copied = 0; + int dirty_pages = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); @@ -970,7 +976,17 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_delalloc_reserve_space(inode, write_bytes); + /* + * Fault pages before locking them in prepare_pages + * to avoid recursive lock + */ + if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { + ret = -EFAULT; + goto out; + } + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); if (ret) goto out; @@ -978,37 +994,49 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_delalloc_release_space(inode, write_bytes); + btrfs_delalloc_release_space(inode, + num_pages << PAGE_CACHE_SHIFT); goto out; } - ret = btrfs_copy_from_user(pos, num_pages, + copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - if (ret == 0) { + dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + if (num_pages > dirty_pages) { + if (copied > 0) + atomic_inc( + &BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - dirty_pages) << + PAGE_CACHE_SHIFT); + } + + if (copied > 0) { dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); + dirty_pages, pos, copied); } btrfs_drop_pages(pages, num_pages); - if (ret) { - btrfs_delalloc_release_space(inode, write_bytes); - goto out; - } - if (will_write) { - filemap_fdatawrite_range(inode->i_mapping, pos, - pos + write_bytes - 1); - } else { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - num_pages); - if (num_pages < - (root->leafsize >> PAGE_CACHE_SHIFT) + 1) - btrfs_btree_balance_dirty(root, 1); - btrfs_throttle(root); + if (copied > 0) { + if (will_write) { + filemap_fdatawrite_range(inode->i_mapping, pos, + pos + copied - 1); + } else { + balance_dirty_pages_ratelimited_nr( + inode->i_mapping, + dirty_pages); + if (dirty_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } } - pos += write_bytes; - num_written += write_bytes; + pos += copied; + num_written += copied; cond_resched(); } @@ -1047,8 +1075,14 @@ out: if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + num_written = PTR_ERR(trans); + goto done; + } + mutex_lock(&inode->i_mutex); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + mutex_unlock(&inode->i_mutex); if (ret == 0) { ret = btrfs_sync_log(trans, root); if (ret == 0) @@ -1067,6 +1101,7 @@ out: (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); } } +done: current->backing_dev_info = NULL; return num_written ? num_written : err; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 22ee0dc..60d6842 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -290,7 +290,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, (unsigned long long)BTRFS_I(inode)->generation, (unsigned long long)generation, (unsigned long long)block_group->key.objectid); - goto out; + goto free_cache; } if (!num_entries) @@ -524,6 +524,12 @@ int btrfs_write_out_cache(struct btrfs_root *root, return 0; } + node = rb_first(&block_group->free_space_offset); + if (!node) { + iput(inode); + return 0; + } + last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; filemap_write_and_wait(inode->i_mapping); btrfs_wait_ordered_range(inode, inode->i_size & @@ -543,10 +549,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, */ first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); - node = rb_first(&block_group->free_space_offset); - if (!node) - goto out_free; - /* * Lock all pages first so we can lock the extent safely. * diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 558cac2..72f31ec 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -495,7 +495,7 @@ again: add_async_extent(async_cow, start, num_bytes, total_compressed, pages, nr_pages_ret); - if (start + num_bytes < end && start + num_bytes < actual_end) { + if (start + num_bytes < end) { start += num_bytes; pages = NULL; cond_resched(); @@ -4501,6 +4501,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + inode->i_generation = BTRFS_I(inode)->generation; btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) @@ -4622,12 +4623,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct dentry *dentry, struct inode *inode, - int backref, u64 index) + struct inode *dir, struct dentry *dentry, + struct inode *inode, int backref, u64 index) { - int err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, backref, index); + int err = btrfs_add_link(trans, dir, inode, + dentry->d_name.name, dentry->d_name.len, + backref, index); if (!err) { d_instantiate(dentry, inode); return 0; @@ -4668,8 +4669,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) @@ -4682,7 +4682,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4730,10 +4730,8 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, - objectid, BTRFS_I(dir)->block_group, mode, - &index); + dentry->d_name.len, dir->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_unlock; @@ -4745,7 +4743,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -4787,6 +4785,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -EPERM; btrfs_inc_nlink(inode); + inode->i_ctime = CURRENT_TIME; err = btrfs_set_inode_index(dir, &index); if (err) @@ -4805,15 +4804,17 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, btrfs_set_trans_block_group(trans, dir); ihold(inode); - err = btrfs_add_nondir(trans, dentry, inode, 1, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); if (err) { drop_inode = 1; } else { + struct dentry *parent = dget_parent(dentry); btrfs_update_inode_block_group(trans, dir); err = btrfs_update_inode(trans, root, inode); BUG_ON(err); - btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); + btrfs_log_new_name(trans, inode, NULL, parent); + dput(parent); } nr = trans->blocks_used; @@ -4853,8 +4854,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFDIR | mode, &index); if (IS_ERR(inode)) { @@ -4877,9 +4877,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (err) goto out_fail; - err = btrfs_add_link(trans, dentry->d_parent->d_inode, - inode, dentry->d_name.name, - dentry->d_name.len, 0, index); + err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, + dentry->d_name.len, 0, index); if (err) goto out_fail; @@ -5535,13 +5534,21 @@ struct btrfs_dio_private { u64 bytes; u32 *csums; void *private; + + /* number of bios pending for this dio */ + atomic_t pending_bios; + + /* IO errors */ + int errors; + + struct bio *orig_bio; }; static void btrfs_endio_direct_read(struct bio *bio, int err) { + struct btrfs_dio_private *dip = bio->bi_private; struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; struct bio_vec *bvec = bio->bi_io_vec; - struct btrfs_dio_private *dip = bio->bi_private; struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; u64 start; @@ -5595,15 +5602,18 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered = NULL; struct extent_state *cached_state = NULL; + u64 ordered_offset = dip->logical_offset; + u64 ordered_bytes = dip->bytes; int ret; if (err) goto out_done; - - ret = btrfs_dec_test_ordered_pending(inode, &ordered, - dip->logical_offset, dip->bytes); +again: + ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, + &ordered_offset, + ordered_bytes); if (!ret) - goto out_done; + goto out_test; BUG_ON(!ordered); @@ -5663,8 +5673,20 @@ out_unlock: out: btrfs_delalloc_release_metadata(inode, ordered->len); btrfs_end_transaction(trans, root); + ordered_offset = ordered->file_offset + ordered->len; btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); + +out_test: + /* + * our bio might span multiple ordered extents. If we haven't + * completed the accounting for the whole dio, go back and try again + */ + if (ordered_offset < dip->logical_offset + dip->bytes) { + ordered_bytes = dip->logical_offset + dip->bytes - + ordered_offset; + goto again; + } out_done: bio->bi_private = dip->private; @@ -5684,6 +5706,176 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, return 0; } +static void btrfs_end_dio_bio(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + + if (err) { + printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu " + "sector %#Lx len %u err no %d\n", + dip->inode->i_ino, bio->bi_rw, + (unsigned long long)bio->bi_sector, bio->bi_size, err); + dip->errors = 1; + + /* + * before atomic variable goto zero, we must make sure + * dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + } + + /* if there are more bios still pending for this dio, just exit */ + if (!atomic_dec_and_test(&dip->pending_bios)) + goto out; + + if (dip->errors) + bio_io_error(dip->orig_bio); + else { + set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags); + bio_endio(dip->orig_bio, 0); + } +out: + bio_put(bio); +} + +static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, + u64 first_sector, gfp_t gfp_flags) +{ + int nr_vecs = bio_get_nr_vecs(bdev); + return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); +} + +static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + int rw, u64 file_offset, int skip_sum, + u32 *csums) +{ + int write = rw & REQ_WRITE; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + bio_get(bio); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(root->fs_info, + inode, rw, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + goto err; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + file_offset, csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); +err: + bio_put(bio); + return ret; +} + +static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, + int skip_sum) +{ + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct bio *bio; + struct bio *orig_bio = dip->orig_bio; + struct bio_vec *bvec = orig_bio->bi_io_vec; + u64 start_sector = orig_bio->bi_sector; + u64 file_offset = dip->logical_offset; + u64 submit_len = 0; + u64 map_length; + int nr_pages = 0; + u32 *csums = dip->csums; + int ret = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); + if (!bio) + return -ENOMEM; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + atomic_inc(&dip->pending_bios); + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + return -EIO; + } + + while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { + if (unlikely(map_length < submit_len + bvec->bv_len || + bio_add_page(bio, bvec->bv_page, bvec->bv_len, + bvec->bv_offset) < bvec->bv_len)) { + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the dip might get freed + * before we're done setting it up + */ + atomic_inc(&dip->pending_bios); + ret = __btrfs_submit_dio_bio(bio, inode, rw, + file_offset, skip_sum, + csums); + if (ret) { + bio_put(bio); + atomic_dec(&dip->pending_bios); + goto out_err; + } + + if (!skip_sum) + csums = csums + nr_pages; + start_sector += submit_len >> 9; + file_offset += submit_len; + + submit_len = 0; + nr_pages = 0; + + bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, + start_sector, GFP_NOFS); + if (!bio) + goto out_err; + bio->bi_private = dip; + bio->bi_end_io = btrfs_end_dio_bio; + + map_length = orig_bio->bi_size; + ret = btrfs_map_block(map_tree, READ, start_sector << 9, + &map_length, NULL, 0); + if (ret) { + bio_put(bio); + goto out_err; + } + } else { + submit_len += bvec->bv_len; + nr_pages ++; + bvec++; + } + } + + ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, + csums); + if (!ret) + return 0; + + bio_put(bio); +out_err: + dip->errors = 1; + /* + * before atomic variable goto zero, we must + * make sure dip->errors is perceived to be set. + */ + smp_mb__before_atomic_dec(); + if (atomic_dec_and_test(&dip->pending_bios)) + bio_io_error(dip->orig_bio); + + /* bio_end_io() will handle error, so we needn't return it */ + return 0; +} + static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, loff_t file_offset) { @@ -5723,36 +5915,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, dip->disk_bytenr = (u64)bio->bi_sector << 9; bio->bi_private = dip; + dip->errors = 0; + dip->orig_bio = bio; + atomic_set(&dip->pending_bios, 0); if (write) bio->bi_end_io = btrfs_endio_direct_write; else bio->bi_end_io = btrfs_endio_direct_read; - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); - if (ret) - goto out_err; - - if (write && !skip_sum) { - ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, 0, 0, - dip->logical_offset, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); - if (ret) - goto out_err; + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); + if (!ret) return; - } else if (!skip_sum) - btrfs_lookup_bio_sums_dio(root, inode, bio, - dip->logical_offset, dip->csums); - - ret = btrfs_map_bio(root, rw, bio, 0, 1); - if (ret) - goto out_err; - return; -out_err: - kfree(dip->csums); - kfree(dip); free_ordered: /* * If this is a write, we need to clean up the reserved space and kill @@ -5760,8 +5934,7 @@ free_ordered: */ if (write) { struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, - dip->logical_offset); + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) btrfs_free_reserved_extent(root, ordered->start, @@ -6607,8 +6780,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(ret); if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { - btrfs_log_new_name(trans, old_inode, old_dir, - new_dentry->d_parent); + struct dentry *parent = dget_parent(new_dentry); + btrfs_log_new_name(trans, old_inode, old_dir, parent); + dput(parent); btrfs_end_log_trans(root); } out_fail: @@ -6758,8 +6932,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_set_trans_block_group(trans, dir); inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, - dentry->d_parent->d_inode->i_ino, objectid, + dentry->d_name.len, dir->i_ino, objectid, BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, &index); err = PTR_ERR(inode); @@ -6773,7 +6946,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, } btrfs_set_trans_block_group(trans, inode); - err = btrfs_add_nondir(trans, dentry, inode, 0, index); + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) drop_inode = 1; else { @@ -6844,6 +7017,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; + u64 i_size; int ret = 0; bool own_trans = true; @@ -6885,11 +7059,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size_write(inode, actual_len); + i_size = actual_len; else - i_size_write(inode, cur_offset); - i_size_write(inode, cur_offset); - btrfs_ordered_update_i_size(inode, cur_offset, NULL); + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); } ret = btrfs_update_inode(trans, root, inode); @@ -6943,6 +7117,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); mutex_lock(&inode->i_mutex); + ret = inode_newsize_ok(inode, alloc_end); + if (ret) + goto out; + if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); if (ret) @@ -7139,6 +7317,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .getattr = btrfs_getattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, .getxattr = btrfs_getxattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 463d91b..f87552a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -233,7 +233,8 @@ static noinline int create_subvol(struct btrfs_root *root, struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; struct btrfs_root *new_root; - struct inode *dir = dentry->d_parent->d_inode; + struct dentry *parent = dget_parent(dentry); + struct inode *dir; int ret; int err; u64 objectid; @@ -242,8 +243,13 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 0, &objectid); - if (ret) + if (ret) { + dput(parent); return ret; + } + + dir = parent->d_inode; + /* * 1 - inode item * 2 - refs @@ -251,8 +257,10 @@ static noinline int create_subvol(struct btrfs_root *root, * 2 - dir items */ trans = btrfs_start_transaction(root, 6); - if (IS_ERR(trans)) + if (IS_ERR(trans)) { + dput(parent); return PTR_ERR(trans); + } leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -339,6 +347,7 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: + dput(parent); if (async_transid) { *async_transid = trans->transid; err = btrfs_commit_transaction_async(trans, root, 1); @@ -354,6 +363,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen, u64 *async_transid) { struct inode *inode; + struct dentry *parent; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; int ret; @@ -396,7 +406,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, btrfs_orphan_cleanup(pending_snapshot->snap); - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + parent = dget_parent(dentry); + inode = btrfs_lookup_dentry(parent->d_inode, dentry); + dput(parent); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -935,23 +947,42 @@ out: static noinline int btrfs_ioctl_snap_create(struct file *file, void __user *arg, int subvol, - int async) + int v2) { struct btrfs_ioctl_vol_args *vol_args = NULL; - struct btrfs_ioctl_async_vol_args *async_vol_args = NULL; + struct btrfs_ioctl_vol_args_v2 *vol_args_v2 = NULL; char *name; u64 fd; - u64 transid = 0; int ret; - if (async) { - async_vol_args = memdup_user(arg, sizeof(*async_vol_args)); - if (IS_ERR(async_vol_args)) - return PTR_ERR(async_vol_args); + if (v2) { + u64 transid = 0; + u64 *ptr = NULL; + + vol_args_v2 = memdup_user(arg, sizeof(*vol_args_v2)); + if (IS_ERR(vol_args_v2)) + return PTR_ERR(vol_args_v2); + + if (vol_args_v2->flags & ~BTRFS_SUBVOL_CREATE_ASYNC) { + ret = -EINVAL; + goto out; + } + + name = vol_args_v2->name; + fd = vol_args_v2->fd; + vol_args_v2->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; + + if (vol_args_v2->flags & BTRFS_SUBVOL_CREATE_ASYNC) + ptr = &transid; + + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, ptr); - name = async_vol_args->name; - fd = async_vol_args->fd; - async_vol_args->name[BTRFS_SNAPSHOT_NAME_MAX] = '\0'; + if (ret == 0 && ptr && + copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), ptr, sizeof(*ptr))) + ret = -EFAULT; } else { vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) @@ -959,20 +990,13 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, name = vol_args->name; fd = vol_args->fd; vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - } - - ret = btrfs_ioctl_snap_create_transid(file, name, fd, - subvol, &transid); - if (!ret && async) { - if (copy_to_user(arg + - offsetof(struct btrfs_ioctl_async_vol_args, - transid), &transid, sizeof(transid))) - return -EFAULT; + ret = btrfs_ioctl_snap_create_transid(file, name, fd, + subvol, NULL); } - +out: kfree(vol_args); - kfree(async_vol_args); + kfree(vol_args_v2); return ret; } @@ -1669,12 +1693,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, olen = len = src->i_size - off; /* if we extend to eof, continue to block boundary */ if (off + len == src->i_size) - len = ((src->i_size + bs-1) & ~(bs-1)) - - off; + len = ALIGN(src->i_size, bs) - off; /* verify the end result is block aligned */ - if ((off & (bs-1)) || - ((off + len) & (bs-1))) + if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || + !IS_ALIGNED(destoff, bs)) goto out_unlock; /* do any pending delalloc/csum calc on src, one way or @@ -1874,8 +1897,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * but shouldn't round up the file size */ endoff = new_key.offset + datal; - if (endoff > off+olen) - endoff = off+olen; + if (endoff > destoff+olen) + endoff = destoff+olen; if (endoff > inode->i_size) btrfs_i_size_write(inode, endoff); @@ -2235,7 +2258,7 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_getversion(file, argp); case BTRFS_IOC_SNAP_CREATE: return btrfs_ioctl_snap_create(file, argp, 0, 0); - case BTRFS_IOC_SNAP_CREATE_ASYNC: + case BTRFS_IOC_SNAP_CREATE_V2: return btrfs_ioctl_snap_create(file, argp, 0, 1); case BTRFS_IOC_SUBVOL_CREATE: return btrfs_ioctl_snap_create(file, argp, 1, 0); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index 17c99eb..c344d12 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -30,11 +30,15 @@ struct btrfs_ioctl_vol_args { char name[BTRFS_PATH_NAME_MAX + 1]; }; -#define BTRFS_SNAPSHOT_NAME_MAX 4079 -struct btrfs_ioctl_async_vol_args { +#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) + +#define BTRFS_SUBVOL_NAME_MAX 4039 +struct btrfs_ioctl_vol_args_v2 { __s64 fd; __u64 transid; - char name[BTRFS_SNAPSHOT_NAME_MAX + 1]; + __u64 flags; + __u64 unused[4]; + char name[BTRFS_SUBVOL_NAME_MAX + 1]; }; #define BTRFS_INO_LOOKUP_PATH_MAX 4080 @@ -187,6 +191,6 @@ struct btrfs_ioctl_space_args { struct btrfs_ioctl_space_args) #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) -#define BTRFS_IOC_SNAP_CREATE_ASYNC _IOW(BTRFS_IOCTL_MAGIC, 23, \ - struct btrfs_ioctl_async_vol_args) +#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ + struct btrfs_ioctl_vol_args_v2) #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f4621f6..ae7737e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -250,6 +250,73 @@ int btrfs_add_ordered_sum(struct inode *inode, /* * this is used to account for finished IO across a given range + * of the file. The IO may span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + * + * file_offset is updated to one byte past the range that is recorded as + * complete. This allows you to walk forward in the file. + */ +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + int ret; + u64 dec_end; + u64 dec_start; + u64 to_dec; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, *file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, *file_offset)) { + ret = 1; + goto out; + } + + dec_start = max(*file_offset, entry->file_offset); + dec_end = min(*file_offset + io_size, entry->file_offset + + entry->len); + *file_offset = dec_end; + if (dec_start > dec_end) { + printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n", + (unsigned long long)dec_start, + (unsigned long long)dec_end); + } + to_dec = dec_end - dec_start; + if (to_dec > entry->bytes_left) { + printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n", + (unsigned long long)entry->bytes_left, + (unsigned long long)to_dec); + } + entry->bytes_left -= to_dec; + if (entry->bytes_left == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); + else + ret = 1; +out: + if (!ret && cached && entry) { + *cached = entry; + atomic_inc(&entry->refs); + } + spin_unlock(&tree->lock); + return ret == 0; +} + +/* + * this is used to account for finished IO across a given range * of the file. The IO should not span ordered extents. If * a given ordered_extent is completely done, 1 is returned, otherwise * 0. diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8ac3654..61dca83 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -141,6 +141,9 @@ int btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); +int btrfs_dec_test_first_ordered_pending(struct inode *inode, + struct btrfs_ordered_extent **cached, + u64 *file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 79cba5f..f8be250 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, return -ENOMEM; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret < 0) goto out; + if (ret) { + ret = -ENOENT; + goto out; + } ret = btrfs_del_item(trans, root, path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8299a25..883c6fa 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -244,6 +244,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_space_cache: printk(KERN_INFO "btrfs: enabling disk space caching\n"); btrfs_set_opt(info->mount_opt, SPACE_CACHE); + break; case Opt_clear_cache: printk(KERN_INFO "btrfs: force clearing of disk cache\n"); btrfs_set_opt(info->mount_opt, CLEAR_CACHE); @@ -562,12 +563,26 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) static int btrfs_test_super(struct super_block *s, void *data) { - struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *test_root = data; struct btrfs_root *root = btrfs_sb(s); - return root->fs_info->fs_devices == test_fs_devices; + /* + * If this super block is going away, return false as it + * can't match as an existing super block. + */ + if (!atomic_read(&s->s_active)) + return 0; + return root->fs_info->fs_devices == test_root->fs_info->fs_devices; +} + +static int btrfs_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + + return set_anon_super(s, data); } + /* * Find a superblock for the given device / mount point. * @@ -581,6 +596,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct super_block *s; struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_root *tree_root = NULL; + struct btrfs_fs_info *fs_info = NULL; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; @@ -608,8 +625,24 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, goto error_close_devices; } + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * it for searching for existing supers, so this lets us do that and + * then open_ctree will properly initialize everything later. + */ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); + if (!fs_info || !tree_root) { + error = -ENOMEM; + goto error_close_devices; + } + fs_info->tree_root = tree_root; + fs_info->fs_devices = fs_devices; + tree_root->fs_info = fs_info; + bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); if (IS_ERR(s)) goto error_s; @@ -652,9 +685,9 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, mutex_unlock(&root->d_inode->i_mutex); if (IS_ERR(new_root)) { + dput(root); deactivate_locked_super(s); error = PTR_ERR(new_root); - dput(root); goto error_free_subvol_name; } if (!new_root->d_inode) { @@ -675,6 +708,8 @@ error_s: error = PTR_ERR(s); error_close_devices: btrfs_close_devices(fs_devices); + kfree(fs_info); + kfree(tree_root); error_free_subvol_name: kfree(subvol_name); return ERR_PTR(error); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 1fffbc0..f50e931 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -902,6 +902,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *parent; struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; @@ -941,7 +942,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->block_rsv = &pending->block_rsv; dentry = pending->dentry; - parent_inode = dentry->d_parent->d_inode; + parent = dget_parent(dentry); + parent_inode = parent->d_inode; parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); @@ -989,6 +991,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, parent_inode->i_ino, index, dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + dput(parent); key.offset = (u64)-1; pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a29f193..054744a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2869,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root; + struct dentry *old_parent = NULL; /* * for regular files, if its inode is already on disk, we don't @@ -2910,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; inode = parent->d_inode; } + dput(old_parent); out: return ret; } @@ -2945,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; + struct dentry *old_parent = NULL; int ret = 0; u64 last_committed = root->fs_info->last_trans_committed; @@ -3016,10 +3021,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (IS_ROOT(parent)) break; - parent = parent->d_parent; + parent = dget_parent(parent); + dput(old_parent); + old_parent = parent; } ret = 0; end_trans: + dput(old_parent); if (ret < 0) { BUG_ON(ret != -ENOSPC); root->fs_info->last_trans_log_full_commit = trans->transid; @@ -3039,8 +3047,13 @@ end_no_trans: int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry) { - return btrfs_log_inode_parent(trans, root, dentry->d_inode, - dentry->d_parent, 0); + struct dentry *parent = dget_parent(dentry); + int ret; + + ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0); + dput(parent); + + return ret; } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cc04dc1..6b98845 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -412,12 +412,16 @@ static noinline int device_list_add(const char *path, device->fs_devices = fs_devices; fs_devices->num_devices++; - } else if (strcmp(device->name, path)) { + } else if (!device->name || strcmp(device->name, path)) { name = kstrdup(path, GFP_NOFS); if (!name) return -ENOMEM; kfree(device->name); device->name = name; + if (device->missing) { + fs_devices->missing_devices--; + device->missing = 0; + } } if (found_transid > fs_devices->latest_trans) { @@ -1236,6 +1240,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) device->fs_devices->num_devices--; + if (device->missing) + root->fs_info->fs_devices->missing_devices--; + next_device = list_entry(root->fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); if (device->bdev == root->fs_info->sb->s_bdev) @@ -3080,7 +3087,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root, device->devid = devid; device->work.func = pending_bios_fn; device->fs_devices = fs_devices; + device->missing = 1; fs_devices->num_devices++; + fs_devices->missing_devices++; spin_lock_init(&device->io_lock); INIT_LIST_HEAD(&device->dev_alloc_list); memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); @@ -3278,6 +3287,15 @@ static int read_one_dev(struct btrfs_root *root, device = add_missing_dev(root, devid, dev_uuid); if (!device) return -ENOMEM; + } else if (!device->missing) { + /* + * this happens when a device that was properly setup + * in the device info lists suddenly goes bad. + * device->bdev is NULL, and so we have to set + * device->missing to one here + */ + root->fs_info->fs_devices->missing_devices++; + device->missing = 1; } } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2b638b6..2740db4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -44,6 +44,7 @@ struct btrfs_device { int writeable; int in_fs_metadata; + int missing; spinlock_t io_lock; @@ -93,6 +94,7 @@ struct btrfs_fs_devices { u64 num_devices; u64 open_devices; u64 rw_devices; + u64 missing_devices; u64 total_rw_bytes; struct block_device *latest_bdev; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e9c874a..561438b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, page->index << PAGE_CACHE_SHIFT, &len, ci->i_truncate_seq, ci->i_truncate_size, - &page, 1); + &page, 1, 0); if (err == -ENOENT) err = 0; if (err < 0) { @@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, offset, &len, ci->i_truncate_seq, ci->i_truncate_size, - pages, nr_pages); + pages, nr_pages, 0); if (rc == -ENOENT) rc = 0; if (rc < 0) @@ -774,7 +774,7 @@ get_more_pages: snapc, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &inode->i_mtime, true, 1); + &inode->i_mtime, true, 1, 0); max_pages = req->r_num_pages; alloc_page_vec(fsc, req); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 98ab13e..60d27bc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode) invalidating_gen == ci->i_rdcache_gen) { /* success. */ dout("try_nonblocking_invalidate %p success\n", inode); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + /* save any racing async invalidate some trouble */ + ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } dout("try_nonblocking_invalidate %p failed\n", inode); @@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, { struct ceph_inode_info *ci = ceph_inode(inode); int mds = session->s_mds; - unsigned seq = le32_to_cpu(grant->seq); - unsigned issue_seq = le32_to_cpu(grant->issue_seq); + int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int issued, implemented, used, wanted, dirty; u64 size = le64_to_cpu(grant->size); @@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, int revoked_rdcache = 0; int queue_invalidate = 0; - dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", - inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); + dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", + inode, cap, mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, inode->i_size); @@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, } cap->seq = seq; - cap->issue_seq = issue_seq; /* file layout may have changed */ ci->i_layout = grant->layout; @@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, NULL /* no caps context */); try_flush_caps(inode, session, NULL); up_read(&mdsc->snap_rwsem); + + /* make sure we re-request max_size, if necessary */ + spin_lock(&inode->i_lock); + ci->i_requested_max_size = 0; + spin_unlock(&inode->i_lock); } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index e0a2dc6..d902948 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -40,7 +40,8 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (dentry->d_parent == NULL || /* nfs fh_to_dentry */ + ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) dentry->d_op = &ceph_dentry_ops; else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) dentry->d_op = &ceph_snapdir_dentry_ops; @@ -114,8 +115,8 @@ static int __dcache_readdir(struct file *filp, spin_lock(&dcache_lock); /* start at beginning? */ - if (filp->f_pos == 2 || (last && - filp->f_pos < ceph_dentry(last)->offset)) { + if (filp->f_pos == 2 || last == NULL || + filp->f_pos < ceph_dentry(last)->offset) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; @@ -336,7 +337,10 @@ more: if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; - fi->next_offset = 2; + if (ceph_frag_is_rightmost(frag)) + fi->next_offset = 2; + else + fi->next_offset = 0; } else { rinfo = &req->r_reply_info; err = note_last_dentry(fi, @@ -355,18 +359,22 @@ more: u64 pos = ceph_make_fpos(frag, off); struct ceph_mds_reply_inode *in = rinfo->dir_in[off - fi->offset].in; + struct ceph_vino vino; + ino_t ino; + dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", off, off - fi->offset, rinfo->dir_nr, pos, rinfo->dir_dname_len[off - fi->offset], rinfo->dir_dname[off - fi->offset], in); BUG_ON(!in); ftype = le32_to_cpu(in->mode) >> 12; + vino.ino = le64_to_cpu(in->ino); + vino.snap = le64_to_cpu(in->snapid); + ino = ceph_vino_to_ino(vino); if (filldir(dirent, rinfo->dir_dname[off - fi->offset], rinfo->dir_dname_len[off - fi->offset], - pos, - le64_to_cpu(in->ino), - ftype) < 0) { + pos, ino, ftype) < 0) { dout("filldir stopping us...\n"); return 0; } @@ -414,6 +422,7 @@ static void reset_readdir(struct ceph_file_info *fi) fi->last_readdir = NULL; } kfree(fi->last_name); + fi->last_name = NULL; fi->next_offset = 2; /* compensate for . and .. */ if (fi->dentry) { dput(fi->dentry); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e77c28c..7d0e4a8 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file) } /* - * No need to block if we have any caps. Update wanted set + * No need to block if we have caps on the auth MDS (for + * write) or any MDS (for read). Update wanted set * asynchronously. */ spin_lock(&inode->i_lock); - if (__ceph_is_any_real_caps(ci)) { + if (__ceph_is_any_real_caps(ci) && + (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) { int mds_wanted = __ceph_caps_mds_wanted(ci); int issued = __ceph_caps_issued(ci, NULL); @@ -280,11 +282,13 @@ int ceph_release(struct inode *inode, struct file *file) static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof) + int *checkeof, bool align_to_pages, + unsigned long buf_align) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len; + int io_align, page_align; int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ int left, pages_left; int read; @@ -300,14 +304,19 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; + io_align = off & ~PAGE_MASK; more: + if (align_to_pages) + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + else + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, ci->i_truncate_seq, ci->i_truncate_size, - page_pos, pages_left); + page_pos, pages_left, page_align); hit_stripe = this_len < left; was_short = ret >= 0 && ret < this_len; if (ret == -ENOENT) @@ -368,32 +377,34 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, struct inode *inode = file->f_dentry->d_inode; struct page **pages; u64 off = *poff; - int num_pages = calc_pages_for(off, len); - int ret; + int num_pages, ret; dout("sync_read on file %p %llu~%u %s\n", file, off, len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, off, len); - - /* - * flush any page cache pages in this range. this - * will make concurrent normal and O_DIRECT io slow, - * but it will at least behave sensibly when they are - * in sequence. - */ + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, num_pages, true); } else { + num_pages = calc_pages_for(off, len); pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); } if (IS_ERR(pages)) return PTR_ERR(pages); + /* + * flush any page cache pages in this range. this + * will make concurrent normal and sync io slow, + * but it will at least behave sensibly when they are + * in sequence. + */ ret = filemap_write_and_wait(inode->i_mapping); if (ret < 0) goto done; - ret = striped_read(inode, off, len, pages, num_pages, checkeof); + ret = striped_read(inode, off, len, pages, num_pages, checkeof, + file->f_flags & O_DIRECT, + (unsigned long)data & ~PAGE_MASK); if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) ret = ceph_copy_page_vector_to_user(pages, data, off, ret); @@ -402,7 +413,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, done: if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, true); else ceph_release_page_vector(pages, num_pages); dout("sync_read result %d\n", ret); @@ -448,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, int flags; int do_sync = 0; int check_caps = 0; + int page_align, io_align; + unsigned long buf_align; int ret; struct timespec mtime = CURRENT_TIME; @@ -462,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, else pos = *offset; + io_align = pos & ~PAGE_MASK; + buf_align = (unsigned long)data & ~PAGE_MASK; + ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); if (ret < 0) return ret; @@ -486,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, */ more: len = left; + if (file->f_flags & O_DIRECT) { + /* write from beginning of first page, regardless of + io alignment */ + page_align = (pos - io_align + buf_align) & ~PAGE_MASK; + num_pages = calc_pages_for((unsigned long)data, len); + } else { + page_align = pos & ~PAGE_MASK; + num_pages = calc_pages_for(pos, len); + } req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ceph_vino(inode), pos, &len, CEPH_OSD_OP_WRITE, flags, ci->i_snap_realm->cached_context, do_sync, ci->i_truncate_seq, ci->i_truncate_size, - &mtime, false, 2); + &mtime, false, 2, page_align); if (!req) return -ENOMEM; - num_pages = calc_pages_for(pos, len); - if (file->f_flags & O_DIRECT) { - pages = ceph_get_direct_page_vector(data, num_pages, pos, len); + pages = ceph_get_direct_page_vector(data, num_pages, false); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -549,7 +572,7 @@ more: } if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages); + ceph_put_page_vector(pages, num_pages, false); else if (file->f_flags & O_SYNC) ceph_release_page_vector(pages, num_pages); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1d6a45b..bf12865 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2,7 +2,6 @@ #include <linux/module.h> #include <linux/fs.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -471,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, if (issued & (CEPH_CAP_FILE_EXCL| CEPH_CAP_FILE_WR| - CEPH_CAP_FILE_BUFFER)) { + CEPH_CAP_FILE_BUFFER| + CEPH_CAP_AUTH_EXCL| + CEPH_CAP_XATTR_EXCL)) { if (timespec_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, @@ -511,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, warn = 1; } } else { - /* we have no write caps; whatever the MDS says is true */ + /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode->i_ctime = *ctime; inode->i_mtime = *mtime; @@ -567,12 +568,17 @@ static int fill_inode(struct inode *inode, /* * provided version will be odd if inode value is projected, - * even if stable. skip the update if we have a newer info - * (e.g., due to inode info racing form multiple MDSs), or if - * we are getting projected (unstable) inode info. + * even if stable. skip the update if we have newer stable + * info (ours>=theirs, e.g. due to racing mds replies), unless + * we are getting projected (unstable) info (in which case the + * version is odd, and we want ours>theirs). + * us them + * 2 2 skip + * 3 2 skip + * 3 3 update */ if (le64_to_cpu(info->version) > 0 && - (ci->i_version & ~1) > le64_to_cpu(info->version)) + (ci->i_version & ~1) >= le64_to_cpu(info->version)) goto no_change; issued = __ceph_caps_issued(ci, &implemented); @@ -606,7 +612,14 @@ static int fill_inode(struct inode *inode, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); - ci->i_max_size = le64_to_cpu(info->max_size); + /* only update max_size on auth cap */ + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && + ci->i_max_size != le64_to_cpu(info->max_size)) { + dout("max_size %lld -> %llu\n", ci->i_max_size, + le64_to_cpu(info->max_size)); + ci->i_max_size = le64_to_cpu(info->max_size); + } + ci->i_layout = info->layout; inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -1055,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ininfo = rinfo->targeti.in; vino.ino = le64_to_cpu(ininfo->ino); vino.snap = le64_to_cpu(ininfo->snapid); - if (!dn->d_inode) { + in = dn->d_inode; + if (!in) { in = ceph_get_inode(sb, vino); if (IS_ERR(in)) { pr_err("fill_trace bad get_inode " @@ -1386,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work) spin_lock(&inode->i_lock); dout("invalidate_pages %p gen %d revoking %d\n", inode, ci->i_rdcache_gen, ci->i_rdcache_revoking); - if (ci->i_rdcache_gen == 0 || - ci->i_rdcache_revoking != ci->i_rdcache_gen) { - BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen); + if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { /* nevermind! */ - ci->i_rdcache_revoking = 0; spin_unlock(&inode->i_lock); goto out; } @@ -1400,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work) ceph_invalidate_nondirty_pages(inode->i_mapping); spin_lock(&inode->i_lock); - if (orig_gen == ci->i_rdcache_gen) { + if (orig_gen == ci->i_rdcache_gen && + orig_gen == ci->i_rdcache_revoking) { dout("invalidate_pages %p gen %d successful\n", inode, ci->i_rdcache_gen); - ci->i_rdcache_gen = 0; - ci->i_rdcache_revoking = 0; + ci->i_rdcache_revoking--; check = 1; } else { - dout("invalidate_pages %p gen %d raced, gen now %d\n", - inode, orig_gen, ci->i_rdcache_gen); + dout("invalidate_pages %p gen %d raced, now %d revoking %d\n", + inode, orig_gen, ci->i_rdcache_gen, + ci->i_rdcache_revoking); } spin_unlock(&inode->i_lock); @@ -1739,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask) return 0; } - dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); + dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h index a6ce54e..52e8fd7 100644 --- a/fs/ceph/ioctl.h +++ b/fs/ceph/ioctl.h @@ -4,7 +4,7 @@ #include <linux/ioctl.h> #include <linux/types.h> -#define CEPH_IOCTL_MAGIC 0x98 +#define CEPH_IOCTL_MAGIC 0x97 /* just use u64 to align sanely on all archs */ struct ceph_ioctl_layout { diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 40abde9..476b329 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -11,40 +11,68 @@ * Implement fcntl and flock locking functions. */ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, - u64 pid, u64 pid_ns, - int cmd, u64 start, u64 length, u8 wait) + int cmd, u8 wait, struct file_lock *fl) { struct inode *inode = file->f_dentry->d_inode; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; + u64 length = 0; req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); req->r_inode = igrab(inode); + /* mds requires start and length rather than start and end */ + if (LLONG_MAX == fl->fl_end) + length = 0; + else + length = fl->fl_end - fl->fl_start + 1; + dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " "length: %llu, wait: %d, type`: %d", (int)lock_type, - (int)operation, pid, start, length, wait, cmd); + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type); + req->r_args.filelock_change.rule = lock_type; req->r_args.filelock_change.type = cmd; - req->r_args.filelock_change.pid = cpu_to_le64(pid); + req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid); /* This should be adjusted, but I'm not sure if namespaces actually get id numbers*/ req->r_args.filelock_change.pid_namespace = - cpu_to_le64((u64)pid_ns); - req->r_args.filelock_change.start = cpu_to_le64(start); + cpu_to_le64((u64)(unsigned long)fl->fl_nspid); + req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start); req->r_args.filelock_change.length = cpu_to_le64(length); req->r_args.filelock_change.wait = wait; err = ceph_mdsc_do_request(mdsc, inode, req); + + if ( operation == CEPH_MDS_OP_GETFILELOCK){ + fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) + fl->fl_type = F_WRLCK; + else + fl->fl_type = F_UNLCK; + + fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start); + length = le64_to_cpu(req->r_reply_info.filelock_reply->start) + + le64_to_cpu(req->r_reply_info.filelock_reply->length); + if (length >= 1) + fl->fl_end = length -1; + else + fl->fl_end = 0; + + } ceph_mdsc_put_request(req); dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " - "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type, - (int)operation, pid, start, length, wait, cmd, err); + "length: %llu, wait: %d, type`: %d, err code %d", (int)lock_type, + (int)operation, (u64)fl->fl_pid, fl->fl_start, + length, wait, fl->fl_type, err); return err; } @@ -54,7 +82,6 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u64 length; u8 lock_cmd; int err; u8 wait = 0; @@ -76,29 +103,20 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - if (LLONG_MAX == fl->fl_end) - length = 0; - else - length = fl->fl_end - fl->fl_start + 1; - - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - lock_cmd, fl->fl_start, - length, wait); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); if (!err) { - dout("mds locked, locking locally"); - err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { - /* undo! This should only happen if the kernel detects - * local deadlock. */ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - CEPH_LOCK_UNLOCK, fl->fl_start, - length, 0); - dout("got %d on posix_lock_file, undid lock", err); + if ( op != CEPH_MDS_OP_GETFILELOCK ){ + dout("mds locked, locking locally"); + err = posix_lock_file(file, fl, NULL); + if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + /* undo! This should only happen if the kernel detects + * local deadlock. */ + ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + CEPH_LOCK_UNLOCK, 0, fl); + dout("got %d on posix_lock_file, undid lock", err); + } } + } else { dout("mds returned error code %d", err); } @@ -107,7 +125,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u64 length; u8 lock_cmd; int err; u8 wait = 1; @@ -127,26 +144,15 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_EXCL; else lock_cmd = CEPH_LOCK_UNLOCK; - /* mds requires start and length rather than start and end */ - if (LLONG_MAX == fl->fl_end) - length = 0; - else - length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - lock_cmd, fl->fl_start, - length, wait); + file, lock_cmd, wait, fl); if (!err) { err = flock_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, - (u64)(unsigned long)fl->fl_nspid, - CEPH_LOCK_UNLOCK, fl->fl_start, - length, 0); + file, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on flock_lock_file_wait, undid lock", err); } } else { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3142b15..38800ea 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -6,7 +6,6 @@ #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include "super.h" #include "mds_client.h" @@ -203,6 +202,38 @@ out_bad: } /* + * parse fcntl F_GETLK results + */ +static int parse_reply_info_filelock(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + if (*p + sizeof(*info->filelock_reply) > end) + goto bad; + + info->filelock_reply = *p; + *p += sizeof(*info->filelock_reply); + + if (unlikely(*p != end)) + goto bad; + return 0; + +bad: + return -EIO; +} + +/* + * parse extra results + */ +static int parse_reply_info_extra(void **p, void *end, + struct ceph_mds_reply_info_parsed *info) +{ + if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + return parse_reply_info_filelock(p, end, info); + else + return parse_reply_info_dir(p, end, info); +} + +/* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, @@ -224,10 +255,10 @@ static int parse_reply_info(struct ceph_msg *msg, goto out_bad; } - /* dir content */ + /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_dir(&p, p+len, info); + err = parse_reply_info_extra(&p, p+len, info); if (err < 0) goto out_bad; } @@ -529,6 +560,9 @@ static void __register_request(struct ceph_mds_client *mdsc, ceph_mdsc_get_request(req); __insert_request(mdsc, req); + req->r_uid = current_fsuid(); + req->r_gid = current_fsgid(); + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1588,8 +1622,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); head->op = cpu_to_le32(req->r_op); - head->caller_uid = cpu_to_le32(current_fsuid()); - head->caller_gid = cpu_to_le32(current_fsgid()); + head->caller_uid = cpu_to_le32(req->r_uid); + head->caller_gid = cpu_to_le32(req->r_gid); head->args = req->r_args; ceph_encode_filepath(&p, end, ino1, path1); @@ -2072,7 +2106,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&session->s_mutex); if (err < 0) { - pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds); + pr_err("mdsc_handle_reply got corrupt reply mds%d(tid:%lld)\n", mds, tid); ceph_msg_dump(msg); goto out_err; } @@ -2092,7 +2126,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) mutex_lock(&req->r_fill_mutex); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); if (err == 0) { - if (result == 0 && rinfo->dir_nr) + if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && + rinfo->dir_nr) ceph_readdir_prepopulate(req, req->r_session); ceph_unreserve_caps(mdsc, &req->r_caps_reservation); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index d66d63c..aabe563 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -42,26 +42,37 @@ struct ceph_mds_reply_info_in { }; /* - * parsed info about an mds reply, including information about the - * target inode and/or its parent directory and dentry, and directory - * contents (for readdir results). + * parsed info about an mds reply, including information about + * either: 1) the target inode and/or its parent directory and dentry, + * and directory contents (for readdir results), or + * 2) the file range lock info (for fcntl F_GETLK results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; + /* trace */ struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; u32 dname_len; struct ceph_mds_reply_lease *dlease; - struct ceph_mds_reply_dirfrag *dir_dir; - int dir_nr; - char **dir_dname; - u32 *dir_dname_len; - struct ceph_mds_reply_lease **dir_dlease; - struct ceph_mds_reply_info_in *dir_in; - u8 dir_complete, dir_end; + /* extra */ + union { + /* for fcntl F_GETLK results */ + struct ceph_filelock *filelock_reply; + + /* for readdir results */ + struct { + struct ceph_mds_reply_dirfrag *dir_dir; + int dir_nr; + char **dir_dname; + u32 *dir_dname_len; + struct ceph_mds_reply_lease **dir_dlease; + struct ceph_mds_reply_info_in *dir_in; + u8 dir_complete, dir_end; + }; + }; /* encoded blob describing snapshot contexts for certain operations (e.g., open) */ @@ -170,6 +181,8 @@ struct ceph_mds_request { union ceph_mds_request_args r_args; int r_fmode; /* file mode, if expecting cap */ + uid_t r_uid; + gid_t r_gid; /* for choosing which mds to send this request to */ int r_direct_mode; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 1886294..7f01728 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -293,9 +293,7 @@ struct ceph_inode_info { int i_rd_ref, i_rdcache_ref, i_wr_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ - u32 i_rdcache_gen; /* we increment this each time we get - FILE_CACHE. If it's non-zero, we - _may_ have cached pages. */ + u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ struct list_head i_unsafe_writes; /* uncommitted sync writes */ diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 0ed2139..ee45648 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -4,6 +4,7 @@ config CIFS select NLS select CRYPTO select CRYPTO_MD5 + select CRYPTO_HMAC select CRYPTO_ARC4 help This is the client VFS module for the Common Internet File System @@ -143,6 +144,13 @@ config CIFS_FSCACHE to be cached locally on disk through the general filesystem cache manager. If unsure, say N. +config CIFS_ACL + bool "Provide CIFS ACL support (EXPERIMENTAL)" + depends on EXPERIMENTAL && CIFS_XATTR + help + Allows to fetch CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index adefa60..43b19dd 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -6,7 +6,9 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ - readdir.o ioctl.o sess.o export.o cifsacl.o + readdir.o ioctl.o sess.o export.o + +cifs-$(CONFIG_CIFS_ACL) += cifsacl.o cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o diff --git a/fs/cifs/README b/fs/cifs/README index ee68d10..46af99a 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -337,6 +337,15 @@ A partial list of the supported mount options follows: wsize default write size (default 57344) maximum wsize currently allowed by CIFS is 57344 (fourteen 4096 byte pages) + actimeo=n attribute cache timeout in seconds (default 1 second). + After this timeout, the cifs client requests fresh attribute + information from the server. This option allows to tune the + attribute cache timeout to suit the workload needs. Shorter + timeouts mean better the cache coherency, but increased number + of calls to the server. Longer timeouts mean reduced number + of calls to the server at the expense of less stricter cache + coherency checks (i.e. incorrect attribute cache for a short + period of time). rw mount the network share read-write (note that the server may still consider the share read-only) ro mount network share read-only diff --git a/fs/cifs/TODO b/fs/cifs/TODO index 5aff46c..355abcd 100644 --- a/fs/cifs/TODO +++ b/fs/cifs/TODO @@ -81,7 +81,7 @@ u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for v) mount check for unmatched uids -w) Add support for new vfs entry points for setlease and fallocate +w) Add support for new vfs entry point for fallocate x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of processes can proceed better in parallel (on the server) diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 525ba59..7852cd6 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -15,7 +15,7 @@ * the GNU Lesser General Public License for more details. * */ -#include <linux/radix-tree.h> +#include <linux/rbtree.h> #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H @@ -42,12 +42,13 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ struct cifs_sb_info { - struct radix_tree_root tlink_tree; -#define CIFS_TLINK_MASTER_TAG 0 /* is "master" (mount) tcon */ + struct rb_root tlink_tree; spinlock_t tlink_tree_lock; + struct tcon_link *master_tlink; struct nls_table *local_nls; unsigned int rsize; unsigned int wsize; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ atomic_t active; uid_t mnt_uid; gid_t mnt_gid; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index c9b4792..a437ec3 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -30,8 +30,6 @@ #include "cifs_debug.h" -#ifdef CONFIG_CIFS_EXPERIMENTAL - static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, @@ -560,7 +558,7 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return NULL; + return ERR_CAST(tlink); xid = GetXid(); rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen); @@ -568,7 +566,9 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, cifs_put_tlink(tlink); - cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); return pntsd; } @@ -583,7 +583,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return NULL; + return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = GetXid(); @@ -591,23 +591,22 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL, 0, &fid, &oplock, NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc) { - cERROR(1, "Unable to open file to get ACL"); - goto out; + if (!rc) { + rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); + CIFSSMBClose(xid, tcon, fid); } - rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen); - cFYI(1, "GetCIFSACL rc = %d ACL len %d", rc, *pacllen); - - CIFSSMBClose(xid, tcon, fid); - out: cifs_put_tlink(tlink); FreeXid(xid); + + cFYI(1, "%s: rc = %d ACL len %d", __func__, rc, *pacllen); + if (rc) + return ERR_PTR(rc); return pntsd; } /* Retrieve an ACL from the server */ -static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, +struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, struct inode *inode, const char *path, u32 *pacllen) { @@ -695,7 +694,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, } /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */ -void +int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid) { @@ -711,17 +710,21 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen); /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ - if (pntsd) + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { rc = parse_sec_desc(pntsd, acllen, fattr); - if (rc) - cFYI(1, "parse sec desc failed rc = %d", rc); + kfree(pntsd); + if (rc) + cERROR(1, "parse sec desc failed rc = %d", rc); + } - kfree(pntsd); - return; + return rc; } /* Convert mode bits to an ACL so we can update the ACL on the server */ -int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) +int mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode) { int rc = 0; __u32 secdesclen = 0; @@ -736,7 +739,10 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) /* Add three ACEs for owner, group, everyone getting rid of other ACEs as chmod disables ACEs and set the security descriptor */ - if (pntsd) { + if (IS_ERR(pntsd)) { + rc = PTR_ERR(pntsd); + cERROR(1, "%s: error %d getting sec desc", __func__, rc); + } else { /* allocate memory for the smb header, set security descriptor request security descriptor parameters, and secuirty descriptor itself */ @@ -766,4 +772,3 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) return rc; } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 6c8096c..c4ae7d0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -74,11 +74,7 @@ struct cifs_wksid { char sidname[SIDNAMELENGTH]; } __attribute__((packed)); -#ifdef CONFIG_CIFS_EXPERIMENTAL - extern int match_sid(struct cifs_sid *); extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); -#endif /* CONFIG_CIFS_EXPERIMENTAL */ - #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 75c4eaa..3936aa7 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -116,7 +116,7 @@ cifs_read_super(struct super_block *sb, void *data, return -ENOMEM; spin_lock_init(&cifs_sb->tlink_tree_lock); - INIT_RADIX_TREE(&cifs_sb->tlink_tree, GFP_KERNEL); + cifs_sb->tlink_tree = RB_ROOT; rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); if (rc) { @@ -321,8 +321,7 @@ cifs_alloc_inode(struct super_block *sb) /* Until the file is open and we have gotten oplock info back from the server, can not assume caching of file data or metadata */ - cifs_inode->clientCanCacheRead = false; - cifs_inode->clientCanCacheAll = false; + cifs_set_oplock_level(cifs_inode, 0); cifs_inode->delete_pending = false; cifs_inode->invalid_mapping = false; cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ @@ -459,9 +458,13 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_printf(s, ",mfsymlinks"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) + seq_printf(s, ",fsc"); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); + /* convert actimeo and display it in seconds */ + seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); return 0; } @@ -934,7 +937,6 @@ init_cifs(void) GlobalCurrentXid = 0; GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; - memset(Local_System_Name, 0, 15); spin_lock_init(&cifs_tcp_ses_lock); spin_lock_init(&cifs_file_list_lock); spin_lock_init(&GlobalMid_Lock); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f259e4d..7136c0c 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -45,6 +45,16 @@ #define CIFS_MIN_RCV_POOL 4 /* + * default attribute cache timeout (jiffies) + */ +#define CIFS_DEF_ACTIMEO (1 * HZ) + +/* + * max attribute cache timeout (jiffies) - 2^30 + */ +#define CIFS_MAX_ACTIMEO (1 << 30) + +/* * MAX_REQ is the maximum number of requests that WE will send * on one socket concurrently. It also matches the most common * value of max multiplex returned by servers. We may @@ -336,7 +346,8 @@ struct cifsTconInfo { * "get" on the container. */ struct tcon_link { - unsigned long tl_index; + struct rb_node tl_rbnode; + uid_t tl_uid; unsigned long tl_flags; #define TCON_LINK_MASTER 0 #define TCON_LINK_PENDING 1 @@ -745,8 +756,6 @@ GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ /* on midQ entries */ -GLOBAL_EXTERN char Local_System_Name[15]; - /* * Global counters, updated atomically */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index edb6d90..e6d1481 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -54,7 +54,8 @@ do { \ __func__, curr_xid, (int)rc); \ } while (0) extern char *build_path_from_dentry(struct dentry *); -extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); +extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, @@ -79,9 +80,7 @@ extern bool is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); -#ifdef CONFIG_CIFS_EXPERIMENTAL extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); -#endif extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, @@ -104,6 +103,7 @@ extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset); +extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock); extern struct cifsFileInfo *cifs_new_fileinfo(__u16 fileHandle, struct file *file, struct tcon_link *tlink, @@ -129,10 +129,12 @@ extern int cifs_get_file_info_unix(struct file *filp); extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, int xid); -extern void cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, +extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, const char *path, const __u16 *pfid); -extern int mode_to_acl(struct inode *inode, const char *path, __u64); +extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); +extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, + const char *, u32 *); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, const char *); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 2f2632b..67acfb3 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2478,95 +2478,6 @@ querySymLinkRetry: } #ifdef CONFIG_CIFS_EXPERIMENTAL -/* Initialize NT TRANSACT SMB into small smb request buffer. - This assumes that all NT TRANSACTS that we init here have - total parm and data under about 400 bytes (to fit in small cifs - buffer size), which is the case so far, it easily fits. NB: - Setup words themselves and ByteCount - MaxSetupCount (size of returned setup area) and - MaxParameterCount (returned parms size) must be set by caller */ -static int -smb_init_nttransact(const __u16 sub_command, const int setup_count, - const int parm_len, struct cifsTconInfo *tcon, - void **ret_buf) -{ - int rc; - __u32 temp_offset; - struct smb_com_ntransact_req *pSMB; - - rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, - (void **)&pSMB); - if (rc) - return rc; - *ret_buf = (void *)pSMB; - pSMB->Reserved = 0; - pSMB->TotalParameterCount = cpu_to_le32(parm_len); - pSMB->TotalDataCount = 0; - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->DataCount = pSMB->TotalDataCount; - temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + - (setup_count * 2) - 4 /* for rfc1001 length itself */; - pSMB->ParameterOffset = cpu_to_le32(temp_offset); - pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); - pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ - pSMB->SubCommand = cpu_to_le16(sub_command); - return 0; -} - -static int -validate_ntransact(char *buf, char **ppparm, char **ppdata, - __u32 *pparmlen, __u32 *pdatalen) -{ - char *end_of_smb; - __u32 data_count, data_offset, parm_count, parm_offset; - struct smb_com_ntransact_rsp *pSMBr; - - *pdatalen = 0; - *pparmlen = 0; - - if (buf == NULL) - return -EINVAL; - - pSMBr = (struct smb_com_ntransact_rsp *)buf; - - /* ByteCount was converted from little endian in SendReceive */ - end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + - (char *)&pSMBr->ByteCount; - - data_offset = le32_to_cpu(pSMBr->DataOffset); - data_count = le32_to_cpu(pSMBr->DataCount); - parm_offset = le32_to_cpu(pSMBr->ParameterOffset); - parm_count = le32_to_cpu(pSMBr->ParameterCount); - - *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; - *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; - - /* should we also check that parm and data areas do not overlap? */ - if (*ppparm > end_of_smb) { - cFYI(1, "parms start after end of smb"); - return -EINVAL; - } else if (parm_count + *ppparm > end_of_smb) { - cFYI(1, "parm end after end of smb"); - return -EINVAL; - } else if (*ppdata > end_of_smb) { - cFYI(1, "data starts after end of smb"); - return -EINVAL; - } else if (data_count + *ppdata > end_of_smb) { - cFYI(1, "data %p + count %d (%p) past smb end %p start %p", - *ppdata, data_count, (data_count + *ppdata), - end_of_smb, pSMBr); - return -EINVAL; - } else if (parm_count + data_count > pSMBr->ByteCount) { - cFYI(1, "parm count and data count larger than SMB"); - return -EINVAL; - } - *pdatalen = data_count; - *pparmlen = parm_count; - return 0; -} - int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, @@ -3056,7 +2967,97 @@ GetExtAttrOut: #endif /* CONFIG_POSIX */ -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL +/* + * Initialize NT TRANSACT SMB into small smb request buffer. This assumes that + * all NT TRANSACTS that we init here have total parm and data under about 400 + * bytes (to fit in small cifs buffer size), which is the case so far, it + * easily fits. NB: Setup words themselves and ByteCount MaxSetupCount (size of + * returned setup area) and MaxParameterCount (returned parms size) must be set + * by caller + */ +static int +smb_init_nttransact(const __u16 sub_command, const int setup_count, + const int parm_len, struct cifsTconInfo *tcon, + void **ret_buf) +{ + int rc; + __u32 temp_offset; + struct smb_com_ntransact_req *pSMB; + + rc = small_smb_init(SMB_COM_NT_TRANSACT, 19 + setup_count, tcon, + (void **)&pSMB); + if (rc) + return rc; + *ret_buf = (void *)pSMB; + pSMB->Reserved = 0; + pSMB->TotalParameterCount = cpu_to_le32(parm_len); + pSMB->TotalDataCount = 0; + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + pSMB->ParameterCount = pSMB->TotalParameterCount; + pSMB->DataCount = pSMB->TotalDataCount; + temp_offset = offsetof(struct smb_com_ntransact_req, Parms) + + (setup_count * 2) - 4 /* for rfc1001 length itself */; + pSMB->ParameterOffset = cpu_to_le32(temp_offset); + pSMB->DataOffset = cpu_to_le32(temp_offset + parm_len); + pSMB->SetupCount = setup_count; /* no need to le convert byte fields */ + pSMB->SubCommand = cpu_to_le16(sub_command); + return 0; +} + +static int +validate_ntransact(char *buf, char **ppparm, char **ppdata, + __u32 *pparmlen, __u32 *pdatalen) +{ + char *end_of_smb; + __u32 data_count, data_offset, parm_count, parm_offset; + struct smb_com_ntransact_rsp *pSMBr; + + *pdatalen = 0; + *pparmlen = 0; + + if (buf == NULL) + return -EINVAL; + + pSMBr = (struct smb_com_ntransact_rsp *)buf; + + /* ByteCount was converted from little endian in SendReceive */ + end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + + (char *)&pSMBr->ByteCount; + + data_offset = le32_to_cpu(pSMBr->DataOffset); + data_count = le32_to_cpu(pSMBr->DataCount); + parm_offset = le32_to_cpu(pSMBr->ParameterOffset); + parm_count = le32_to_cpu(pSMBr->ParameterCount); + + *ppparm = (char *)&pSMBr->hdr.Protocol + parm_offset; + *ppdata = (char *)&pSMBr->hdr.Protocol + data_offset; + + /* should we also check that parm and data areas do not overlap? */ + if (*ppparm > end_of_smb) { + cFYI(1, "parms start after end of smb"); + return -EINVAL; + } else if (parm_count + *ppparm > end_of_smb) { + cFYI(1, "parm end after end of smb"); + return -EINVAL; + } else if (*ppdata > end_of_smb) { + cFYI(1, "data starts after end of smb"); + return -EINVAL; + } else if (data_count + *ppdata > end_of_smb) { + cFYI(1, "data %p + count %d (%p) past smb end %p start %p", + *ppdata, data_count, (data_count + *ppdata), + end_of_smb, pSMBr); + return -EINVAL; + } else if (parm_count + data_count > pSMBr->ByteCount) { + cFYI(1, "parm count and data count larger than SMB"); + return -EINVAL; + } + *pdatalen = data_count; + *pparmlen = parm_count; + return 0; +} + /* Get Security Descriptor (by handle) from remote server for a file or dir */ int CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, @@ -3214,7 +3215,7 @@ setCifsAclRetry: return (rc); } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ +#endif /* CONFIG_CIFS_ACL */ /* Legacy Query Path Information call for lookup to old servers such as Win9x/WinME */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9eb327d..cc1a860 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -105,6 +105,7 @@ struct smb_vol { unsigned int wsize; bool sockopt_tcp_nodelay:1; unsigned short int port; + unsigned long actimeo; /* attribute cache timeout (jiffies) */ char *prepath; struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; @@ -116,6 +117,7 @@ struct smb_vol { static int ipv4_connect(struct TCP_Server_Info *server); static int ipv6_connect(struct TCP_Server_Info *server); +static void tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink); static void cifs_prune_tlinks(struct work_struct *work); /* @@ -805,23 +807,20 @@ cifs_parse_mount_options(char *options, const char *devname, short int override_gid = -1; bool uid_specified = false; bool gid_specified = false; + char *nodename = utsname()->nodename; separator[0] = ','; separator[1] = 0; - if (Local_System_Name[0] != 0) - memcpy(vol->source_rfc1001_name, Local_System_Name, 15); - else { - char *nodename = utsname()->nodename; - int n = strnlen(nodename, 15); - memset(vol->source_rfc1001_name, 0x20, 15); - for (i = 0; i < n; i++) { - /* does not have to be perfect mapping since field is - informational, only used for servers that do not support - port 445 and it can be overridden at mount time */ - vol->source_rfc1001_name[i] = toupper(nodename[i]); - } - } + /* + * does not have to be perfect mapping since field is + * informational, only used for servers that do not support + * port 445 and it can be overridden at mount time + */ + memset(vol->source_rfc1001_name, 0x20, 15); + for (i = 0; i < strnlen(nodename, 15); i++) + vol->source_rfc1001_name[i] = toupper(nodename[i]); + vol->source_rfc1001_name[15] = 0; /* null target name indicates to use *SMBSERVR default called name if we end up sending RFC1001 session initialize */ @@ -839,6 +838,8 @@ cifs_parse_mount_options(char *options, const char *devname, /* default to using server inode numbers where available */ vol->server_ino = 1; + vol->actimeo = CIFS_DEF_ACTIMEO; + if (!options) return 1; @@ -1213,6 +1214,16 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: server net" "biosname longer than 15 truncated.\n"); } + } else if (strnicmp(data, "actimeo", 7) == 0) { + if (value && *value) { + vol->actimeo = HZ * simple_strtoul(value, + &value, 0); + if (vol->actimeo > CIFS_MAX_ACTIMEO) { + cERROR(1, "CIFS: attribute cache" + "timeout too large"); + return 1; + } + } } else if (strnicmp(data, "credentials", 4) == 0) { /* ignore */ } else if (strnicmp(data, "version", 3) == 0) { @@ -1351,6 +1362,11 @@ cifs_parse_mount_options(char *options, const char *devname, "supported. Instead set " "/proc/fs/cifs/LookupCacheEnabled to 0\n"); } else if (strnicmp(data, "fsc", 3) == 0) { +#ifndef CONFIG_CIFS_FSCACHE + cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" + "kernel config option set"); + return 1; +#endif vol->fsc = true; } else if (strnicmp(data, "mfsymlinks", 10) == 0) { vol->mfsymlinks = true; @@ -2565,6 +2581,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, cFYI(1, "file mode: 0x%x dir mode: 0x%x", cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode); + cifs_sb->actimeo = pvolume_info->actimeo; + if (pvolume_info->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; if (pvolume_info->setuids) @@ -2815,13 +2833,13 @@ remote_path_check: /* check if a whole path (including prepath) is not remote */ if (!rc && cifs_sb->prepathlen && tcon) { /* build_path_to_root works only when we have a valid tcon */ - full_path = cifs_build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; } rc = is_path_accessible(xid, tcon, cifs_sb, full_path); - if (rc != -EREMOTE) { + if (rc != 0 && rc != -EREMOTE) { kfree(full_path); goto mount_fail_check; } @@ -2900,24 +2918,16 @@ remote_path_check: goto mount_fail_check; } - tlink->tl_index = pSesInfo->linux_uid; + tlink->tl_uid = pSesInfo->linux_uid; tlink->tl_tcon = tcon; tlink->tl_time = jiffies; set_bit(TCON_LINK_MASTER, &tlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); - rc = radix_tree_preload(GFP_KERNEL); - if (rc == -ENOMEM) { - kfree(tlink); - goto mount_fail_check; - } - + cifs_sb->master_tlink = tlink; spin_lock(&cifs_sb->tlink_tree_lock); - radix_tree_insert(&cifs_sb->tlink_tree, pSesInfo->linux_uid, tlink); - radix_tree_tag_set(&cifs_sb->tlink_tree, pSesInfo->linux_uid, - CIFS_TLINK_MASTER_TAG); + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); @@ -3107,32 +3117,25 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, int cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) { - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node; + struct tcon_link *tlink; char *tmp; - struct tcon_link *tlink[8]; - unsigned long index = 0; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&cifs_sb->tlink_tree_lock); + while ((node = rb_first(root))) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(node, root); - for (i = 0; i < ret; i++) - cifs_put_tlink(tlink[i]); - } while (ret != 0); + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); tmp = cifs_sb->prepath; cifs_sb->prepathlen = 0; @@ -3271,22 +3274,10 @@ out: return tcon; } -static struct tcon_link * +static inline struct tcon_link * cifs_sb_master_tlink(struct cifs_sb_info *cifs_sb) { - struct tcon_link *tlink; - unsigned int ret; - - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup_tag(&cifs_sb->tlink_tree, (void **)&tlink, - 0, 1, CIFS_TLINK_MASTER_TAG); - spin_unlock(&cifs_sb->tlink_tree_lock); - - /* the master tcon should always be present */ - if (ret == 0) - BUG(); - - return tlink; + return cifs_sb->master_tlink; } struct cifsTconInfo * @@ -3302,6 +3293,47 @@ cifs_sb_tcon_pending_wait(void *unused) return signal_pending(current) ? -ERESTARTSYS : 0; } +/* find and return a tlink with given uid */ +static struct tcon_link * +tlink_rb_search(struct rb_root *root, uid_t uid) +{ + struct rb_node *node = root->rb_node; + struct tcon_link *tlink; + + while (node) { + tlink = rb_entry(node, struct tcon_link, tl_rbnode); + + if (tlink->tl_uid > uid) + node = node->rb_left; + else if (tlink->tl_uid < uid) + node = node->rb_right; + else + return tlink; + } + return NULL; +} + +/* insert a tcon_link into the tree */ +static void +tlink_rb_insert(struct rb_root *root, struct tcon_link *new_tlink) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct tcon_link *tlink; + + while (*new) { + tlink = rb_entry(*new, struct tcon_link, tl_rbnode); + parent = *new; + + if (tlink->tl_uid > new_tlink->tl_uid) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&new_tlink->tl_rbnode, parent, new); + rb_insert_color(&new_tlink->tl_rbnode, root); +} + /* * Find or construct an appropriate tcon given a cifs_sb and the fsuid of the * current task. @@ -3309,7 +3341,7 @@ cifs_sb_tcon_pending_wait(void *unused) * If the superblock doesn't refer to a multiuser mount, then just return * the master tcon for the mount. * - * First, search the radix tree for an existing tcon for this fsuid. If one + * First, search the rbtree for an existing tcon for this fsuid. If one * exists, then check to see if it's pending construction. If it is then wait * for construction to complete. Once it's no longer pending, check to see if * it failed and either return an error or retry construction, depending on @@ -3322,14 +3354,14 @@ struct tcon_link * cifs_sb_tlink(struct cifs_sb_info *cifs_sb) { int ret; - unsigned long fsuid = (unsigned long) current_fsuid(); + uid_t fsuid = current_fsuid(); struct tcon_link *tlink, *newtlink; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) return cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); spin_lock(&cifs_sb->tlink_tree_lock); - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); @@ -3338,36 +3370,24 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb) newtlink = kzalloc(sizeof(*tlink), GFP_KERNEL); if (newtlink == NULL) return ERR_PTR(-ENOMEM); - newtlink->tl_index = fsuid; + newtlink->tl_uid = fsuid; newtlink->tl_tcon = ERR_PTR(-EACCES); set_bit(TCON_LINK_PENDING, &newtlink->tl_flags); set_bit(TCON_LINK_IN_TREE, &newtlink->tl_flags); cifs_get_tlink(newtlink); - ret = radix_tree_preload(GFP_KERNEL); - if (ret != 0) { - kfree(newtlink); - return ERR_PTR(ret); - } - spin_lock(&cifs_sb->tlink_tree_lock); /* was one inserted after previous search? */ - tlink = radix_tree_lookup(&cifs_sb->tlink_tree, fsuid); + tlink = tlink_rb_search(&cifs_sb->tlink_tree, fsuid); if (tlink) { cifs_get_tlink(tlink); spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); kfree(newtlink); goto wait_for_construction; } - ret = radix_tree_insert(&cifs_sb->tlink_tree, fsuid, newtlink); - spin_unlock(&cifs_sb->tlink_tree_lock); - radix_tree_preload_end(); - if (ret) { - kfree(newtlink); - return ERR_PTR(ret); - } tlink = newtlink; + tlink_rb_insert(&cifs_sb->tlink_tree, tlink); + spin_unlock(&cifs_sb->tlink_tree_lock); } else { wait_for_construction: ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, @@ -3413,39 +3433,39 @@ cifs_prune_tlinks(struct work_struct *work) { struct cifs_sb_info *cifs_sb = container_of(work, struct cifs_sb_info, prune_tlinks.work); - struct tcon_link *tlink[8]; - unsigned long now = jiffies; - unsigned long index = 0; - int i, ret; + struct rb_root *root = &cifs_sb->tlink_tree; + struct rb_node *node = rb_first(root); + struct rb_node *tmp; + struct tcon_link *tlink; - do { - spin_lock(&cifs_sb->tlink_tree_lock); - ret = radix_tree_gang_lookup(&cifs_sb->tlink_tree, - (void **)tlink, index, - ARRAY_SIZE(tlink)); - /* increment index for next pass */ - if (ret > 0) - index = tlink[ret - 1]->tl_index + 1; - for (i = 0; i < ret; i++) { - if (test_bit(TCON_LINK_MASTER, &tlink[i]->tl_flags) || - atomic_read(&tlink[i]->tl_count) != 0 || - time_after(tlink[i]->tl_time + TLINK_IDLE_EXPIRE, - now)) { - tlink[i] = NULL; - continue; - } - cifs_get_tlink(tlink[i]); - clear_bit(TCON_LINK_IN_TREE, &tlink[i]->tl_flags); - radix_tree_delete(&cifs_sb->tlink_tree, - tlink[i]->tl_index); - } - spin_unlock(&cifs_sb->tlink_tree_lock); + /* + * Because we drop the spinlock in the loop in order to put the tlink + * it's not guarded against removal of links from the tree. The only + * places that remove entries from the tree are this function and + * umounts. Because this function is non-reentrant and is canceled + * before umount can proceed, this is safe. + */ + spin_lock(&cifs_sb->tlink_tree_lock); + node = rb_first(root); + while (node != NULL) { + tmp = node; + node = rb_next(tmp); + tlink = rb_entry(tmp, struct tcon_link, tl_rbnode); + + if (test_bit(TCON_LINK_MASTER, &tlink->tl_flags) || + atomic_read(&tlink->tl_count) != 0 || + time_after(tlink->tl_time + TLINK_IDLE_EXPIRE, jiffies)) + continue; - for (i = 0; i < ret; i++) { - if (tlink[i] != NULL) - cifs_put_tlink(tlink[i]); - } - } while (ret != 0); + cifs_get_tlink(tlink); + clear_bit(TCON_LINK_IN_TREE, &tlink->tl_flags); + rb_erase(tmp, root); + + spin_unlock(&cifs_sb->tlink_tree_lock); + cifs_put_tlink(tlink); + spin_lock(&cifs_sb->tlink_tree_lock); + } + spin_unlock(&cifs_sb->tlink_tree_lock); queue_delayed_work(system_nrt_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 0eb8702..548f062 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -66,7 +66,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) /* Search for server name delimiter */ sep = memchr(hostname, '\\', len); if (sep) - len = sep - unc; + len = sep - hostname; else cFYI(1, "%s: probably server name is whole unc: %s", __func__, unc); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ae82159..5a28660 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -146,12 +146,7 @@ client_can_cache: rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, NULL); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); return rc; } @@ -253,12 +248,7 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); spin_unlock(&cifs_file_list_lock); - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock inode %p", inode); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + cifs_set_oplock_level(pCifsInode, oplock); file->private_data = pCifsFile; return pCifsFile; @@ -271,8 +261,9 @@ cifs_new_fileinfo(__u16 fileHandle, struct file *file, */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { + struct inode *inode = cifs_file->dentry->d_inode; struct cifsTconInfo *tcon = tlink_tcon(cifs_file->tlink); - struct cifsInodeInfo *cifsi = CIFS_I(cifs_file->dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifsLockInfo *li, *tmp; spin_lock(&cifs_file_list_lock); @@ -288,8 +279,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cFYI(1, "closing last open instance for inode %p", cifs_file->dentry->d_inode); - cifsi->clientCanCacheRead = false; - cifsi->clientCanCacheAll = false; + cifs_set_oplock_level(cifsi, 0); } spin_unlock(&cifs_file_list_lock); @@ -607,8 +597,6 @@ reopen_success: rc = filemap_write_and_wait(inode->i_mapping); mapping_set_error(inode->i_mapping, rc); - pCifsInode->clientCanCacheAll = false; - pCifsInode->clientCanCacheRead = false; if (tcon->unix_ext) rc = cifs_get_inode_info_unix(&inode, full_path, inode->i_sb, xid); @@ -622,18 +610,9 @@ reopen_success: invalidate the current end of file on the server we can not go to the server to get the new inod info */ - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, "Exclusive Oplock granted on inode %p", - pCifsFile->dentry->d_inode); - } else if ((oplock & 0xF) == OPLOCK_READ) { - pCifsInode->clientCanCacheRead = true; - pCifsInode->clientCanCacheAll = false; - } else { - pCifsInode->clientCanCacheRead = false; - pCifsInode->clientCanCacheAll = false; - } + + cifs_set_oplock_level(pCifsInode, oplock); + cifs_relock_file(pCifsFile); reopen_error_exit: @@ -775,12 +754,6 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); tcon = tlink_tcon(((struct cifsFileInfo *)file->private_data)->tlink); - - if (file->private_data == NULL) { - rc = -EBADF; - FreeXid(xid); - return rc; - } netfid = ((struct cifsFileInfo *)file->private_data)->netfid; if ((tcon->ses->capabilities & CAP_UNIX) && @@ -956,6 +929,7 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t *poffset) { + struct inode *inode = file->f_path.dentry->d_inode; int rc = 0; unsigned int bytes_written = 0; unsigned int total_written; @@ -963,7 +937,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, struct cifsTconInfo *pTcon; int xid, long_op; struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsi = CIFS_I(file->f_path.dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(inode); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); @@ -1029,21 +1003,17 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, cifs_stats_bytes_written(pTcon, total_written); - /* since the write may have blocked check these pointers again */ - if ((file->f_path.dentry) && (file->f_path.dentry->d_inode)) { - struct inode *inode = file->f_path.dentry->d_inode; /* Do not update local mtime - server will set its actual value on write - * inode->i_ctime = inode->i_mtime = - * current_fs_time(inode->i_sb);*/ - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > file->f_path.dentry->d_inode->i_size) - i_size_write(file->f_path.dentry->d_inode, - *poffset); - spin_unlock(&inode->i_lock); - } - mark_inode_dirty_sync(file->f_path.dentry->d_inode); + * inode->i_ctime = inode->i_mtime = + * current_fs_time(inode->i_sb);*/ + if (total_written > 0) { + spin_lock(&inode->i_lock); + if (*poffset > inode->i_size) + i_size_write(inode, *poffset); + spin_unlock(&inode->i_lock); } + mark_inode_dirty_sync(inode); + FreeXid(xid); return total_written; } @@ -1138,7 +1108,6 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file, return total_written; } -#ifdef CONFIG_CIFS_EXPERIMENTAL struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { @@ -1172,13 +1141,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, spin_unlock(&cifs_file_list_lock); return NULL; } -#endif struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only) { struct cifsFileInfo *open_file; - struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + struct cifs_sb_info *cifs_sb; bool any_available = false; int rc; @@ -1192,6 +1160,8 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode, return NULL; } + cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); + /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; @@ -2299,8 +2269,10 @@ void cifs_oplock_break_get(struct cifsFileInfo *cfile) void cifs_oplock_break_put(struct cifsFileInfo *cfile) { + struct super_block *sb = cfile->dentry->d_sb; + cifsFileInfo_put(cfile); - cifs_sb_deactive(cfile->dentry->d_sb); + cifs_sb_deactive(sb); } const struct address_space_operations cifs_addr_ops = { diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index a2ad94e..297a43d 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -2,7 +2,7 @@ * fs/cifs/fscache.c - CIFS filesystem cache interface * * Copyright (c) 2010 Novell, Inc. - * Author(s): Suresh Jayaraman (sjayaraman@suse.de> + * Author(s): Suresh Jayaraman <sjayaraman@suse.de> * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -67,10 +67,12 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode) if (cifsi->fscache) return; - cifsi->fscache = fscache_acquire_cookie(tcon->fscache, + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) { + cifsi->fscache = fscache_acquire_cookie(tcon->fscache, &cifs_fscache_inode_object_def, cifsi); - cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, + cFYI(1, "CIFS: got FH cookie (0x%p/0x%p)", tcon->fscache, cifsi->fscache); + } } void cifs_fscache_release_inode_cookie(struct inode *inode) @@ -101,10 +103,8 @@ void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) { if ((filp->f_flags & O_ACCMODE) != O_RDONLY) cifs_fscache_disable_inode_cookie(inode); - else { + else cifs_fscache_enable_inode_cookie(inode); - cFYI(1, "CIFS: fscache inode cookie set"); - } } void cifs_fscache_reset_inode_cookie(struct inode *inode) diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 39869c3..589f3e3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -686,13 +686,18 @@ int cifs_get_inode_info(struct inode **pinode, cFYI(1, "cifs_sfu_type failed: %d", tmprc); } -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_ACL /* fill in 0777 bits from ACL */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - cFYI(1, "Getting mode bits from ACL"); - cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, pfid); + rc = cifs_acl_to_fattr(cifs_sb, &fattr, *pinode, full_path, + pfid); + if (rc) { + cFYI(1, "%s: Getting ACL failed with error: %d", + __func__, rc); + goto cgii_exit; + } } -#endif +#endif /* CONFIG_CIFS_ACL */ /* fill in remaining high mode bits e.g. SUID, VTX */ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) @@ -723,12 +728,12 @@ static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; -char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) +char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, + struct cifsTconInfo *tcon) { int pplen = cifs_sb->prepathlen; int dfsplen; char *full_path = NULL; - struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); /* if no prefix path, simply set path to the root of share to "" */ if (pplen == 0) { @@ -870,7 +875,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) char *full_path; struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); - full_path = cifs_build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb, tcon); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -881,8 +886,10 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); - if (!inode) - return ERR_PTR(rc); + if (!inode) { + inode = ERR_PTR(rc); + goto out; + } #ifdef CONFIG_CIFS_FSCACHE /* populate tcon->resource_id */ @@ -898,13 +905,11 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) inode->i_uid = cifs_sb->mnt_uid; inode->i_gid = cifs_sb->mnt_gid; } else if (rc) { - kfree(full_path); - _FreeXid(xid); iget_failed(inode); - return ERR_PTR(rc); + inode = ERR_PTR(rc); } - +out: kfree(full_path); /* can not call macro FreeXid here since in a void func * TODO: This is no longer true @@ -1648,6 +1653,7 @@ static bool cifs_inode_needs_reval(struct inode *inode) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); if (cifs_i->clientCanCacheRead) return false; @@ -1658,19 +1664,21 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; - /* FIXME: the actimeo should be tunable */ - if (time_after_eq(jiffies, cifs_i->time + HZ)) + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->actimeo)) return true; /* hardlinked files w/ noserverino get "special" treatment */ - if (!(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && S_ISREG(inode->i_mode) && inode->i_nlink != 1) return true; return false; } -/* check invalid_mapping flag and zap the cache if it's set */ +/* + * Zap the cache. Called when invalid_mapping flag is set. + */ static void cifs_invalidate_mapping(struct inode *inode) { @@ -2114,11 +2122,16 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (attrs->ia_valid & ATTR_MODE) { rc = 0; -#ifdef CONFIG_CIFS_EXPERIMENTAL - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) - rc = mode_to_acl(inode, full_path, mode); - else -#endif +#ifdef CONFIG_CIFS_ACL + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { + rc = mode_to_cifs_acl(inode, full_path, mode); + if (rc) { + cFYI(1, "%s: Setting ACL failed with error: %d", + __func__, rc); + goto cifs_setattr_exit; + } + } else +#endif /* CONFIG_CIFS_ACL */ if (((mode & S_IWUGO) == 0) && (cifsInode->cifsAttrs & ATTR_READONLY) == 0) { @@ -2177,7 +2190,6 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) setattr_copy(inode, attrs); mark_inode_dirty(inode); - return 0; cifs_setattr_exit: kfree(full_path); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 077bf75..0c98672 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -38,10 +38,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) struct cifs_sb_info *cifs_sb; #ifdef CONFIG_CIFS_POSIX struct cifsFileInfo *pSMBFile = filep->private_data; - struct cifsTconInfo *tcon = tlink_tcon(pSMBFile->tlink); + struct cifsTconInfo *tcon; __u64 ExtAttrBits = 0; __u64 ExtAttrMask = 0; - __u64 caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + __u64 caps; #endif /* CONFIG_CIFS_POSIX */ xid = GetXid(); @@ -62,9 +62,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; #ifdef CONFIG_CIFS_POSIX case FS_IOC_GETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { - if (pSMBFile == NULL) - break; rc = CIFSGetExtAttr(xid, tcon, pSMBFile->netfid, &ExtAttrBits, &ExtAttrMask); if (rc == 0) @@ -75,13 +77,15 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) break; case FS_IOC_SETFLAGS: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + caps = le64_to_cpu(tcon->fsUnixInfo.Capability); if (CIFS_UNIX_EXTATTR_CAP & caps) { if (get_user(ExtAttrBits, (int __user *)arg)) { rc = -EFAULT; break; } - if (pSMBFile == NULL) - break; /* rc= CIFSGetExtAttr(xid,tcon,pSMBFile->netfid, extAttrBits, &ExtAttrMask);*/ } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c4e296f..43f1028 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -569,10 +569,9 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) cFYI(1, "file id match, oplock break"); pCifsInode = CIFS_I(netfile->dentry->d_inode); - pCifsInode->clientCanCacheAll = false; - if (pSMB->OplockLevel == 0) - pCifsInode->clientCanCacheRead = false; + cifs_set_oplock_level(pCifsInode, + pSMB->OplockLevel); /* * cifs_oplock_break_put() can't be called * from here. Get reference after queueing @@ -722,3 +721,23 @@ cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb) cifs_sb_master_tcon(cifs_sb)->treeName); } } + +void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock) +{ + oplock &= 0xF; + + if (oplock == OPLOCK_EXCLUSIVE) { + cinode->clientCanCacheAll = true; + cinode->clientCanCacheRead = true; + cFYI(1, "Exclusive Oplock granted on inode %p", + &cinode->vfs_inode); + } else if (oplock == OPLOCK_READ) { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = true; + cFYI(1, "Level II Oplock granted on inode %p", + &cinode->vfs_inode); + } else { + cinode->clientCanCacheAll = false; + cinode->clientCanCacheRead = false; + } +} diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index ef7bb7b..a73eb9f 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -226,26 +226,29 @@ static int initiate_cifs_search(const int xid, struct file *file) char *full_path = NULL; struct cifsFileInfo *cifsFile; struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - struct tcon_link *tlink; + struct tcon_link *tlink = NULL; struct cifsTconInfo *pTcon; - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) - return PTR_ERR(tlink); - pTcon = tlink_tcon(tlink); - - if (file->private_data == NULL) - file->private_data = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); if (file->private_data == NULL) { - rc = -ENOMEM; - goto error_exit; + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) + return PTR_ERR(tlink); + + cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + if (cifsFile == NULL) { + rc = -ENOMEM; + goto error_exit; + } + file->private_data = cifsFile; + cifsFile->tlink = cifs_get_tlink(tlink); + pTcon = tlink_tcon(tlink); + } else { + cifsFile = file->private_data; + pTcon = tlink_tcon(cifsFile->tlink); } - cifsFile = file->private_data; cifsFile->invalidHandle = true; cifsFile->srch_inf.endOfSearch = false; - cifsFile->tlink = cifs_get_tlink(tlink); full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { @@ -756,18 +759,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, filldir_t filldir, rc = filldir(direntry, qstring.name, qstring.len, file->f_pos, ino, fattr.cf_dtype); - /* - * we can not return filldir errors to the caller since they are - * "normal" when the stat blocksize is too small - we return remapped - * error instead - * - * FIXME: This looks bogus. filldir returns -EOVERFLOW in the above - * case already. Why should we be clobbering other errors from it? - */ - if (rc) { - cFYI(1, "filldir rc = %d", rc); - rc = -EOVERFLOW; - } dput(tmp_dentry); return rc; } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index a264b74..eae2a14 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -30,10 +30,11 @@ #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_USER_PREFIX "user." #define CIFS_XATTR_SYSTEM_PREFIX "system." #define CIFS_XATTR_OS2_PREFIX "os2." -#define CIFS_XATTR_SECURITY_PREFIX ".security" +#define CIFS_XATTR_SECURITY_PREFIX "security." #define CIFS_XATTR_TRUSTED_PREFIX "trusted." #define XATTR_TRUSTED_PREFIX_LEN 8 #define XATTR_SECURITY_PREFIX_LEN 9 @@ -277,29 +278,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); -#ifdef CONFIG_CIFS_EXPERIMENTAL - else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) { - __u16 fid; - int oplock = 0; - struct cifs_ntsd *pacl = NULL; - __u32 buflen = 0; - if (experimEnabled) - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, GENERIC_READ, 0, &fid, - &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - /* else rc is EOPNOTSUPP from above */ - - if (rc == 0) { - rc = CIFSSMBGetCIFSACL(xid, pTcon, fid, &pacl, - &buflen); - CIFSSMBClose(xid, pTcon, fid); - } - } -#endif /* EXPERIMENTAL */ #else - cFYI(1, "query POSIX ACL not supported yet"); + cFYI(1, "Query POSIX ACL not supported yet"); #endif /* CONFIG_CIFS_POSIX */ } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { @@ -311,8 +291,33 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); #else - cFYI(1, "query POSIX default ACL not supported yet"); -#endif + cFYI(1, "Query POSIX default ACL not supported yet"); +#endif /* CONFIG_CIFS_POSIX */ + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { +#ifdef CONFIG_CIFS_ACL + u32 acllen; + struct cifs_ntsd *pacl; + + pacl = get_cifs_acl(cifs_sb, direntry->d_inode, + full_path, &acllen); + if (IS_ERR(pacl)) { + rc = PTR_ERR(pacl); + cERROR(1, "%s: error %zd getting sec desc", + __func__, rc); + } else { + if (ea_value) { + if (acllen > buf_size) + acllen = -ERANGE; + else + memcpy(ea_value, pacl, acllen); + } + rc = acllen; + kfree(pacl); + } +#else + cFYI(1, "Query CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ } else if (strncmp(ea_name, CIFS_XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) { cFYI(1, "Trusted xattr namespace not supported yet"); diff --git a/fs/compat.c b/fs/compat.c index c580c32..eb1740a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1350,6 +1350,10 @@ static int compat_count(compat_uptr_t __user *argv, int max) argv++; if (i++ >= max) return -E2BIG; + + if (fatal_signal_pending(current)) + return -ERESTARTNOHAND; + cond_resched(); } } return i; @@ -1391,6 +1395,12 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, while (len > 0) { int offset, bytes_to_copy; + if (fatal_signal_pending(current)) { + ret = -ERESTARTNOHAND; + goto out; + } + cond_resched(); + offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; @@ -1407,18 +1417,8 @@ static int compat_copy_strings(int argc, compat_uptr_t __user *argv, if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; -#ifdef CONFIG_STACK_GROWSUP - ret = expand_stack_downwards(bprm->vma, pos); - if (ret < 0) { - /* We've exceed the stack rlimit. */ - ret = -E2BIG; - goto out; - } -#endif - ret = get_user_pages(current, bprm->mm, pos, - 1, 1, 1, &page, NULL); - if (ret <= 0) { - /* We've exceed the stack rlimit. */ + page = get_arg_page(bprm, pos, 1); + if (!page) { ret = -E2BIG; goto out; } @@ -1539,8 +1539,10 @@ int compat_do_execve(char * filename, return retval; out: - if (bprm->mm) + if (bprm->mm) { + acct_arg_size(bprm, 0); mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 410ed18..a60579b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,7 +19,6 @@ #include <linux/compiler.h> #include <linux/sched.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/ioctl.h> #include <linux/if.h> #include <linux/if_bridge.h> diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 2537323..2720178 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -28,7 +28,6 @@ #include <linux/key.h> #include <linux/slab.h> #include <linux/seq_file.h> -#include <linux/smp_lock.h> #include <linux/file.h> #include <linux/crypto.h> #include "ecryptfs_kernel.h" @@ -164,7 +164,26 @@ out: #ifdef CONFIG_MMU -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ + struct mm_struct *mm = current->mm; + long diff = (long)(pages - bprm->vma_pages); + + if (!mm || !diff) + return; + + bprm->vma_pages = pages; + +#ifdef SPLIT_RSS_COUNTING + add_mm_counter(mm, MM_ANONPAGES, diff); +#else + spin_lock(&mm->page_table_lock); + add_mm_counter(mm, MM_ANONPAGES, diff); + spin_unlock(&mm->page_table_lock); +#endif +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -186,6 +205,8 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start; struct rlimit *rlim; + acct_arg_size(bprm, size / PAGE_SIZE); + /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks @@ -254,6 +275,11 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); + + err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (err) + goto err; + err = insert_vm_struct(mm, vma); if (err) goto err; @@ -276,7 +302,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +{ +} + +struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -1003,6 +1033,7 @@ int flush_old_exec(struct linux_binprm * bprm) /* * Release all of the old mmap stuff */ + acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; @@ -1426,8 +1457,10 @@ int do_execve(const char * filename, return retval; out: - if (bprm->mm) - mmput (bprm->mm); + if (bprm->mm) { + acct_arg_size(bprm, 0); + mmput(bprm->mm); + } out_file: if (bprm->file) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2fedaf8..acf8695 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -27,7 +27,6 @@ #include <linux/init.h> #include <linux/blkdev.h> #include <linux/parser.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/vfs.h> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8b5dd63..94ce3d7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -177,7 +177,7 @@ struct mpage_da_data { struct ext4_io_page { struct page *p_page; - int p_count; + atomic_t p_count; }; #define MAX_IO_PAGES 128 @@ -858,6 +858,7 @@ struct ext4_inode_info { spinlock_t i_completed_io_lock; /* current io_end structure for async DIO write*/ ext4_io_end_t *cur_aio_dio; + atomic_t i_ioend_count; /* Number of outstanding io_end structs */ /* * Transactions that contain inode's metadata needed to complete @@ -909,6 +910,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -2060,6 +2062,7 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); +extern void ext4_ioend_wait(struct inode *); extern void ext4_free_io_end(ext4_io_end_t *io); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern int ext4_end_io_nolock(ext4_io_end_t *io); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4d78342..e659597 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -2123,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, */ if (unlikely(journal_data && PageChecked(page))) err = __ext4_journalled_writepage(page, len); - else + else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) err = ext4_bio_write_page(&io_submit, page, len, mpd->wbc); + else + err = block_write_full_page(page, + noalloc_get_block_write, mpd->wbc); if (!err) mpd->pages_written++; @@ -5647,6 +5652,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae88..eb3bc2f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,6 +331,30 @@ mext_out: return err; } + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct fstrim_range range; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct fstrim_range *)arg, + sizeof(range))) + return -EFAULT; + + ret = ext4_trim_fs(sb, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c58eba3..5b4d4e3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4640,8 +4640,6 @@ do_more: * with group lock held. generate_buddy look at * them with group lock_held */ - if (test_opt(sb, DISCARD)) - ext4_issue_discard(sb, block_group, bit, count); ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 92203b8..dc40e75 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -872,7 +872,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; if ((namelen <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '0')) { + (name[1] == '.' || name[1] == '\0')) { /* * "." or ".." will only be in the first block * NFS may look up ".."; "." should be handled by the VFS diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 46a7d6a..beacce1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -32,8 +32,14 @@ static struct kmem_cache *io_page_cachep, *io_end_cachep; +#define WQ_HASH_SZ 37 +#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ]) +static wait_queue_head_t ioend_wq[WQ_HASH_SZ]; + int __init ext4_init_pageio(void) { + int i; + io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); if (io_page_cachep == NULL) return -ENOMEM; @@ -42,6 +48,8 @@ int __init ext4_init_pageio(void) kmem_cache_destroy(io_page_cachep); return -ENOMEM; } + for (i = 0; i < WQ_HASH_SZ; i++) + init_waitqueue_head(&ioend_wq[i]); return 0; } @@ -52,24 +60,37 @@ void ext4_exit_pageio(void) kmem_cache_destroy(io_page_cachep); } +void ext4_ioend_wait(struct inode *inode) +{ + wait_queue_head_t *wq = to_ioend_wq(inode); + + wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); +} + +static void put_io_page(struct ext4_io_page *io_page) +{ + if (atomic_dec_and_test(&io_page->p_count)) { + end_page_writeback(io_page->p_page); + put_page(io_page->p_page); + kmem_cache_free(io_page_cachep, io_page); + } +} + void ext4_free_io_end(ext4_io_end_t *io) { int i; + wait_queue_head_t *wq; BUG_ON(!io); if (io->page) put_page(io->page); - for (i = 0; i < io->num_io_pages; i++) { - if (--io->pages[i]->p_count == 0) { - struct page *page = io->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io->pages[i]); - } - } + for (i = 0; i < io->num_io_pages; i++) + put_io_page(io->pages[i]); io->num_io_pages = 0; - iput(io->inode); + wq = to_ioend_wq(io->inode); + if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && + waitqueue_active(wq)) + wake_up_all(wq); kmem_cache_free(io_end_cachep, io); } @@ -142,8 +163,8 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) io = kmem_cache_alloc(io_end_cachep, flags); if (io) { memset(io, 0, sizeof(*io)); - io->inode = igrab(inode); - BUG_ON(!io->inode); + atomic_inc(&EXT4_I(inode)->i_ioend_count); + io->inode = inode; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -171,35 +192,15 @@ static void ext4_end_bio(struct bio *bio, int error) struct workqueue_struct *wq; struct inode *inode; unsigned long flags; - ext4_fsblk_t err_block; int i; BUG_ON(!io_end); - inode = io_end->inode; bio->bi_private = NULL; bio->bi_end_io = NULL; if (test_bit(BIO_UPTODATE, &bio->bi_flags)) error = 0; - err_block = bio->bi_sector >> (inode->i_blkbits - 9); bio_put(bio); - if (!(inode->i_sb->s_flags & MS_ACTIVE)) { - pr_err("sb umounted, discard end_io request for inode %lu\n", - io_end->inode->i_ino); - ext4_free_io_end(io_end); - return; - } - - if (error) { - io_end->flag |= EXT4_IO_END_ERROR; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " - "(offset %llu size %ld starting block %llu)", - inode->i_ino, - (unsigned long long) io_end->offset, - (long) io_end->size, - (unsigned long long) err_block); - } - for (i = 0; i < io_end->num_io_pages; i++) { struct page *page = io_end->pages[i]->p_page; struct buffer_head *bh, *head; @@ -236,14 +237,6 @@ static void ext4_end_bio(struct bio *bio, int error) } while (bh != head); } - if (--io_end->pages[i]->p_count == 0) { - struct page *page = io_end->pages[i]->p_page; - - end_page_writeback(page); - put_page(page); - kmem_cache_free(io_page_cachep, io_end->pages[i]); - } - /* * If this is a partial write which happened to make * all buffers uptodate then we can optimize away a @@ -253,9 +246,22 @@ static void ext4_end_bio(struct bio *bio, int error) */ if (!partial_write) SetPageUptodate(page); - } + put_io_page(io_end->pages[i]); + } io_end->num_io_pages = 0; + inode = io_end->inode; + + if (error) { + io_end->flag |= EXT4_IO_END_ERROR; + ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + "(offset %llu size %ld starting block %llu)", + inode->i_ino, + (unsigned long long) io_end->offset, + (long) io_end->size, + (unsigned long long) + bio->bi_sector >> (inode->i_blkbits - 9)); + } /* Add the io_end to per-inode completed io list*/ spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); @@ -305,7 +311,6 @@ static int io_submit_init(struct ext4_io_submit *io, bio->bi_private = io->io_end = io_end; bio->bi_end_io = ext4_end_bio; - io_end->inode = inode; io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); io->io_bio = bio; @@ -360,7 +365,7 @@ submit_and_retry: if ((io_end->num_io_pages == 0) || (io_end->pages[io_end->num_io_pages-1] != io_page)) { io_end->pages[io_end->num_io_pages++] = io_page; - io_page->p_count++; + atomic_inc(&io_page->p_count); } return 0; } @@ -389,7 +394,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, return -ENOMEM; } io_page->p_page = page; - io_page->p_count = 0; + atomic_set(&io_page->p_count, 1); get_page(page); for (bh = head = page_buffers(page), block_start = 0; @@ -421,10 +426,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, * PageWriteback bit from the page to prevent the system from * wedging later on. */ - if (io_page->p_count == 0) { - put_page(page); - end_page_writeback(page); - kmem_cache_free(io_page_cachep, io_page); - } + put_io_page(io_page); return ret; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index dc96392..981c847 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -232,6 +232,8 @@ static int setup_new_group_blocks(struct super_block *sb, GFP_NOFS); if (err) goto exit_bh; + for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) + ext4_set_bit(bit, bh->b_data); ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, input->block_bitmap - start); @@ -247,6 +249,9 @@ static int setup_new_group_blocks(struct super_block *sb, err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); if (err) goto exit_bh; + for (i = 0, bit = input->inode_table - start; + i < sbi->s_itb_per_group; i++, bit++) + ext4_set_bit(bit, bh->b_data); if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 40131b7..fb15c9c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -828,12 +828,22 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; ei->i_datasync_tid = 0; + atomic_set(&ei->i_ioend_count, 0); return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { + ext4_ioend_wait(inode); if (!list_empty(&(EXT4_I(inode)->i_orphan))) { ext4_msg(inode->i_sb, KERN_ERR, "Inode %lu (%p): orphan list check failed!", @@ -1016,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); + if (test_opt(sb, MBLK_IO_SUBMIT)) + seq_puts(seq, ",mblk_io_submit"); if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -1173,6 +1185,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1186,7 +1199,6 @@ static const struct super_operations ext4_sops = { .quota_write = ext4_quota_write, #endif .bdev_try_to_free_page = bdev_try_to_free_page, - .trim_fs = ext4_trim_fs }; static const struct super_operations ext4_nojournal_sops = { @@ -1194,6 +1206,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, @@ -1228,8 +1241,8 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, - Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_block_validity, Opt_noblock_validity, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, @@ -1293,6 +1306,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_mblk_io_submit, "mblk_io_submit"}, + {Opt_nomblk_io_submit, "nomblk_io_submit"}, {Opt_block_validity, "block_validity"}, {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, @@ -1714,6 +1729,12 @@ set_qf_format: case Opt_nodelalloc: clear_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_mblk_io_submit: + set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + break; + case Opt_nomblk_io_submit: + clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); + break; case Opt_stripe: if (match_int(&args[0], &option)) return 0; @@ -2699,7 +2720,6 @@ static int ext4_lazyinit_thread(void *arg) struct ext4_li_request *elr; unsigned long next_wakeup; DEFINE_WAIT(wait); - int ret; BUG_ON(NULL == eli); @@ -2723,13 +2743,12 @@ cont_thread: elr = list_entry(pos, struct ext4_li_request, lr_request); - if (time_after_eq(jiffies, elr->lr_next_sched)) - ret = ext4_run_li_request(elr); - - if (ret) { - ret = 0; - ext4_remove_li_request(elr); - continue; + if (time_after_eq(jiffies, elr->lr_next_sched)) { + if (ext4_run_li_request(elr) != 0) { + /* error, remove the lazy_init job */ + ext4_remove_li_request(elr); + continue; + } } if (time_before(elr->lr_next_sched, next_wakeup)) @@ -2740,7 +2759,8 @@ cont_thread: if (freezing(current)) refrigerator(); - if (time_after_eq(jiffies, next_wakeup)) { + if ((time_after_eq(jiffies, next_wakeup)) || + (MAX_JIFFY_OFFSET == next_wakeup)) { cond_resched(); continue; } @@ -2788,9 +2808,6 @@ static void ext4_clear_request_list(void) struct ext4_li_request *elr; mutex_lock(&ext4_li_info->li_list_mtx); - if (list_empty(&ext4_li_info->li_request_list)) - return; - list_for_each_safe(pos, n, &ext4_li_info->li_request_list) { elr = list_entry(pos, struct ext4_li_request, lr_request); @@ -3257,13 +3274,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * Test whether we have more sectors than will fit in sector_t, * and whether the max offset is addressable by the page cache. */ - ret = generic_check_addressable(sb->s_blocksize_bits, + err = generic_check_addressable(sb->s_blocksize_bits, ext4_blocks_count(es)); - if (ret) { + if (err) { ext4_msg(sb, KERN_ERR, "filesystem" " too large to mount safely on this system"); if (sizeof(sector_t) < 8) ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); + ret = err; goto failed_mount; } @@ -3348,6 +3366,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); + err = percpu_counter_init(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + if (!err) { + err = percpu_counter_init(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); + goto failed_mount3; + } + sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_max_writeback_mb_bump = 128; @@ -3446,22 +3482,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); -no_journal: - err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext4_count_free_blocks(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext4_count_free_inodes(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); - goto failed_mount_wq; - } + /* + * The journal may have updated the bg summary counts, so we + * need to update the global counters. + */ + percpu_counter_set(&sbi->s_freeblocks_counter, + ext4_count_free_blocks(sb)); + percpu_counter_set(&sbi->s_freeinodes_counter, + ext4_count_free_inodes(sb)); + percpu_counter_set(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); +no_journal: EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ -3611,10 +3644,6 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } - percpu_counter_destroy(&sbi->s_freeblocks_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount3: if (sbi->s_flex_groups) { if (is_vmalloc_addr(sbi->s_flex_groups)) @@ -3622,6 +3651,10 @@ failed_mount3: else kfree(sbi->s_flex_groups); } + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); @@ -3949,13 +3982,11 @@ static int ext4_commit_super(struct super_block *sb, int sync) else es->s_kbytes_written = cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeblocks_counter)) - ext4_free_blocks_count_set(es, percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeblocks_counter)); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) - es->s_free_inodes_count = - cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = + cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); @@ -4556,12 +4587,10 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, static int ext4_quota_off(struct super_block *sb, int type) { - /* Force all delayed allocation blocks to be allocated */ - if (test_opt(sb, DELALLOC)) { - down_read(&sb->s_umount); + /* Force all delayed allocation blocks to be allocated. + * Caller already holds s_umount sem */ + if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - up_read(&sb->s_umount); - } return dquot_quota_off(sb, type); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c822458..8b984a2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -13,6 +13,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/compat.h> static const struct file_operations fuse_direct_io_file_operations; @@ -134,6 +135,7 @@ EXPORT_SYMBOL_GPL(fuse_do_open); void fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); if (ff->open_flags & FOPEN_DIRECT_IO) file->f_op = &fuse_direct_io_file_operations; @@ -141,6 +143,15 @@ void fuse_finish_open(struct inode *inode, struct file *file) invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + i_size_write(inode, 0); + spin_unlock(&fc->lock); + fuse_invalidate_attr(inode); + } } int fuse_open_common(struct inode *inode, struct file *file, bool isdir) @@ -1618,6 +1629,58 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, } /* + * CUSE servers compiled on 32bit broke on 64bit kernels because the + * ABI was defined to be 'struct iovec' which is different on 32bit + * and 64bit. Fortunately we can determine which structure the server + * used from the size of the reply. + */ +static int fuse_copy_ioctl_iovec(struct iovec *dst, void *src, + size_t transferred, unsigned count, + bool is_compat) +{ +#ifdef CONFIG_COMPAT + if (count * sizeof(struct compat_iovec) == transferred) { + struct compat_iovec *ciov = src; + unsigned i; + + /* + * With this interface a 32bit server cannot support + * non-compat (i.e. ones coming from 64bit apps) ioctl + * requests + */ + if (!is_compat) + return -EINVAL; + + for (i = 0; i < count; i++) { + dst[i].iov_base = compat_ptr(ciov[i].iov_base); + dst[i].iov_len = ciov[i].iov_len; + } + return 0; + } +#endif + + if (count * sizeof(struct iovec) != transferred) + return -EIO; + + memcpy(dst, src, transferred); + return 0; +} + +/* Make sure iov_length() won't overflow */ +static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count) +{ + size_t n; + u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT; + + for (n = 0; n < count; n++) { + if (iov->iov_len > (size_t) max) + return -ENOMEM; + max -= iov->iov_len; + } + return 0; +} + +/* * For ioctls, there is no generic way to determine how much memory * needs to be read and/or written. Furthermore, ioctls are allowed * to dereference the passed pointer, so the parameter requires deep @@ -1798,18 +1861,25 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - err = -EIO; - if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) - goto out; - - /* okay, copy in iovs and retry */ vaddr = kmap_atomic(pages[0], KM_USER0); - memcpy(page_address(iov_page), vaddr, transferred); + err = fuse_copy_ioctl_iovec(page_address(iov_page), vaddr, + transferred, in_iovs + out_iovs, + (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr, KM_USER0); + if (err) + goto out; in_iov = page_address(iov_page); out_iov = in_iov + in_iovs; + err = fuse_verify_ioctl_iov(in_iov, in_iovs); + if (err) + goto out; + + err = fuse_verify_ioctl_iov(out_iov, out_iovs); + if (err) + goto out; + goto retry; } diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 06d5827..5ab3839 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -138,10 +138,8 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; - int error; inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { @@ -152,52 +150,16 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, goto out_inode; } - error = gfs2_glock_nq_num(sdp, inum->no_addr, &gfs2_inode_glops, - LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return ERR_PTR(error); - - error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); - if (error) - goto fail; - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto fail; - } - - error = gfs2_inode_refresh(GFS2_I(inode)); - if (error) { - iput(inode); - goto fail; - } - - /* Pick up the works we bypass in gfs2_inode_lookup */ - if (inode->i_state & I_NEW) - gfs2_set_iop(inode); - - if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { - iput(inode); - goto fail; - } - - error = -EIO; - if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { - iput(inode); - goto fail; - } - - gfs2_glock_dq_uninit(&i_gh); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, &inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); out_inode: dentry = d_obtain_alias(inode); if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; -fail: - gfs2_glock_dq_uninit(&i_gh); - return ERR_PTR(error); } static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 8777885..f92c177 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -686,21 +686,20 @@ static void delete_work_func(struct work_struct *work) { struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = NULL; + struct gfs2_inode *ip; struct inode *inode; - u64 no_addr = 0; + u64 no_addr = gl->gl_name.ln_number; + + ip = gl->gl_object; + /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ - spin_lock(&gl->gl_spin); - ip = (struct gfs2_inode *)gl->gl_object; if (ip) - no_addr = ip->i_no_addr; - spin_unlock(&gl->gl_spin); - if (ip) { inode = gfs2_ilookup(sdp->sd_vfs, no_addr); - if (inode) { - d_prune_aliases(inode); - iput(inode); - } + else + inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); + if (inode && !IS_ERR(inode)) { + d_prune_aliases(inode); + iput(inode); } gfs2_glock_put(gl); } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 06370f8..e1213f7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -73,49 +73,6 @@ static struct inode *gfs2_iget(struct super_block *sb, u64 no_addr) return iget5_locked(sb, hash, iget_test, iget_set, &no_addr); } -struct gfs2_skip_data { - u64 no_addr; - int skipped; -}; - -static int iget_skip_test(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (ip->i_no_addr == data->no_addr) { - if (inode->i_state & (I_FREEING|I_WILL_FREE)){ - data->skipped = 1; - return 0; - } - return 1; - } - return 0; -} - -static int iget_skip_set(struct inode *inode, void *opaque) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_skip_data *data = opaque; - - if (data->skipped) - return 1; - inode->i_ino = (unsigned long)(data->no_addr); - ip->i_no_addr = data->no_addr; - return 0; -} - -static struct inode *gfs2_iget_skip(struct super_block *sb, - u64 no_addr) -{ - struct gfs2_skip_data data; - unsigned long hash = (unsigned long)no_addr; - - data.no_addr = no_addr; - data.skipped = 0; - return iget5_locked(sb, hash, iget_skip_test, iget_skip_set, &data); -} - /** * GFS2 lookup code fills in vfs inode contents based on info obtained * from directory entry inside gfs2_inode_lookup(). This has caused issues @@ -243,93 +200,54 @@ fail: return ERR_PTR(error); } -/** - * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation - * and try to reclaim it by doing iput. - * - * This function assumes no rgrp locks are currently held. - * - * @sb: The super block - * no_addr: The inode number - * - */ - -void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr) +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, unsigned int blktype) { - struct gfs2_sbd *sdp; - struct gfs2_inode *ip; - struct gfs2_glock *io_gl = NULL; - int error; - struct gfs2_holder gh; + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder i_gh; struct inode *inode; + int error; - inode = gfs2_iget_skip(sb, no_addr); - - if (!inode) - return; - - /* If it's not a new inode, someone's using it, so leave it alone. */ - if (!(inode->i_state & I_NEW)) { - iput(inode); - return; - } - - ip = GFS2_I(inode); - sdp = GFS2_SB(inode); - ip->i_no_formal_ino = -1; + error = gfs2_glock_nq_num(sdp, no_addr, &gfs2_inode_glops, + LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return ERR_PTR(error); - error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); - if (unlikely(error)) + error = gfs2_check_blk_type(sdp, no_addr, blktype); + if (error) goto fail; - ip->i_gl->gl_object = ip; - error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); - if (unlikely(error)) - goto fail_put; - - set_bit(GIF_INVALID, &ip->i_flags); - error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, - &ip->i_iopen_gh); - if (unlikely(error)) - goto fail_iopen; + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, 0); + if (IS_ERR(inode)) + goto fail; - ip->i_iopen_gh.gh_gl->gl_object = ip; - gfs2_glock_put(io_gl); - io_gl = NULL; + error = gfs2_inode_refresh(GFS2_I(inode)); + if (error) + goto fail_iput; - inode->i_mode = DT2IF(DT_UNKNOWN); + /* Pick up the works we bypass in gfs2_inode_lookup */ + if (inode->i_state & I_NEW) + gfs2_set_iop(inode); - /* - * We must read the inode in order to work out its type in - * this case. Note that this doesn't happen often as we normally - * know the type beforehand. This code path only occurs during - * unlinked inode recovery (where it is safe to do this glock, - * which is not true in the general case). - */ - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, - &gh); - if (unlikely(error)) - goto fail_glock; + /* Two extra checks for NFS only */ + if (no_formal_ino) { + error = -ESTALE; + if (GFS2_I(inode)->i_no_formal_ino != *no_formal_ino) + goto fail_iput; - /* Inode is now uptodate */ - gfs2_glock_dq_uninit(&gh); - gfs2_set_iop(inode); + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; - /* The iput will cause it to be deleted. */ - iput(inode); - return; + error = 0; + } -fail_glock: - gfs2_glock_dq(&ip->i_iopen_gh); -fail_iopen: - if (io_gl) - gfs2_glock_put(io_gl); -fail_put: - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); fail: - iget_failed(inode); - return; + gfs2_glock_dq_uninit(&i_gh); + return error ? ERR_PTR(error) : inode; +fail_iput: + iput(inode); + goto fail; } static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6720d7d..d8499fa 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,9 @@ err: extern void gfs2_set_iop(struct inode *inode); extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, u64 no_addr, u64 no_formal_ino); -extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 *no_formal_ino, + unsigned int blktype); extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); extern int gfs2_inode_refresh(struct gfs2_inode *ip); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 58a9b99..f606baf 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -631,6 +631,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, struct fs_disk_quota *fdq) { struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *mapping = inode->i_mapping; unsigned long index = loc >> PAGE_CACHE_SHIFT; unsigned offset = loc & (PAGE_CACHE_SIZE - 1); @@ -658,11 +659,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, qd->qd_qb.qb_value = qp->qu_value; if (fdq) { if (fdq->d_fieldmask & FS_DQ_BSOFT) { - qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit); + qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_warn = qp->qu_warn; } if (fdq->d_fieldmask & FS_DQ_BHARD) { - qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit); + qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); qd->qd_qb.qb_limit = qp->qu_limit; } } @@ -1497,9 +1498,9 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, fdq->d_version = FS_DQUOT_VERSION; fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); - fdq->d_bcount = be64_to_cpu(qlvb->qb_value); + fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; + fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; + fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1566,10 +1567,10 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, /* If nothing has changed, this is a no-op */ if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - (fdq->d_blk_softlimit == be64_to_cpu(qd->qd_qb.qb_warn))) + ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= FS_DQ_BSOFT; if ((fdq->d_fieldmask & FS_DQ_BHARD) && - (fdq->d_blk_hardlimit == be64_to_cpu(qd->qd_qb.qb_limit))) + ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= FS_DQ_BHARD; if (fdq->d_fieldmask == 0) goto out_i; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index bef3ab6..33c8407 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -963,17 +963,18 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * The inode, if one has been found, in inode. */ -static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, - u64 skip) +static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip) { u32 goal = 0, block; u64 no_addr; struct gfs2_sbd *sdp = rgd->rd_sbd; unsigned int n; + struct gfs2_glock *gl; + struct gfs2_inode *ip; + int error; + int found = 0; - for(;;) { - if (goal >= rgd->rd_data) - break; + while (goal < rgd->rd_data) { down_write(&sdp->sd_log_flush_lock); n = 1; block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, @@ -990,11 +991,32 @@ static u64 try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, if (no_addr == skip) continue; *last_unlinked = no_addr; - return no_addr; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &gl); + if (error) + continue; + + /* If the inode is already in cache, we can ignore it here + * because the existing inode disposal code will deal with + * it when all refs have gone away. Accessing gl_object like + * this is not safe in general. Here it is ok because we do + * not dereference the pointer, and we only need an approx + * answer to whether it is NULL or not. + */ + ip = gl->gl_object; + + if (ip || queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put(gl); + else + found++; + + /* Limit reclaim to sensible number of tasks */ + if (found > 2*NR_CPUS) + return; } rgd->rd_flags &= ~GFS2_RDF_CHECK; - return 0; + return; } /** @@ -1075,11 +1097,9 @@ static void forward_rgrp_set(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd) * Try to acquire rgrp in way which avoids contending with others. * * Returns: errno - * unlinked: the block address of an unlinked block to be reclaimed */ -static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, - u64 *last_unlinked) +static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd, *begin = NULL; @@ -1089,7 +1109,6 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, int loops = 0; int error, rg_locked; - *unlinked = 0; rgd = gfs2_blk2rgrpd(sdp, ip->i_goal); while (rgd) { @@ -1106,17 +1125,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - /* If the rg came in already locked, there's no - way we can recover from a failed try_rgrp_unlink - because that would require an iput which can only - happen after the rgrp is unlocked. */ - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; /* fall through */ case GLR_TRYFAILED: rgd = recent_rgrp_next(rgd); @@ -1145,13 +1157,10 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *unlinked, case 0: if (try_rgrp_fit(rgd, al)) goto out; - if (!rg_locked && rgd->rd_flags & GFS2_RDF_CHECK) - *unlinked = try_rgrp_unlink(rgd, last_unlinked, - ip->i_no_addr); + if (rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); - if (*unlinked) - return -EAGAIN; break; case GLR_TRYFAILED: @@ -1204,12 +1213,12 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_alloc *al = ip->i_alloc; int error = 0; - u64 last_unlinked = NO_BLOCK, unlinked; + u64 last_unlinked = NO_BLOCK; + int tries = 0; if (gfs2_assert_warn(sdp, al->al_requested)) return -EINVAL; -try_again: if (hold_rindex) { /* We need to hold the rindex unless the inode we're using is the rindex itself, in which case it's already held. */ @@ -1218,31 +1227,23 @@ try_again: else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ error = gfs2_ri_update_special(ip); + if (error) + return error; } - if (error) - return error; + do { + error = get_local_rgrp(ip, &last_unlinked); + /* If there is no space, flushing the log may release some */ + if (error) + gfs2_log_flush(sdp, NULL); + } while (error && tries++ < 3); - /* Find an rgrp suitable for allocation. If it encounters any unlinked - dinodes along the way, error will equal -EAGAIN and unlinked will - contains it block address. We then need to look up that inode and - try to free it, and try the allocation again. */ - error = get_local_rgrp(ip, &unlinked, &last_unlinked); if (error) { if (hold_rindex && ip != GFS2_I(sdp->sd_rindex)) gfs2_glock_dq_uninit(&al->al_ri_gh); - if (error != -EAGAIN) - return error; - - gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked); - /* regardless of whether or not gfs2_process_unlinked_inode - was successful, we don't want to repeat it again. */ - last_unlinked = unlinked; - gfs2_log_flush(sdp, NULL); - error = 0; - - goto try_again; + return error; } + /* no error, so we have the rgrp set in the inode's allocation. */ al->al_file = file; al->al_line = line; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index d6cfac1..a5fe681 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -932,8 +932,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { - WARN_ONCE(1, - "Using mlock ulimits for SHM_HUGETLB deprecated\n"); + printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); } else { *user = NULL; return ERR_PTR(-EPERM); @@ -6,7 +6,6 @@ #include <linux/syscalls.h> #include <linux/mm.h> -#include <linux/smp_lock.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fs.h> @@ -530,41 +529,6 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } -static int ioctl_fstrim(struct file *filp, void __user *argp) -{ - struct super_block *sb = filp->f_path.dentry->d_inode->i_sb; - struct fstrim_range range; - int ret = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* If filesystem doesn't support trim feature, return. */ - if (sb->s_op->trim_fs == NULL) - return -EOPNOTSUPP; - - /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */ - if (sb->s_bdev == NULL) - return -EINVAL; - - if (argp == NULL) { - range.start = 0; - range.len = ULLONG_MAX; - range.minlen = 0; - } else if (copy_from_user(&range, argp, sizeof(range))) - return -EFAULT; - - ret = sb->s_op->trim_fs(sb, &range); - if (ret < 0) - return ret; - - if ((argp != NULL) && - (copy_to_user(argp, &range, sizeof(range)))) - return -EFAULT; - - return 0; -} - /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -615,10 +579,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; - case FITRIM: - error = ioctl_fstrim(filp, argp); - break; - case FS_IOC_FIEMAP: return ioctl_fiemap(filp, arg); diff --git a/fs/ioprio.c b/fs/ioprio.c index 748cfb9..7da2a06 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -103,12 +103,7 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) } ret = -ESRCH; - /* - * We want IOPRIO_WHO_PGRP/IOPRIO_WHO_USER to be "atomic", - * so we can't use rcu_read_lock(). See re-copy of ->ioprio - * in copy_process(). - */ - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) @@ -153,7 +148,7 @@ free_uid: ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -197,7 +192,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) int ret = -ESRCH; int tmpio; - read_lock(&tasklist_lock); + rcu_read_lock(); switch (which) { case IOPRIO_WHO_PROCESS: if (!who) @@ -250,6 +245,6 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) ret = -EINVAL; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c590d15..f837ba9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -899,6 +899,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + journal->j_dev = bdev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; @@ -908,14 +916,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, __func__); goto out_err; } - journal->j_dev = bdev; - journal->j_fs_dev = fs_dev; - journal->j_blk_offset = start; - journal->j_maxlen = len; - bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; bh = __getblk(journal->j_dev, start, journal->j_blocksize); if (!bh) { diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index d5bb868..25509eb 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -14,7 +14,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/smp_lock.h> #include <linux/kthread.h> #define NLMDBG_FACILITY NLMDBG_CLIENT diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 47ea1e1..332c54c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,7 +7,6 @@ */ #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/errno.h> diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 25e21e4..ed0c59f 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -124,7 +124,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) continue; if (host->h_server != ni->server) continue; - if (ni->server && + if (ni->server && ni->src_len != 0 && !rpc_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; @@ -167,6 +167,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrlen = ni->salen; rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_srcaddrlen = ni->src_len; host->h_version = ni->version; host->h_proto = ni->protocol; host->h_rpcclnt = NULL; @@ -238,9 +239,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const char *hostname, int noresvport) { - const struct sockaddr source = { - .sa_family = AF_UNSPEC, - }; struct nlm_lookup_host_info ni = { .server = 0, .sap = sap, @@ -249,8 +247,6 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, .version = version, .hostname = hostname, .hostname_len = strlen(hostname), - .src_sap = &source, - .src_len = sizeof(source), .noresvport = noresvport, }; @@ -357,7 +353,6 @@ nlm_bind_host(struct nlm_host *host) .protocol = host->h_proto, .address = nlm_addr(host), .addrsize = host->h_addrlen, - .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -376,6 +371,8 @@ nlm_bind_host(struct nlm_host *host) args.flags |= RPC_CLNT_CREATE_HARDRTRY; if (host->h_noresvport) args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + if (host->h_srcaddrlen) + args.saddress = nlm_srcaddr(host); clnt = rpc_create(&args); if (!IS_ERR(clnt)) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index a336e83..38d2611 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c462d34..ef5659b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -25,7 +25,6 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/nlm.h> diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index c3069f3..0caea53 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/time.h> -#include <linux/smp_lock.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> @@ -122,7 +122,6 @@ #include <linux/module.h> #include <linux/security.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/syscalls.h> #include <linux/time.h> #include <linux/rcupdate.h> @@ -1504,9 +1503,8 @@ static int do_fcntl_delete_lease(struct file *filp) static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { - struct file_lock *fl; + struct file_lock *fl, *ret; struct fasync_struct *new; - struct inode *inode = filp->f_path.dentry->d_inode; int error; fl = lease_alloc(filp, arg); @@ -1518,13 +1516,16 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) locks_free_lock(fl); return -ENOMEM; } + ret = fl; lock_flocks(); - error = __vfs_setlease(filp, arg, &fl); + error = __vfs_setlease(filp, arg, &ret); if (error) { unlock_flocks(); locks_free_lock(fl); goto out_free_fasync; } + if (ret != fl) + locks_free_lock(fl); /* * fasync_insert_entry() returns the old entry if any. @@ -1532,17 +1533,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * inserted it into the fasync list. Clear new so that * we don't release it here. */ - if (!fasync_insert_entry(fd, filp, &fl->fl_fasync, new)) + if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) new = NULL; - if (error < 0) { - /* remove lease just inserted by setlease */ - fl->fl_type = F_UNLCK | F_INPROGRESS; - fl->fl_break_time = jiffies - 10; - time_out_leases(inode); - } else { - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - } + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); unlock_flocks(); out_free_fasync: diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index f46ee8b..9da2970 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -828,7 +828,7 @@ void do_logfs_journal_wl_pass(struct super_block *sb) super->s_journal_seg[i] = segno; super->s_journal_ec[i] = ec; logfs_set_segment_reserved(sb, segno); - err = btree_insert32(head, segno, (void *)1, GFP_KERNEL); + err = btree_insert32(head, segno, (void *)1, GFP_NOFS); BUG_ON(err); /* mempool should prevent this */ err = logfs_erase_segment(sb, segno, 1); BUG_ON(err); /* FIXME: remount-ro would be nicer */ diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 6127baf..ee99a9f 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1994,6 +1994,9 @@ static int do_write_inode(struct inode *inode) /* FIXME: transaction is part of logfs_block now. Is that enough? */ err = logfs_write_buf(master_inode, page, 0); + if (err) + move_page_to_inode(inode, page); + logfs_put_write_page(page); return err; } @@ -1748,6 +1748,9 @@ struct file *do_filp_open(int dfd, const char *pathname, if (!(open_flag & O_CREAT)) mode = 0; + /* Must never be set by userspace */ + open_flag &= ~FMODE_NONOTIFY; + /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only * check for O_DSYNC if the need any syncing at all we enforce it's diff --git a/fs/namespace.c b/fs/namespace.c index 8a415c9..3dbfc07 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -13,7 +13,6 @@ #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/percpu.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/acct.h> diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index aac8832..f22b12e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -19,7 +19,6 @@ #include <linux/mm.h> #include <asm/uaccess.h> #include <asm/byteorder.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 6c754f7..cb50aaf 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -17,7 +17,6 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/sched.h> -#include <linux/smp_lock.h> #include <linux/ncp_fs.h> #include "ncplib_kernel.h" diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d290545..8fb93b6 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -26,7 +26,6 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/init.h> -#include <linux/smp_lock.h> #include <linux/vfs.h> #include <linux/mount.h> #include <linux/seq_file.h> diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index c2a1f9a..d40a547 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -17,7 +17,6 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/highuid.h> -#include <linux/smp_lock.h> #include <linux/vmalloc.h> #include <linux/sched.h> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index aeec017..93a8b3b 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -9,7 +9,6 @@ #include <linux/completion.h> #include <linux/ip.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> #include <linux/nfs_fs.h> diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 232a7ee..1fd62fc 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -11,7 +11,6 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/smp_lock.h> #include <linux/spinlock.h> #include <linux/nfs4.h> diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 07ac384..996dd89 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -34,6 +34,7 @@ #include <linux/mount.h> #include <linux/sched.h> #include <linux/vmalloc.h> +#include <linux/kmemleak.h> #include "delegation.h" #include "iostat.h" @@ -56,7 +57,7 @@ static int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); static int nfs_fsync_dir(struct file *, int); static loff_t nfs_llseek_dir(struct file *, loff_t, int); -static int nfs_readdir_clear_array(struct page*, gfp_t); +static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, @@ -82,8 +83,8 @@ const struct inode_operations nfs_dir_inode_operations = { .setattr = nfs_setattr, }; -const struct address_space_operations nfs_dir_addr_space_ops = { - .releasepage = nfs_readdir_clear_array, +const struct address_space_operations nfs_dir_aops = { + .freepage = nfs_readdir_clear_array, }; #ifdef CONFIG_NFS_V3 @@ -161,6 +162,7 @@ struct nfs_cache_array_entry { u64 cookie; u64 ino; struct qstr string; + unsigned char d_type; }; struct nfs_cache_array { @@ -170,14 +172,13 @@ struct nfs_cache_array { struct nfs_cache_array_entry array[0]; }; -#define MAX_READDIR_ARRAY ((PAGE_SIZE - sizeof(struct nfs_cache_array)) / sizeof(struct nfs_cache_array_entry)) - typedef __be32 * (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, struct nfs_server *, int); typedef struct { struct file *file; struct page *page; unsigned long page_index; u64 *dir_cookie; + u64 last_cookie; loff_t current_index; decode_dirent_t decode; @@ -194,9 +195,13 @@ typedef struct { static struct nfs_cache_array *nfs_readdir_get_array(struct page *page) { + void *ptr; if (page == NULL) return ERR_PTR(-EIO); - return (struct nfs_cache_array *)kmap(page); + ptr = kmap(page); + if (ptr == NULL) + return ERR_PTR(-ENOMEM); + return ptr; } static @@ -209,14 +214,15 @@ void nfs_readdir_release_array(struct page *page) * we are freeing strings created by nfs_add_to_readdir_array() */ static -int nfs_readdir_clear_array(struct page *page, gfp_t mask) +void nfs_readdir_clear_array(struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array; int i; + + array = kmap_atomic(page, KM_USER0); for (i = 0; i < array->size; i++) kfree(array->array[i].string.name); - nfs_readdir_release_array(page); - return 0; + kunmap_atomic(array, KM_USER0); } /* @@ -231,6 +237,11 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le string->name = kmemdup(name, len, GFP_KERNEL); if (string->name == NULL) return -ENOMEM; + /* + * Avoid a kmemleak false positive. The pointer to the name is stored + * in a page cache page which kmemleak does not scan. + */ + kmemleak_not_leak(string->name); string->hash = full_name_hash(name, len); return 0; } @@ -244,20 +255,24 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (IS_ERR(array)) return PTR_ERR(array); - ret = -EIO; - if (array->size >= MAX_READDIR_ARRAY) - goto out; cache_entry = &array->array[array->size]; + + /* Check that this entry lies within the page bounds */ + ret = -ENOSPC; + if ((char *)&cache_entry[1] - (char *)page_address(page) > PAGE_SIZE) + goto out; + cache_entry->cookie = entry->prev_cookie; cache_entry->ino = entry->ino; + cache_entry->d_type = entry->d_type; ret = nfs_readdir_make_qstr(&cache_entry->string, entry->name, entry->len); if (ret) goto out; array->last_cookie = entry->cookie; - if (entry->eof == 1) - array->eof_index = array->size; array->size++; + if (entry->eof != 0) + array->eof_index = array->size; out: nfs_readdir_release_array(page); return ret; @@ -272,7 +287,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri if (diff < 0) goto out_eof; if (diff >= array->size) { - if (array->eof_index > 0) + if (array->eof_index >= 0) goto out_eof; desc->current_index += array->size; return -EAGAIN; @@ -281,8 +296,6 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri index = (unsigned int)diff; *desc->dir_cookie = array->array[index].cookie; desc->cache_entry_index = index; - if (index == array->eof_index) - desc->eof = 1; return 0; out_eof: desc->eof = 1; @@ -296,17 +309,16 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des int status = -EAGAIN; for (i = 0; i < array->size; i++) { - if (i == array->eof_index) { - desc->eof = 1; - status = -EBADCOOKIE; - } if (array->array[i].cookie == *desc->dir_cookie) { desc->cache_entry_index = i; - status = 0; - break; + return 0; } } - + if (array->eof_index >= 0) { + status = -EBADCOOKIE; + if (*desc->dir_cookie == array->last_cookie) + desc->eof = 1; + } return status; } @@ -314,10 +326,7 @@ static int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) { struct nfs_cache_array *array; - int status = -EBADCOOKIE; - - if (desc->dir_cookie == NULL) - goto out; + int status; array = nfs_readdir_get_array(desc->page); if (IS_ERR(array)) { @@ -330,6 +339,10 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) else status = nfs_readdir_search_for_cookie(array, desc); + if (status == -EAGAIN) { + desc->last_cookie = array->last_cookie; + desc->page_index++; + } nfs_readdir_release_array(desc->page); out: return status; @@ -381,13 +394,9 @@ int xdr_decode(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, struct x static int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { - struct nfs_inode *node; if (dentry->d_inode == NULL) goto different; - node = NFS_I(dentry->d_inode); - if (node->fh.size != entry->fh->size) - goto different; - if (strncmp(node->fh.data, entry->fh->data, node->fh.size) != 0) + if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0) goto different; return 1; different: @@ -449,14 +458,15 @@ out: /* Perform conversion from xdr to cache array */ static -void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, +int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *entry, void *xdr_page, struct page *page, unsigned int buflen) { struct xdr_stream stream; struct xdr_buf buf; __be32 *ptr = xdr_page; - int status; struct nfs_cache_array *array; + unsigned int count = 0; + int status; buf.head->iov_base = xdr_page; buf.head->iov_len = buflen; @@ -471,21 +481,32 @@ void nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *e do { status = xdr_decode(desc, entry, &stream); - if (status != 0) + if (status != 0) { + if (status == -EAGAIN) + status = 0; break; + } - if (nfs_readdir_add_to_array(entry, page) == -1) - break; - if (desc->plus == 1) + count++; + + if (desc->plus != 0) nfs_prime_dcache(desc->file->f_path.dentry, entry); + + status = nfs_readdir_add_to_array(entry, page); + if (status != 0) + break; } while (!entry->eof); - if (status == -EBADCOOKIE && entry->eof) { + if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); - array->eof_index = array->size - 1; - status = 0; - nfs_readdir_release_array(page); + if (!IS_ERR(array)) { + array->eof_index = array->size; + status = 0; + nfs_readdir_release_array(page); + } else + status = PTR_ERR(array); } + return status; } static @@ -537,11 +558,11 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct nfs_entry entry; struct file *file = desc->file; struct nfs_cache_array *array; - int status = 0; + int status = -ENOMEM; unsigned int array_size = ARRAY_SIZE(pages); entry.prev_cookie = 0; - entry.cookie = *desc->dir_cookie; + entry.cookie = desc->last_cookie; entry.eof = 0; entry.fh = nfs_alloc_fhandle(); entry.fattr = nfs_alloc_fattr(); @@ -549,6 +570,10 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; array = nfs_readdir_get_array(page); + if (IS_ERR(array)) { + status = PTR_ERR(array); + goto out; + } memset(array, 0, sizeof(struct nfs_cache_array)); array->eof_index = -1; @@ -556,12 +581,19 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, if (!pages_ptr) goto out_release_array; do { + unsigned int pglen; status = nfs_readdir_xdr_filler(pages, desc, &entry, file, inode); if (status < 0) break; - nfs_readdir_page_filler(desc, &entry, pages_ptr, page, array_size * PAGE_SIZE); - } while (array->eof_index < 0 && array->size < MAX_READDIR_ARRAY); + pglen = status; + status = nfs_readdir_page_filler(desc, &entry, pages_ptr, page, pglen); + if (status < 0) { + if (status == -ENOSPC) + status = 0; + break; + } + } while (array->eof_index < 0); nfs_readdir_free_large_page(pages_ptr, pages, array_size); out_release_array: @@ -582,8 +614,10 @@ static int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) { struct inode *inode = desc->file->f_path.dentry->d_inode; + int ret; - if (nfs_readdir_xdr_to_array(desc, page, inode) < 0) + ret = nfs_readdir_xdr_to_array(desc, page, inode); + if (ret < 0) goto error; SetPageUptodate(page); @@ -595,12 +629,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) return 0; error: unlock_page(page); - return -EIO; + return ret; } static void cache_page_release(nfs_readdir_descriptor_t *desc) { + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); page_cache_release(desc->page); desc->page = NULL; } @@ -608,12 +644,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - page = read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, + return read_cache_page(desc->file->f_path.dentry->d_inode->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page)) - desc->eof = 1; - return page; } /* @@ -629,9 +661,8 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) return PTR_ERR(desc->page); res = nfs_readdir_search_array(desc); - if (res == 0) - return 0; - cache_page_release(desc); + if (res != 0) + cache_page_release(desc); return res; } @@ -639,22 +670,18 @@ int find_cache_page(nfs_readdir_descriptor_t *desc) static inline int readdir_search_pagecache(nfs_readdir_descriptor_t *desc) { - int res = -EAGAIN; + int res; - while (1) { - res = find_cache_page(desc); - if (res != -EAGAIN) - break; - desc->page_index++; + if (desc->page_index == 0) { + desc->current_index = 0; + desc->last_cookie = 0; } + do { + res = find_cache_page(desc); + } while (res == -EAGAIN); return res; } -static inline unsigned int dt_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - /* * Once we've found the start of the dirent within a page: fill 'er up... */ @@ -666,35 +693,35 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent, int i = 0; int res = 0; struct nfs_cache_array *array = NULL; - unsigned int d_type = DT_UNKNOWN; - struct dentry *dentry = NULL; array = nfs_readdir_get_array(desc->page); + if (IS_ERR(array)) { + res = PTR_ERR(array); + goto out; + } for (i = desc->cache_entry_index; i < array->size; i++) { - d_type = DT_UNKNOWN; + struct nfs_cache_array_entry *ent; - res = filldir(dirent, array->array[i].string.name, - array->array[i].string.len, file->f_pos, - nfs_compat_user_ino64(array->array[i].ino), d_type); - if (res < 0) + ent = &array->array[i]; + if (filldir(dirent, ent->string.name, ent->string.len, + file->f_pos, nfs_compat_user_ino64(ent->ino), + ent->d_type) < 0) { + desc->eof = 1; break; + } file->f_pos++; - desc->cache_entry_index = i; if (i < (array->size-1)) *desc->dir_cookie = array->array[i+1].cookie; else *desc->dir_cookie = array->last_cookie; - if (i == array->eof_index) { - desc->eof = 1; - break; - } } + if (array->eof_index >= 0) + desc->eof = 1; nfs_readdir_release_array(desc->page); +out: cache_page_release(desc); - if (dentry != NULL) - dput(dentry); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); return res; @@ -729,13 +756,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } - if (nfs_readdir_xdr_to_array(desc, page, inode) == -1) { - status = -EIO; - goto out_release; - } - desc->page_index = 0; + desc->last_cookie = *desc->dir_cookie; desc->page = page; + + status = nfs_readdir_xdr_to_array(desc, page, inode); + if (status < 0) + goto out_release; + status = nfs_do_filldir(desc, dirent, filldir); out: @@ -757,7 +785,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = dentry->d_inode; nfs_readdir_descriptor_t my_desc, *desc = &my_desc; - int res = -ENOMEM; + int res; dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -782,18 +810,18 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (res < 0) goto out; - while (desc->eof != 1) { + do { res = readdir_search_pagecache(desc); if (res == -EBADCOOKIE) { + res = 0; /* This means either end of directory */ if (*desc->dir_cookie && desc->eof == 0) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc, dirent, filldir); - if (res >= 0) + if (res == 0) continue; } - res = 0; break; } if (res == -ETOOSMALL && desc->plus) { @@ -808,11 +836,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir) break; res = nfs_do_filldir(desc, dirent, filldir); - if (res < 0) { - res = 0; + if (res < 0) break; - } - } + } while (!desc->eof); out: nfs_unblock_sillyrename(dentry); if (res > 0) @@ -1345,12 +1371,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ - case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; + /* case -EISDIR: */ /* case -EINVAL: */ default: res = ERR_CAST(inode); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 84d3c8b..e6ace0d 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -867,7 +867,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, goto out; nfs_alloc_commit_data(dreq); - if (dreq->commit_data == NULL || count < wsize) + if (dreq->commit_data == NULL || count <= wsize) sync = NFS_FILE_SYNC; dreq->inode = inode; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 60677f9..7bf029e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -693,6 +693,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) { struct inode *inode = filp->f_mapping->host; int status = 0; + unsigned int saved_type = fl->fl_type; /* Try local locking first */ posix_test_lock(filp, fl); @@ -700,6 +701,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) /* found a conflict */ goto out; } + fl->fl_type = saved_type; if (nfs_have_delegation(inode, FMODE_READ)) goto out_noconflict; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 314f571..e67e31c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -289,6 +289,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; + inode->i_data.a_ops = &nfs_dir_aops; if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); /* Deal with crossing mountpoints */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index db08ff3..e6356b7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -362,6 +362,15 @@ unsigned int nfs_page_length(struct page *page) } /* + * Convert a umode to a dirent->d_type + */ +static inline +unsigned char nfs_umode_to_dtype(umode_t mode) +{ + return (mode >> 12) & 15; +} + +/* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index eceafe7..4f981f1 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -505,13 +505,13 @@ static struct rpc_procinfo mnt3_procedures[] = { static struct rpc_version mnt_version1 = { .number = 1, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt_procedures), .procs = mnt_procedures, }; static struct rpc_version mnt_version3 = { .number = 3, - .nrprocs = 2, + .nrprocs = ARRAY_SIZE(mnt3_procedures), .procs = mnt3_procedures, }; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index e6bf457..5914a19 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -423,7 +423,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) struct page **page; size_t hdrlen; unsigned int pglen, recvd; - int status, nr = 0; + int status; if ((status = ntohl(*p++))) return nfs_stat_to_errno(status); @@ -443,7 +443,7 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) if (pglen > recvd) pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) @@ -485,6 +485,8 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se entry->prev_cookie = entry->cookie; entry->cookie = ntohl(*p++); + entry->d_type = DT_UNKNOWN; + p = xdr_inline_peek(xdr, 8); if (p != NULL) entry->eof = !p[0] && p[1]; @@ -495,7 +497,7 @@ nfs_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_se out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index d9a5e83..f6cc60f 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -555,7 +555,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct page **page; size_t hdrlen; u32 recvd, pglen; - int status, nr = 0; + int status; status = ntohl(*p++); /* Decode post_op_attrs */ @@ -586,7 +586,7 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res pglen = recvd; page = rcvbuf->pages; - return nr; + return pglen; } __be32 * @@ -622,11 +622,13 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s entry->prev_cookie = entry->cookie; p = xdr_decode_hyper(p, &entry->cookie); + entry->d_type = DT_UNKNOWN; if (plus) { entry->fattr->valid = 0; p = xdr_decode_post_op_attr_stream(xdr, entry->fattr); if (IS_ERR(p)) goto out_overflow_exit; + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); /* In fact, a post_op_fh3: */ p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) @@ -656,7 +658,7 @@ nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, struct nfs_s out_overflow: print_overflow_msg(__func__, xdr); out_overflow_exit: - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0f24cdf..4435e5e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2852,8 +2852,10 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); res.pgbase = args.pgbase; status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); - if (status == 0) + if (status >= 0) { memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); + status += args.pgbase; + } nfs_invalidate_atime(dir); @@ -3359,6 +3361,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen) ret = nfs_revalidate_inode(server, inode); if (ret < 0) return ret; + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL) + nfs_zap_acl_cache(inode); ret = nfs4_read_cached_acl(inode, buf, buflen); if (ret != -ENOENT) return ret; @@ -3387,6 +3391,13 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl nfs_inode_return_delegation(inode); buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + /* + * Acl update can result in inode attribute update. + * so mark the attribute cache invalid. + */ + spin_lock(&inode->i_lock); + NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR; + spin_unlock(&inode->i_lock); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); return ret; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index f313c4c..9f1826b 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4518,7 +4518,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n xdr_read_pages(xdr, pglen); - return 0; + return pglen; } static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) @@ -6208,6 +6208,10 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, if (entry->fattr->valid & NFS_ATTR_FATTR_FILEID) entry->ino = entry->fattr->fileid; + entry->d_type = DT_UNKNOWN; + if (entry->fattr->valid & NFS_ATTR_FATTR_TYPE) + entry->d_type = nfs_umode_to_dtype(entry->fattr->mode); + if (verify_attr_len(xdr, p, len) < 0) goto out_overflow; @@ -6221,7 +6225,7 @@ __be32 *nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry, out_overflow: print_overflow_msg(__func__, xdr); - return ERR_PTR(-EIO); + return ERR_PTR(-EAGAIN); } /* diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 137b549..b68536c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -115,7 +115,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req) { if (!nfs_lock_request_dontget(req)) return 0; - if (req->wb_page != NULL) + if (test_bit(PG_MAPPED, &req->wb_flags)) radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); return 1; } @@ -125,7 +125,7 @@ int nfs_set_page_tag_locked(struct nfs_page *req) */ void nfs_clear_page_tag_locked(struct nfs_page *req) { - if (req->wb_page != NULL) { + if (test_bit(PG_MAPPED, &req->wb_flags)) { struct inode *inode = req->wb_context->path.dentry->d_inode; struct nfs_inode *nfsi = NFS_I(inode); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index e4b62c6..aedcaa7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -152,7 +152,6 @@ static void nfs_readpage_release(struct nfs_page *req) (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - nfs_clear_request(req); nfs_release_request(req); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0a42e8f..4100630 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -39,7 +39,6 @@ #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> -#include <linux/smp_lock.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/mnt_namespace.h> @@ -67,6 +66,12 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#ifdef CONFIG_NFS_V3 +#define NFS_DEFAULT_VERSION 3 +#else +#define NFS_DEFAULT_VERSION 2 +#endif + enum { /* Mount options that take no arguments */ Opt_soft, Opt_hard, @@ -1064,12 +1069,10 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_VER3; mnt->version = 3; break; -#ifdef CONFIG_NFS_V4 case Opt_v4: mnt->flags &= ~NFS_MOUNT_VER3; mnt->version = 4; break; -#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1281,12 +1284,10 @@ static int nfs_parse_mount_options(char *raw, mnt->flags |= NFS_MOUNT_VER3; mnt->version = 3; break; -#ifdef CONFIG_NFS_V4 case NFS4_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; mnt->version = 4; break; -#endif default: goto out_invalid_value; } @@ -2277,7 +2278,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, }; int error = -ENOMEM; - data = nfs_alloc_parsed_mount_data(3); + data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); mntfh = nfs_alloc_fhandle(); if (data == NULL || mntfh == NULL) goto out_free_fh; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4c14c17..10d648e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -390,6 +390,7 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) if (nfs_have_delegation(inode, FMODE_WRITE)) nfsi->change_attr++; } + set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; @@ -415,6 +416,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_lock(&inode->i_lock); set_page_private(req->wb_page, 0); ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); nfsi->npages--; if (!nfsi->npages) { @@ -422,7 +424,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) iput(inode); } else spin_unlock(&inode->i_lock); - nfs_clear_request(req); nfs_release_request(req); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 2a533a0..7e84a85 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -260,9 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) err = vfs_getattr(fhp->fh_export->ex_path.mnt, fhp->fh_dentry, &fhp->fh_post_attr); fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; - if (err) + if (err) { fhp->fh_post_saved = 0; - else + /* Grab the ctime anyway - set_change_info might use it */ + fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + } else fhp->fh_post_saved = 1; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index f1e5ec6..116cab9 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -673,16 +673,17 @@ static void nfsd4_hash_conn(struct nfsd4_conn *conn, struct nfsd4_session *ses) spin_unlock(&clp->cl_lock); } -static void nfsd4_register_conn(struct nfsd4_conn *conn) +static int nfsd4_register_conn(struct nfsd4_conn *conn) { conn->cn_xpt_user.callback = nfsd4_conn_lost; - register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); + return register_xpt_user(conn->cn_xprt, &conn->cn_xpt_user); } static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) { struct nfsd4_conn *conn; u32 flags = NFS4_CDFC4_FORE; + int ret; if (ses->se_flags & SESSION4_BACK_CHAN) flags |= NFS4_CDFC4_BACK; @@ -690,7 +691,10 @@ static __be32 nfsd4_new_conn(struct svc_rqst *rqstp, struct nfsd4_session *ses) if (!conn) return nfserr_jukebox; nfsd4_hash_conn(conn, ses); - nfsd4_register_conn(conn); + ret = nfsd4_register_conn(conn); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&conn->cn_xpt_user); return nfs_ok; } @@ -1644,6 +1648,7 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi { struct nfs4_client *clp = ses->se_client; struct nfsd4_conn *c; + int ret; spin_lock(&clp->cl_lock); c = __nfsd4_find_conn(new->cn_xprt, ses); @@ -1654,7 +1659,10 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi } __nfsd4_hash_conn(new, ses); spin_unlock(&clp->cl_lock); - nfsd4_register_conn(new); + ret = nfsd4_register_conn(new); + if (ret) + /* oops; xprt is already down: */ + nfsd4_conn_lost(&new->cn_xpt_user); return; } @@ -2254,7 +2262,7 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) * Spawn a thread to perform a recall on the delegation represented * by the lease (file_lock) * - * Called from break_lease() with lock_kernel() held. + * Called from break_lease() with lock_flocks() held. * Note: we assume break_lease will only call this *once* for any given * lease. */ @@ -2278,7 +2286,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) list_add_tail(&dp->dl_recall_lru, &del_recall_lru); spin_unlock(&recall_lock); - /* only place dl_time is set. protected by lock_kernel*/ + /* only place dl_time is set. protected by lock_flocks*/ dp->dl_time = get_seconds(); /* @@ -2295,7 +2303,7 @@ void nfsd_break_deleg_cb(struct file_lock *fl) /* * The file_lock is being reapd. * - * Called by locks_free_lock() with lock_kernel() held. + * Called by locks_free_lock() with lock_flocks() held. */ static void nfsd_release_deleg_cb(struct file_lock *fl) @@ -2310,7 +2318,7 @@ void nfsd_release_deleg_cb(struct file_lock *fl) } /* - * Called from setlease() with lock_kernel() held + * Called from setlease() with lock_flocks() held */ static int nfsd_same_client_deleg_cb(struct file_lock *onlist, struct file_lock *try) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 4d476ff..60fce3d 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -484,18 +484,17 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) static inline void set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { - BUG_ON(!fhp->fh_pre_saved || !fhp->fh_post_saved); - cinfo->atomic = 1; + BUG_ON(!fhp->fh_pre_saved); + cinfo->atomic = fhp->fh_post_saved; cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); - if (cinfo->change_supported) { - cinfo->before_change = fhp->fh_pre_change; - cinfo->after_change = fhp->fh_post_change; - } else { - cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; - cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; - cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; - cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; - } + + cinfo->before_change = fhp->fh_pre_change; + cinfo->after_change = fhp->fh_post_change; + cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec; + cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec; + cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec; + cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec; + } int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 49c844d..59e5fe7 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -335,7 +335,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) * the device at this point. * * To prevent nilfs_dat_translate() from returning the - * uncommited block number, this makes a copy of the entry + * uncommitted block number, this makes a copy of the entry * buffer and redirects nilfs_dat_translate() to the copy. */ if (!buffer_nilfs_redirected(entry_bh)) { diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 33ad25d..caf9a6a 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -176,7 +176,6 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) int nilfs_init_gcinode(struct inode *inode) { struct nilfs_inode_info *ii = NILFS_I(inode); - struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); @@ -186,14 +185,6 @@ int nilfs_init_gcinode(struct inode *inode) ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); - /* - * Add the inode to GC inode list. Garbage Collection - * is serialized and no two processes manipulate the - * list simultaneously. - */ - igrab(inode); - list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes); - return 0; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3e90f86..b185e93 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -337,6 +337,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, struct nilfs_argv *argv, void *buf) { size_t nmembs = argv->v_nmembs; + struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; struct inode *inode; struct nilfs_vdesc *vdesc; struct buffer_head *bh, *n; @@ -349,10 +350,21 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, ino = vdesc->vd_ino; cno = vdesc->vd_cno; inode = nilfs_iget_for_gc(sb, ino, cno); - if (unlikely(inode == NULL)) { - ret = -ENOMEM; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); goto failed; } + if (list_empty(&NILFS_I(inode)->i_dirty)) { + /* + * Add the inode to GC inode list. Garbage Collection + * is serialized and no two processes manipulate the + * list simultaneously. + */ + igrab(inode); + list_add(&NILFS_I(inode)->i_dirty, + &nilfs->ns_gc_inodes); + } + do { ret = nilfs_ioctl_move_inode_block(inode, vdesc, &buffers); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index b04f88e..f35794b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -92,7 +92,11 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - wait_event(group->fanotify_data.access_waitq, event->response); + wait_event(group->fanotify_data.access_waitq, event->response || + atomic_read(&group->fanotify_data.bypass_perm)); + + if (!event->response) /* bypass_perm set */ + return 0; /* userspace responded, convert to something usable */ spin_lock(&event->lock); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 0632248..8b61220 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -106,20 +106,29 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event) return client_fd; } -static ssize_t fill_event_metadata(struct fsnotify_group *group, +static int fill_event_metadata(struct fsnotify_group *group, struct fanotify_event_metadata *metadata, struct fsnotify_event *event) { + int ret = 0; + pr_debug("%s: group=%p metadata=%p event=%p\n", __func__, group, metadata, event); metadata->event_len = FAN_EVENT_METADATA_LEN; + metadata->metadata_len = FAN_EVENT_METADATA_LEN; metadata->vers = FANOTIFY_METADATA_VERSION; metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS; metadata->pid = pid_vnr(event->tgid); - metadata->fd = create_fd(group, event); + if (unlikely(event->mask & FAN_Q_OVERFLOW)) + metadata->fd = FAN_NOFD; + else { + metadata->fd = create_fd(group, event); + if (metadata->fd < 0) + ret = metadata->fd; + } - return metadata->fd; + return ret; } #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS @@ -200,7 +209,7 @@ static int prepare_for_access_response(struct fsnotify_group *group, mutex_lock(&group->fanotify_data.access_mutex); - if (group->fanotify_data.bypass_perm) { + if (atomic_read(&group->fanotify_data.bypass_perm)) { mutex_unlock(&group->fanotify_data.access_mutex); kmem_cache_free(fanotify_response_event_cache, re); event->response = FAN_ALLOW; @@ -257,24 +266,34 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - fd = fill_event_metadata(group, &fanotify_event_metadata, event); - if (fd < 0) - return fd; + ret = fill_event_metadata(group, &fanotify_event_metadata, event); + if (ret < 0) + goto out; + fd = fanotify_event_metadata.fd; ret = prepare_for_access_response(group, event, fd); if (ret) goto out_close_fd; ret = -EFAULT; - if (copy_to_user(buf, &fanotify_event_metadata, FAN_EVENT_METADATA_LEN)) + if (copy_to_user(buf, &fanotify_event_metadata, + fanotify_event_metadata.event_len)) goto out_kill_access_response; - return FAN_EVENT_METADATA_LEN; + return fanotify_event_metadata.event_len; out_kill_access_response: remove_access_response(group, event, fd); out_close_fd: - sys_close(fd); + if (fd != FAN_NOFD) + sys_close(fd); +out: +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + if (event->mask & FAN_ALL_PERM_EVENTS) { + event->response = FAN_DENY; + wake_up(&group->fanotify_data.access_waitq); + } +#endif return ret; } @@ -382,7 +401,7 @@ static int fanotify_release(struct inode *ignored, struct file *file) mutex_lock(&group->fanotify_data.access_mutex); - group->fanotify_data.bypass_perm = true; + atomic_inc(&group->fanotify_data.bypass_perm); list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, @@ -586,11 +605,10 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); if (!fsn_mark) { - int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) return -ENOSPC; @@ -600,17 +618,16 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, fsnotify_init_mark(fsn_mark, fanotify_free_mark); ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0); - if (ret) { - fanotify_free_mark(fsn_mark); - return ret; - } + if (ret) + goto err; } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - fsnotify_put_mark(fsn_mark); + if (added & ~mnt->mnt_fsnotify_mask) fsnotify_recalc_vfsmount_mask(mnt); - - return 0; +err: + fsnotify_put_mark(fsn_mark); + return ret; } static int fanotify_add_inode_mark(struct fsnotify_group *group, @@ -619,6 +636,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; pr_debug("%s: group=%p inode=%p\n", __func__, group, inode); @@ -634,8 +652,6 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) { - int ret; - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) return -ENOSPC; @@ -645,16 +661,16 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, fsnotify_init_mark(fsn_mark, fanotify_free_mark); ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0); - if (ret) { - fanotify_free_mark(fsn_mark); - return ret; - } + if (ret) + goto err; } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - fsnotify_put_mark(fsn_mark); + if (added & ~inode->i_fsnotify_mask) fsnotify_recalc_inode_mask(inode); - return 0; +err: + fsnotify_put_mark(fsn_mark); + return ret; } /* fanotify syscalls */ @@ -687,8 +703,10 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ group = fsnotify_alloc_group(&fanotify_fsnotify_ops); - if (IS_ERR(group)) + if (IS_ERR(group)) { + free_uid(user); return PTR_ERR(group); + } group->fanotify_data.user = user; atomic_inc(&user->fanotify_listeners); @@ -698,6 +716,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) mutex_init(&group->fanotify_data.access_mutex); init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + atomic_set(&group->fanotify_data.bypass_perm, 0); #endif switch (flags & FAN_ALL_CLASS_BITS) { case FAN_CLASS_NOTIF: @@ -764,8 +783,10 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, if (flags & ~FAN_ALL_MARK_FLAGS) return -EINVAL; switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { - case FAN_MARK_ADD: + case FAN_MARK_ADD: /* fallthrough */ case FAN_MARK_REMOVE: + if (!mask) + return -EINVAL; case FAN_MARK_FLUSH: break; default: diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 444c305..4cd5d5d 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -752,6 +752,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) if (ret >= 0) return ret; + fsnotify_put_group(group); atomic_dec(&user->inotify_devs); out_free_uid: free_uid(user); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index f1e962c..0d7c554 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -573,11 +573,14 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); + if (ocfs2_iocb_is_sem_locked(iocb)) { + up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } + ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); - if (!level) - up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); if (is_async) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 76bfdfd..eceb456 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -68,8 +68,27 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) else clear_bit(1, (unsigned long *)&iocb->private); } + +/* + * Using a named enum representing lock types in terms of #N bit stored in + * iocb->private, which is going to be used for communication bewteen + * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). + */ +enum ocfs2_iocb_lock_bits { + OCFS2_IOCB_RW_LOCK = 0, + OCFS2_IOCB_RW_LOCK_LEVEL, + OCFS2_IOCB_SEM, + OCFS2_IOCB_NUM_LOCKS +}; + #define ocfs2_iocb_clear_rw_locked(iocb) \ - clear_bit(0, (unsigned long *)&iocb->private) + clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ - test_bit(1, (unsigned long *)&iocb->private) + test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) +#define ocfs2_iocb_set_sem_locked(iocb) \ + set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_clear_sem_locked(iocb) \ + clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) +#define ocfs2_iocb_is_sem_locked(iocb) \ + test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 52c7557..9f26ac9 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1964,8 +1964,10 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (reg == NULL) return ERR_PTR(-ENOMEM); - if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) - return ERR_PTR(-ENAMETOOLONG); + if (strlen(name) > O2HB_MAX_REGION_NAME_LEN) { + ret = -ENAMETOOLONG; + goto free; + } spin_lock(&o2hb_live_lock); reg->hr_region_num = 0; @@ -1974,7 +1976,8 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g O2NM_MAX_REGIONS); if (reg->hr_region_num >= O2NM_MAX_REGIONS) { spin_unlock(&o2hb_live_lock); - return ERR_PTR(-EFBIG); + ret = -EFBIG; + goto free; } set_bit(reg->hr_region_num, o2hb_region_bitmap); } @@ -1986,10 +1989,13 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g ret = o2hb_debug_region_init(reg, o2hb_debug_dir); if (ret) { config_item_put(®->hr_item); - return ERR_PTR(ret); + goto free; } return ®->hr_item; +free: + kfree(reg); + return ERR_PTR(ret); } static void o2hb_heartbeat_group_drop_item(struct config_group *group, diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index c7fba39..6c61771 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -113,10 +113,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(QUOTA), define_mask(REFCOUNT), define_mask(BASTS), + define_mask(RESERVATIONS), + define_mask(CLUSTER), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), - define_mask(RESERVATIONS), }; static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index ea2ed9f..34d6544 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -81,7 +81,7 @@ #include <linux/sched.h> /* bits that are frequently given and infrequently matched in the low word */ -/* NOTE: If you add a flag, you need to also update mlog.c! */ +/* NOTE: If you add a flag, you need to also update masklog.c! */ #define ML_ENTRY 0x0000000000000001ULL /* func call entry */ #define ML_EXIT 0x0000000000000002ULL /* func call exit */ #define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ @@ -114,13 +114,14 @@ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ #define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ #define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */ -#define ML_BASTS 0x0000001000000000ULL /* dlmglue asts and basts */ +#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */ +#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */ +#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */ + /* bits that are infrequently given and frequently matched in the high word */ -#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ -#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ -#define ML_KTHREAD 0x0000000400000000ULL /* kernel thread activity */ -#define ML_RESERVATIONS 0x0000000800000000ULL /* ocfs2 alloc reservations */ -#define ML_CLUSTER 0x0000001000000000ULL /* cluster stack */ +#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ +#define ML_NOTICE 0x2000000000000000ULL /* setn to KERN_NOTICE */ +#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) #define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index edaded4..895532ac 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -476,7 +476,6 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) out: iput(inode); - ocfs2_dentry_attach_gen(dentry); } /* diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c49f6de..d417b3f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -2461,8 +2461,10 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb, di->i_dx_root = cpu_to_le64(dr_blkno); + spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features |= OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_journal_dirty(handle, di_bh); @@ -4466,8 +4468,10 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, goto out_commit; } + spin_lock(&OCFS2_I(dir)->ip_lock); OCFS2_I(dir)->ip_dyn_features &= ~OCFS2_INDEXED_DIR_FL; di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features); + spin_unlock(&OCFS2_I(dir)->ip_lock); di->i_dx_root = cpu_to_le64(0ULL); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 58a93b9..cc2aaa9 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -959,7 +959,7 @@ static int dlm_match_regions(struct dlm_ctxt *dlm, r += O2HB_MAX_REGION_NAME_LEN; } - local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL); + local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC); if (!local) { status = -ENOMEM; goto bail; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f564b0e..59f0f6b 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2346,7 +2346,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) */ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, - int *numlocks) + int *numlocks, + int *hasrefs) { int ret; int i; @@ -2356,6 +2357,9 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); + *numlocks = 0; + *hasrefs = 0; + ret = -EINVAL; if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "cannot migrate lockres with unknown owner!\n"); @@ -2386,7 +2390,13 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, } *numlocks = count; - mlog(0, "migrateable lockres having %d locks\n", *numlocks); + + count = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (count < O2NM_MAX_NODES) + *hasrefs = 1; + + mlog(0, "%s: res %.*s, Migrateable, locks %d, refs %d\n", dlm->name, + res->lockname.len, res->lockname.name, *numlocks, *hasrefs); leave: return ret; @@ -2408,7 +2418,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, const char *name; unsigned int namelen; int mle_added = 0; - int numlocks; + int numlocks, hasrefs; int wake = 0; if (!dlm_grab(dlm)) @@ -2417,13 +2427,13 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, name = res->lockname.name; namelen = res->lockname.len; - mlog(0, "migrating %.*s to %u\n", namelen, name, target); + mlog(0, "%s: Migrating %.*s to %u\n", dlm->name, namelen, name, target); /* * ensure this lockres is a proper candidate for migration */ spin_lock(&res->spinlock); - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); if (ret < 0) { spin_unlock(&res->spinlock); goto leave; @@ -2431,10 +2441,8 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, spin_unlock(&res->spinlock); /* no work to do */ - if (numlocks == 0) { - mlog(0, "no locks were found on this lockres! done!\n"); + if (numlocks == 0 && !hasrefs) goto leave; - } /* * preallocate up front @@ -2459,14 +2467,14 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * find a node to migrate the lockres to */ - mlog(0, "picking a migration node\n"); spin_lock(&dlm->spinlock); /* pick a new node */ if (!test_bit(target, dlm->domain_map) || target >= O2NM_MAX_NODES) { target = dlm_pick_migration_target(dlm, res); } - mlog(0, "node %u chosen for migration\n", target); + mlog(0, "%s: res %.*s, Node %u chosen for migration\n", dlm->name, + namelen, name, target); if (target >= O2NM_MAX_NODES || !test_bit(target, dlm->domain_map)) { @@ -2667,7 +2675,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { int ret; int lock_dropped = 0; - int numlocks; + int numlocks, hasrefs; spin_lock(&res->spinlock); if (res->owner != dlm->node_num) { @@ -2681,8 +2689,8 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) } /* No need to migrate a lockres having no locks */ - ret = dlm_is_lockres_migrateable(dlm, res, &numlocks); - if (ret >= 0 && numlocks == 0) { + ret = dlm_is_lockres_migrateable(dlm, res, &numlocks, &hasrefs); + if (ret >= 0 && numlocks == 0 && !hasrefs) { spin_unlock(&res->spinlock); goto leave; } @@ -2915,6 +2923,12 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm, } queue++; } + + nodenum = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); + if (nodenum < O2NM_MAX_NODES) { + spin_unlock(&res->spinlock); + return nodenum; + } spin_unlock(&res->spinlock); mlog(0, "have not found a suitable target yet! checking domain map\n"); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 77b4c04..f6cba56 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2241,11 +2241,15 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, mutex_lock(&inode->i_mutex); + ocfs2_iocb_clear_sem_locked(iocb); + relock: /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (direct_io) { down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + /* communicate with ocfs2_dio_end_io */ + ocfs2_iocb_set_sem_locked(iocb); } /* @@ -2382,8 +2386,10 @@ out: ocfs2_rw_unlock(inode, rw_level); out_sems: - if (have_alloc_sem) + if (have_alloc_sem) { up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } mutex_unlock(&inode->i_mutex); @@ -2527,6 +2533,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } + ocfs2_iocb_clear_sem_locked(iocb); + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. @@ -2534,6 +2542,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, if (filp->f_flags & O_DIRECT) { down_read(&inode->i_alloc_sem); have_alloc_sem = 1; + ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); if (ret < 0) { @@ -2575,8 +2584,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } bail: - if (have_alloc_sem) + if (have_alloc_sem) { up_read(&inode->i_alloc_sem); + ocfs2_iocb_clear_sem_locked(iocb); + } if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); mlog_exit(ret); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d840821..70dd3b1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -159,7 +159,9 @@ struct ocfs2_lock_res { char l_name[OCFS2_LOCK_ID_MAX_LEN]; unsigned int l_ro_holders; unsigned int l_ex_holders; - unsigned char l_level; + signed char l_level; + signed char l_requested; + signed char l_blocking; /* Data packed - type enum ocfs2_lock_type */ unsigned char l_type; @@ -169,8 +171,6 @@ struct ocfs2_lock_res { unsigned char l_action; /* Data packed - enum type ocfs2_unlock_action */ unsigned char l_unlock_action; - unsigned char l_requested; - unsigned char l_blocking; unsigned int l_pending_gen; spinlock_t l_lock; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c2e4f82..bf2e776 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -350,7 +350,7 @@ enum { #define OCFS2_LAST_LOCAL_SYSTEM_INODE LOCAL_GROUP_QUOTA_SYSTEM_INODE NUM_SYSTEM_INODES }; -#define NUM_GLOBAL_SYSTEM_INODES OCFS2_LAST_GLOBAL_SYSTEM_INODE +#define NUM_GLOBAL_SYSTEM_INODES OCFS2_FIRST_LOCAL_SYSTEM_INODE #define NUM_LOCAL_SYSTEM_INODES \ (NUM_SYSTEM_INODES - OCFS2_FIRST_LOCAL_SYSTEM_INODE) diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 252e7c8..a5ebe42 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -190,7 +190,7 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) return c; } - return c; + return NULL; } /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f02c0ef..cfeab7c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,7 +41,6 @@ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/quotaops.h> -#include <linux/smp_lock.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index ddb1f41..911e61f 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -418,7 +418,7 @@ out_no_root: static struct dentry *openprom_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_single(fs_type, flags, data, openprom_fill_super) + return mount_single(fs_type, flags, data, openprom_fill_super); } static struct file_system_type openprom_fs_type = { @@ -1199,12 +1199,24 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, return ret; } +/* + * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same + * location, so checking ->i_pipe is not enough to verify that this is a + * pipe. + */ +struct pipe_inode_info *get_pipe_info(struct file *file) +{ + struct inode *i = file->f_path.dentry->d_inode; + + return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; +} + long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; long ret; - pipe = file->f_path.dentry->d_inode->i_pipe; + pipe = get_pipe_info(file); if (!pipe) return -EBADF; diff --git a/fs/proc/base.c b/fs/proc/base.c index f3d02ca..1828451 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1574,7 +1574,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) if (!tmp) return -ENOMEM; - pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE); + pathname = d_path(path, tmp, PAGE_SIZE); len = PTR_ERR(pathname); if (IS_ERR(pathname)) goto out; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9c2b5f4..3ddb606 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -16,7 +16,6 @@ #include <linux/limits.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/smp_lock.h> #include <linux/sysctl.h> #include <linux/slab.h> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index da6b01d..c126c83 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -706,6 +706,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, * skip over unmapped regions. */ #define PAGEMAP_WALK_SIZE (PMD_SIZE) +#define PAGEMAP_WALK_MASK (PMD_MASK) static ssize_t pagemap_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -776,7 +777,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, unsigned long end; pm.pos = 0; - end = start_vaddr + PAGEMAP_WALK_SIZE; + end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; /* overflow ? */ if (end < start_vaddr || end > end_vaddr) end = end_vaddr; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 2367fb3..74802bc5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -499,7 +499,7 @@ static int __init parse_crash_elf64_headers(void) /* Do some basic Verification. */ if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 || (ehdr.e_type != ET_CORE) || - !vmcore_elf_check_arch(&ehdr) || + !vmcore_elf64_check_arch(&ehdr) || ehdr.e_ident[EI_CLASS] != ELFCLASS64 || ehdr.e_ident[EI_VERSION] != EV_CURRENT || ehdr.e_version != EV_CURRENT || diff --git a/fs/read_write.c b/fs/read_write.c index 431a0ed..5d431ba 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> -#include <linux/smp_lock.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/module.h> diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 41656d4..0bae036 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -8,7 +8,6 @@ #include <linux/reiserfs_acl.h> #include <linux/reiserfs_xattr.h> #include <linux/exportfs.h> -#include <linux/smp_lock.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index adf22b4..79265fd 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -9,7 +9,6 @@ #include <linux/time.h> #include <asm/uaccess.h> #include <linux/pagemap.h> -#include <linux/smp_lock.h> #include <linux/compat.h> /* @@ -184,12 +183,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp) return 0; } - /* we need to make sure nobody is changing the file size beneath - ** us - */ - reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); depth = reiserfs_write_lock_once(inode->i_sb); + /* we need to make sure nobody is changing the file size beneath us */ + reiserfs_mutex_lock_safe(&inode->i_mutex, inode->i_sb); + write_from = inode->i_size & (blocksize - 1); /* if we are on a block boundary, we are already unpacked. */ if (write_from == 0) { diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 076c8b1..d31bce1 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -43,7 +43,6 @@ #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> -#include <linux/smp_lock.h> #include <linux/buffer_head.h> #include <linux/workqueue.h> #include <linux/writeback.h> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3bf7a64..b243117 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -28,7 +28,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/crc32.h> -#include <linux/smp_lock.h> struct file_system_type reiserfs_fs_type; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 536d697..90d2fcb 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -472,7 +472,9 @@ int reiserfs_acl_chmod(struct inode *inode) struct reiserfs_transaction_handle th; size_t size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(clone->a_count)); - reiserfs_write_lock(inode->i_sb); + int depth; + + depth = reiserfs_write_lock_once(inode->i_sb); error = journal_begin(&th, inode->i_sb, size * 2); if (!error) { int error2; @@ -482,7 +484,7 @@ int reiserfs_acl_chmod(struct inode *inode) if (error2) error = error2; } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } posix_acl_release(clone); return error; diff --git a/fs/splice.c b/fs/splice.c index 8f1dfae..ce2f025 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1311,18 +1311,6 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags); -/* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. - */ -static inline struct pipe_inode_info *pipe_info(struct inode *inode) -{ - if (S_ISFIFO(inode->i_mode)) - return inode->i_pipe; - - return NULL; -} /* * Determine where to splice to/from. @@ -1336,8 +1324,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, loff_t offset, *off; long ret; - ipipe = pipe_info(in->f_path.dentry->d_inode); - opipe = pipe_info(out->f_path.dentry->d_inode); + ipipe = get_pipe_info(in); + opipe = get_pipe_info(out); if (ipipe && opipe) { if (off_in || off_out) @@ -1555,7 +1543,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, int error; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -1642,7 +1630,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, }; long ret; - pipe = pipe_info(file->f_path.dentry->d_inode); + pipe = get_pipe_info(file); if (!pipe) return -EBADF; @@ -2022,8 +2010,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, static long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); - struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); + struct pipe_inode_info *ipipe = get_pipe_info(in); + struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; /* diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c9af48f..691f612 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -934,7 +934,6 @@ xfs_aops_discard_page( struct xfs_inode *ip = XFS_I(inode); struct buffer_head *bh, *head; loff_t offset = page_offset(page); - ssize_t len = 1 << inode->i_blkbits; if (!xfs_is_delayed_page(page, IO_DELAY)) goto out_invalidate; @@ -949,58 +948,14 @@ xfs_aops_discard_page( xfs_ilock(ip, XFS_ILOCK_EXCL); bh = head = page_buffers(page); do { - int done; - xfs_fileoff_t offset_fsb; - xfs_bmbt_irec_t imap; - int nimaps = 1; int error; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; + xfs_fileoff_t start_fsb; if (!buffer_delay(bh)) goto next_buffer; - offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); - - /* - * Map the range first and check that it is a delalloc extent - * before trying to unmap the range. Otherwise we will be - * trying to remove a real extent (which requires a - * transaction) or a hole, which is probably a bad idea... - */ - error = xfs_bmapi(NULL, ip, offset_fsb, 1, - XFS_BMAPI_ENTIRE, NULL, 0, &imap, - &nimaps, NULL); - - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, - "page discard failed delalloc mapping lookup."); - } - break; - } - if (!nimaps) { - /* nothing there */ - goto next_buffer; - } - if (imap.br_startblock != DELAYSTARTBLOCK) { - /* been converted, ignore */ - goto next_buffer; - } - WARN_ON(imap.br_blockcount == 0); - - /* - * Note: while we initialise the firstblock/flist pair, they - * should never be used because blocks should never be - * allocated or freed for a delalloc extent and hence we need - * don't cancel or finish them after the xfs_bunmapi() call. - */ - xfs_bmap_init(&flist, &firstblock); - error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock, - &flist, &done); - - ASSERT(!flist.xbf_count && !flist.xbf_first); + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); if (error) { /* something screwed, just bail */ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { @@ -1010,7 +965,7 @@ xfs_aops_discard_page( break; } next_buffer: - offset += len; + offset += 1 << inode->i_blkbits; } while ((bh = bh->b_this_page) != head); @@ -1111,11 +1066,12 @@ xfs_vm_writepage( uptodate = 0; /* - * A hole may still be marked uptodate because discard_buffer - * leaves the flag set. + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. */ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - ASSERT(!buffer_dirty(bh)); imap_valid = 0; continue; } @@ -1504,11 +1460,42 @@ xfs_vm_write_failed( struct inode *inode = mapping->host; if (to > inode->i_size) { - struct iattr ia = { - .ia_valid = ATTR_SIZE | ATTR_FORCE, - .ia_size = inode->i_size, - }; - xfs_setattr(XFS_I(inode), &ia, XFS_ATTR_NOLOCK); + /* + * punch out the delalloc blocks we have already allocated. We + * don't call xfs_setattr() to do this as we may be in the + * middle of a multi-iovec write and so the vfs inode->i_size + * will not match the xfs ip->i_size and so it will zero too + * much. Hence we jus truncate the page cache to zero what is + * necessary and punch the delalloc blocks directly. + */ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + truncate_pagecache(inode, to, inode->i_size); + + /* + * Check if there are any blocks that are outside of i_size + * that need to be trimmed back. + */ + start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1; + end_fsb = XFS_B_TO_FSB(ip->i_mount, to); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); } } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 63fd2c0..4c5deb6 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -488,29 +488,16 @@ found: spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); - /* Attempt to get the semaphore without sleeping, - * if this does not work then we need to drop the - * spinlock and do a hard attempt on the semaphore. - */ - if (down_trylock(&bp->b_sema)) { + if (xfs_buf_cond_lock(bp)) { + /* failed, so wait for the lock if requested. */ if (!(flags & XBF_TRYLOCK)) { - /* wait for buffer ownership */ xfs_buf_lock(bp); XFS_STATS_INC(xb_get_locked_waited); } else { - /* We asked for a trylock and failed, no need - * to look at file offset and length here, we - * know that this buffer at least overlaps our - * buffer and is locked, therefore our buffer - * either does not exist, or is this buffer. - */ xfs_buf_rele(bp); XFS_STATS_INC(xb_busy_locked); return NULL; } - } else { - /* trylock worked */ - XB_SET_OWNER(bp); } if (bp->b_flags & XBF_STALE) { @@ -876,10 +863,18 @@ xfs_buf_rele( */ /* - * Locks a buffer object, if it is not already locked. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. + * Locks a buffer object, if it is not already locked. Note that this in + * no way locks the underlying pages, so it is only useful for + * synchronizing concurrent use of buffer objects, not for synchronizing + * independent access to the underlying pages. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. */ int xfs_buf_cond_lock( @@ -890,6 +885,8 @@ xfs_buf_cond_lock( locked = down_trylock(&bp->b_sema) == 0; if (locked) XB_SET_OWNER(bp); + else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); trace_xfs_buf_cond_lock(bp, _RET_IP_); return locked ? 0 : -EBUSY; @@ -1781,7 +1778,6 @@ xfs_buf_delwri_split( INIT_LIST_HEAD(list); spin_lock(dwlk); list_for_each_entry_safe(bp, n, dwq, b_list) { - trace_xfs_buf_delwri_split(bp, _RET_IP_); ASSERT(bp->b_flags & XBF_DELWRI); if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) { @@ -1795,6 +1791,7 @@ xfs_buf_delwri_split( _XBF_RUN_QUEUES); bp->b_flags |= XBF_WRITE; list_move_tail(&bp->b_list, list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); } else skipped++; } diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 2ea238f6..ad442d9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -416,7 +416,7 @@ xfs_attrlist_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); + kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL); if (!kbuf) goto out_dput; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 96107ef..94d5fd6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -762,7 +762,8 @@ xfs_setup_inode( inode->i_state = I_NEW; inode_sb_list_add(inode); - insert_inode_hash(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); inode->i_mode = ip->i_d.di_mode; inode->i_nlink = ip->i_d.di_nlink; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78f..064f964 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -353,9 +353,6 @@ xfs_parseargs( mp->m_qflags &= ~XFS_OQUOTA_ENFD; } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { mp->m_flags |= XFS_MOUNT_DELAYLOG; - cmn_err(CE_WARN, - "Enabling EXPERIMENTAL delayed logging feature " - "- use at your own risk.\n"); } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 37d3325..afb0d7c 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -853,6 +853,7 @@ restart: if (trylock) { if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { skipped++; + xfs_perag_put(pag); continue; } first_index = pag->pag_ici_reclaim_cursor; diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8abd12e..4111cd3 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5471,8 +5471,13 @@ xfs_getbmap( if (error) goto out_unlock_iolock; } - - ASSERT(ip->i_delayed_blks == 0); + /* + * even after flushing the inode, there can still be delalloc + * blocks on the inode beyond EOF due to speculative + * preallocation. These are not removed until the release + * function is called or the inode is inactivated. Hence we + * cannot assert here that ip->i_delayed_blks == 0. + */ } lock = xfs_ilock_map_shared(ip); @@ -6070,3 +6075,79 @@ xfs_bmap_disk_count_leaves( *count += xfs_bmbt_disk_get_blockcount(frp); } } + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will alays punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi(NULL, ip, start_fsb, 1, + XFS_BMAPI_ENTIRE, NULL, 0, &imap, + &nimaps, NULL); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 71ec9b6..3651191 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -394,6 +394,11 @@ xfs_bmap_count_blocks( int whichfork, int *count); +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length); #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3b9582c..e60490b 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -377,6 +377,19 @@ xfs_swap_extents( ip->i_d.di_format = tip->i_d.di_format; tip->i_d.di_format = tmp; + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + ilf_fields = XFS_ILOG_CORE; switch(ip->i_d.di_format) { diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ed99902..c78cc6a 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -58,6 +58,7 @@ xfs_error_trap(int e) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; +int xfs_error_test_active; int xfs_error_test(int error_tag, int *fsidp, char *expression, @@ -108,6 +109,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) len = strlen(mp->m_fsname); xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; return 0; } } @@ -137,6 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud) xfs_etest_fsid[i] = 0LL; kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; } } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c2c1a07..f338847 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -127,13 +127,14 @@ extern void xfs_corruption_error(const char *tag, int level, #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT #ifdef DEBUG +extern int xfs_error_test_active; extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); #define XFS_NUM_INJECT_ERROR 10 #define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ + ((expr) || (xfs_error_test_active && \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf))) + (rf)))) extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 9b715dc..9124425 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -744,9 +744,15 @@ xfs_filestream_new_ag( * If the file's parent directory is known, take its iolock in exclusive * mode to prevent two sibling files from racing each other to migrate * themselves and their parent to different AGs. + * + * Note that we lock the parent directory iolock inside the child + * iolock here. That's fine as we never hold both parent and child + * iolock in any other place. This is different from the ilock, + * which requires locking of the child after the parent for namespace + * operations. */ if (pip) - xfs_ilock(pip, XFS_IOLOCK_EXCL); + xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); /* * A new AG needs to be found for the file. If the file's parent diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index c7ac020..7c8d30c 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -657,18 +657,37 @@ xfs_inode_item_unlock( } /* - * This is called to find out where the oldest active copy of the - * inode log item in the on disk log resides now that the last log - * write of it completed at the given lsn. Since we always re-log - * all dirty data in an inode, the latest copy in the on disk log - * is the only one that matters. Therefore, simply return the - * given lsn. + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completions before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, return a lower LSN than the one passed in so that the + * transaction committed code will not move the inode forward in the AIL but + * will still unpin it properly. */ STATIC xfs_lsn_t xfs_inode_item_committed( struct xfs_log_item *lip, xfs_lsn_t lsn) { + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + if (xfs_iflags_test(ip, XFS_ISTALE)) + return lsn - 1; return lsn; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b1498ab..19e9dfa 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -275,6 +275,7 @@ xfs_free_perag( pag = radix_tree_delete(&mp->m_perag_tree, agno); spin_unlock(&mp->m_perag_lock); ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); call_rcu(&pag->rcu_head, __xfs_free_perag); } } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index e0e64b1..9bb6eda 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -346,8 +346,17 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) #define xfs_trans_apply_dquot_deltas(tp) #define xfs_trans_unreserve_and_mod_dquots(tp) -#define xfs_trans_reserve_quota_nblks(tp, ip, nblks, ninos, flags) (0) -#define xfs_trans_reserve_quota_bydquots(tp, mp, u, g, nb, ni, fl) (0) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, long nblks, long nions, uint flags) +{ + return 0; +} #define xfs_qm_vop_create_dqattach(tp, ip, u, g) #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) @@ -357,11 +366,14 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, uid_t uid, gid_t gid, prid_t prid, #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) #define xfs_qm_statvfs(ip, s) -#define xfs_qm_sync(mp, fl) (0) +static inline int xfs_qm_sync(struct xfs_mount *mp, int flags) +{ + return 0; +} #define xfs_qm_newmount(mp, a, b) (0) #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) -#define xfs_qm_unmount_quotas(mp) (0) +#define xfs_qm_unmount_quotas(mp) #endif /* CONFIG_XFS_QUOTA */ #define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index d2af0a8..77a5989 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -297,6 +297,7 @@ xfs_rename( * it and some incremental backup programs won't work without it. */ xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); /* * Adjust the link count on src_dp. This is necessary when |