diff options
Diffstat (limited to 'fs')
97 files changed, 2336 insertions, 1559 deletions
@@ -47,7 +47,7 @@ config FS_POSIX_ACL def_bool n config EXPORTFS - tristate + bool config FILE_LOCKING bool "Enable POSIX file locking API" if EXPERT diff --git a/fs/Makefile b/fs/Makefile index a7f7cef..ba01202 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o +obj-$(CONFIG_FHANDLE) += fhandle.o + obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ diff --git a/fs/afs/write.c b/fs/afs/write.c index 15690bb..789b3af 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, candidate->first = candidate->last = index; candidate->offset_first = from; candidate->to_last = to; + INIT_LIST_HEAD(&candidate->link); candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); @@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, ctx_rcu_free); } -#define get_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - atomic_inc(&(kioctx)->users); \ -} while (0) -#define put_ioctx(kioctx) do { \ - BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ - if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ - __put_ioctx(kioctx); \ -} while (0) +static inline void get_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + atomic_inc(&kioctx->users); +} + +static inline int try_get_ioctx(struct kioctx *kioctx) +{ + return atomic_inc_not_zero(&kioctx->users); +} + +static inline void put_ioctx(struct kioctx *kioctx) +{ + BUG_ON(atomic_read(&kioctx->users) <= 0); + if (unlikely(atomic_dec_and_test(&kioctx->users))) + __put_ioctx(kioctx); +} /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. @@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) rcu_read_lock(); hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id && !ctx->dead) { - get_ioctx(ctx); + /* + * RCU protects us against accessing freed memory but + * we have to be careful not to get a reference when the + * reference count already dropped to 0 (ctx->dead test + * is unreliable because of races). + */ + if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){ ret = ctx; break; } @@ -1629,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; spin_lock_irq(&ctx->ctx_lock); + /* + * We could have raced with io_destroy() and are currently holding a + * reference to ctx which should be destroyed. We cannot submit IO + * since ctx gets freed as soon as io_submit() puts its reference. The + * check here is reliable: io_destroy() sets ctx->dead before waiting + * for outstanding IO and the barrier between these two is realized by + * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we + * increment ctx->reqs_active before checking for ctx->dead and the + * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we + * don't see ctx->dead set here, io_destroy() waits for our IO to + * finish. + */ + if (ctx->dead) { + spin_unlock_irq(&ctx->ctx_lock); + ret = -EINVAL; + goto out_put_req; + } aio_run_iocb(req); if (!list_empty(&ctx->run_list)) { /* drain the run list */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 4fb8a34..8892870 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; + /* + * bdev could be deleted beneath us which would implicitly destroy + * the holder directory. Hold on to it. + */ + kobject_get(bdev->bd_part->holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); } @@ -922,14 +928,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * flush_disk - invalidates all buffer-cache entries on a disk * * @bdev: struct block device to be flushed + * @kill_dirty: flag to guide handling of dirty inodes * * Invalidates all buffer-cache entries on a disk. It should be called * when a disk has been changed -- either by a media change or online * resize. */ -static void flush_disk(struct block_device *bdev) +static void flush_disk(struct block_device *bdev, bool kill_dirty) { - if (__invalidate_device(bdev)) { + if (__invalidate_device(bdev, kill_dirty)) { char name[BDEVNAME_SIZE] = ""; if (bdev->bd_disk) @@ -966,7 +973,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); - flush_disk(bdev); + flush_disk(bdev, false); } } EXPORT_SYMBOL(check_disk_size_change); @@ -1019,7 +1026,7 @@ int check_disk_change(struct block_device *bdev) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return 0; - flush_disk(bdev); + flush_disk(bdev, true); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); return 1; @@ -1600,7 +1607,7 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -int __invalidate_device(struct block_device *bdev) +int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; @@ -1613,7 +1620,7 @@ int __invalidate_device(struct block_device *bdev) * hold). */ shrink_dcache_sb(sb); - res = invalidate_inodes(sb); + res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2c98b3a..7f78cc7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -729,6 +729,15 @@ struct btrfs_space_info { u64 disk_total; /* total bytes on disk, takes mirrors into account */ + /* + * we bump reservation progress every time we decrement + * bytes_reserved. This way people waiting for reservations + * know something good has happened and they can check + * for progress. The number here isn't to be trusted, it + * just shows reclaim activity + */ + unsigned long reservation_progress; + int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for @@ -1254,6 +1263,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) +#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2218,6 +2228,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end); int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, u64 num_bytes); +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index ff27d7a..b4ffad8 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, int len = *max_len; int type; - if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || - (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) { + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return 255; + } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { + *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; + return 255; + } len = BTRFS_FID_SIZE_NON_CONNECTABLE; type = FILEID_BTRFS_WITHOUT_PARENT; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f3c96fc..7b3089b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3342,15 +3342,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, u64 max_reclaim; u64 reclaimed = 0; long time_left; - int pause = 1; int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; int loops = 0; + unsigned long progress; block_rsv = &root->fs_info->delalloc_block_rsv; space_info = block_rsv->space_info; smp_mb(); reserved = space_info->bytes_reserved; + progress = space_info->reservation_progress; if (reserved == 0) return 0; @@ -3365,31 +3366,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans, writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); spin_lock(&space_info->lock); - if (reserved > space_info->bytes_reserved) { - loops = 0; + if (reserved > space_info->bytes_reserved) reclaimed += reserved - space_info->bytes_reserved; - } else { - loops++; - } reserved = space_info->bytes_reserved; spin_unlock(&space_info->lock); + loops++; + if (reserved == 0 || reclaimed >= max_reclaim) break; if (trans && trans->transaction->blocked) return -EAGAIN; - __set_current_state(TASK_INTERRUPTIBLE); - time_left = schedule_timeout(pause); + time_left = schedule_timeout_interruptible(1); /* We were interrupted, exit */ if (time_left) break; - pause <<= 1; - if (pause > HZ / 10) - pause = HZ / 10; + /* we've kicked the IO a few times, if anything has been freed, + * exit. There is no sense in looping here for a long time + * when we really need to commit the transaction, or there are + * just too many writers without enough free space + */ + + if (loops > 3) { + smp_mb(); + if (progress != space_info->reservation_progress) + break; + } } return reclaimed >= to_reclaim; @@ -3612,6 +3618,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, if (num_bytes) { spin_lock(&space_info->lock); space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; spin_unlock(&space_info->lock); } } @@ -3844,6 +3851,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; sinfo->bytes_reserved -= num_bytes; + sinfo->reservation_progress++; block_rsv->reserved = block_rsv->size; block_rsv->full = 1; } @@ -4005,7 +4013,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) to_reserve = 0; } spin_unlock(&BTRFS_I(inode)->accounting_lock); - to_reserve += calc_csum_metadata_size(inode, num_bytes); ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); if (ret) @@ -4133,6 +4140,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; cache->space_info->bytes_used += num_bytes; cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); @@ -4184,6 +4192,7 @@ static int pin_down_extent(struct btrfs_root *root, if (reserved) { cache->reserved -= num_bytes; cache->space_info->bytes_reserved -= num_bytes; + cache->space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -4234,6 +4243,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache, space_info->bytes_readonly += num_bytes; cache->reserved -= num_bytes; space_info->bytes_reserved -= num_bytes; + space_info->reservation_progress++; } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -4712,6 +4722,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, if (ret) { spin_lock(&cache->space_info->lock); cache->space_info->bytes_reserved -= buf->len; + cache->space_info->reservation_progress++; spin_unlock(&cache->space_info->lock); } goto out; @@ -5376,7 +5387,7 @@ again: num_bytes, data, 1); goto again; } - if (ret == -ENOSPC) { + if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -8065,6 +8076,13 @@ out: return ret; } +int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 type) +{ + u64 alloc_flags = get_alloc_profile(root, type); + return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); +} + /* * helper to account the unused space of all the readonly block group in the * list. takes mirrors into account. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 92ac519..714adc4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode, */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - unsigned long bits) + unsigned long bits, int contig) { struct rb_node *node; struct extent_state *state; u64 cur_start = *start; u64 total_bytes = 0; + u64 last = 0; int found = 0; if (search_end <= cur_start) { @@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree, state = rb_entry(node, struct extent_state, rb_node); if (state->start > search_end) break; - if (state->end >= cur_start && (state->state & bits)) { + if (contig && found && state->start > last + 1) + break; + if (state->end >= cur_start && (state->state & bits) == bits) { total_bytes += min(search_end, state->end) + 1 - max(cur_start, state->start); if (total_bytes >= max_bytes) @@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree, *start = state->start; found = 1; } + last = state->end; + } else if (contig && found) { + break; } node = rb_next(node); if (!node) @@ -2912,6 +2918,46 @@ out: return sector; } +/* + * helper function for fiemap, which doesn't want to see any holes. + * This maps until we find something past 'last' + */ +static struct extent_map *get_extent_skip_holes(struct inode *inode, + u64 offset, + u64 last, + get_extent_t *get_extent) +{ + u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + struct extent_map *em; + u64 len; + + if (offset >= last) + return NULL; + + while(1) { + len = last - offset; + if (len == 0) + break; + len = (len + sectorsize - 1) & ~(sectorsize - 1); + em = get_extent(inode, NULL, 0, offset, len, 0); + if (!em || IS_ERR(em)) + return em; + + /* if this isn't a hole return it */ + if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) && + em->block_start != EXTENT_MAP_HOLE) { + return em; + } + + /* this is a hole, advance to the next extent */ + offset = extent_map_end(em); + free_extent_map(em); + if (offset >= last) + break; + } + return NULL; +} + int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u32 flags = 0; u32 found_type; u64 last; + u64 last_for_get_extent = 0; u64 disko = 0; + u64 isize = i_size_read(inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_file_extent_item *item; int end = 0; - u64 em_start = 0, em_len = 0; + u64 em_start = 0; + u64 em_len = 0; + u64 em_end = 0; unsigned long emflags; - int hole = 0; if (len == 0) return -EINVAL; @@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; + /* + * lookup the last file extent. We're not using i_size here + * because there might be preallocation past i_size + */ ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, path, inode->i_ino, -1, 0); if (ret < 0) { @@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); found_type = btrfs_key_type(&found_key); - /* No extents, just return */ + /* No extents, but there might be delalloc bits */ if (found_key.objectid != inode->i_ino || found_type != BTRFS_EXTENT_DATA_KEY) { - btrfs_free_path(path); - return 0; + /* have to trust i_size as the end */ + last = (u64)-1; + last_for_get_extent = isize; + } else { + /* + * remember the start of the last extent. There are a + * bunch of different factors that go into the length of the + * extent, so its much less complex to remember where it started + */ + last = found_key.offset; + last_for_get_extent = last + 1; } - last = found_key.offset; btrfs_free_path(path); + /* + * we might have some extents allocated but more delalloc past those + * extents. so, we trust isize unless the start of the last extent is + * beyond isize + */ + if (last < isize) { + last = (u64)-1; + last_for_get_extent = isize; + } + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, &cached_state, GFP_NOFS); - em = get_extent(inode, NULL, 0, off, max - off, 0); + + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -2973,22 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } while (!end) { - hole = 0; - off = em->start + em->len; - if (off >= max) - end = 1; + u64 offset_in_extent; - if (em->block_start == EXTENT_MAP_HOLE) { - hole = 1; - goto next; - } + /* break if the extent we found is outside the range */ + if (em->start >= max || extent_map_end(em) < off) + break; - em_start = em->start; - em_len = em->len; + /* + * get_extent may return an extent that starts before our + * requested range. We have to make sure the ranges + * we return to fiemap always move forward and don't + * overlap, so adjust the offsets here + */ + em_start = max(em->start, off); + /* + * record the offset from the start of the extent + * for adjusting the disk offset below + */ + offset_in_extent = em_start - em->start; + em_end = extent_map_end(em); + em_len = em_end - em_start; + emflags = em->flags; disko = 0; flags = 0; + /* + * bump off for our next call to get_extent + */ + off = extent_map_end(em); + if (off >= max) + end = 1; + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; @@ -2999,42 +3088,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else { - disko = em->block_start; + disko = em->block_start + offset_in_extent; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; -next: - emflags = em->flags; free_extent_map(em); em = NULL; - if (!end) { - em = get_extent(inode, NULL, 0, off, max - off, 0); - if (!em) - goto out; - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out; - } - emflags = em->flags; - } - - if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { + if ((em_start >= last) || em_len == (u64)-1 || + (last == (u64)-1 && isize <= em_end)) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - if (em_start == last) { + /* now scan forward to see if this is really the last extent. */ + em = get_extent_skip_holes(inode, off, last_for_get_extent, + get_extent); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; + } + if (!em) { flags |= FIEMAP_EXTENT_LAST; end = 1; } - - if (!hole) { - ret = fiemap_fill_next_extent(fieinfo, em_start, disko, - em_len, flags); - if (ret) - goto out_free; - } + ret = fiemap_fill_next_extent(fieinfo, em_start, disko, + em_len, flags); + if (ret) + goto out_free; } out_free: free_extent_map(em); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7083cfa..9318dfe 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -191,7 +191,7 @@ void extent_io_exit(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, unsigned long bits); + u64 max_bytes, unsigned long bits, int contig); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7084140..f447b78 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -70,6 +70,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, /* Flush processor's dcache for this page */ flush_dcache_page(page); + + /* + * if we get a partial write, we can end up with + * partially up to date pages. These add + * a lot of complexity, so make sure they don't + * happen by forcing this copy to be retried. + * + * The rest of the btrfs_file_write code will fall + * back to page at a time copies after we return 0. + */ + if (!PageUptodate(page) && copied < count) + copied = 0; + iov_iter_advance(i, copied); write_bytes -= copied; total_copied += copied; @@ -763,6 +776,27 @@ out: } /* + * on error we return an unlocked page and the error value + * on success we return a locked page and 0 + */ +static int prepare_uptodate_page(struct page *page, u64 pos) +{ + int ret = 0; + + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + if (ret) + return ret; + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + return -EIO; + } + } + return 0; +} + +/* * this gets pages into the page cache and locks them down, it also properly * waits for data=ordered extents to finish before allowing the pages to be * modified. @@ -777,6 +811,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, unsigned long index = pos >> PAGE_CACHE_SHIFT; struct inode *inode = fdentry(file)->d_inode; int err = 0; + int faili = 0; u64 start_pos; u64 last_pos; @@ -794,15 +829,24 @@ again: for (i = 0; i < num_pages; i++) { pages[i] = grab_cache_page(inode->i_mapping, index + i); if (!pages[i]) { - int c; - for (c = i - 1; c >= 0; c--) { - unlock_page(pages[c]); - page_cache_release(pages[c]); - } - return -ENOMEM; + faili = i - 1; + err = -ENOMEM; + goto fail; + } + + if (i == 0) + err = prepare_uptodate_page(pages[i], pos); + if (i == num_pages - 1) + err = prepare_uptodate_page(pages[i], + pos + write_bytes); + if (err) { + page_cache_release(pages[i]); + faili = i - 1; + goto fail; } wait_on_page_writeback(pages[i]); } + err = 0; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, @@ -842,6 +886,14 @@ again: WARN_ON(!PageLocked(pages[i])); } return 0; +fail: + while (faili >= 0) { + unlock_page(pages[faili]); + page_cache_release(pages[faili]); + faili--; + } + return err; + } static ssize_t btrfs_file_aio_write(struct kiocb *iocb, @@ -851,7 +903,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = fdentry(file)->d_inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct page *pinned[2]; struct page **pages = NULL; struct iov_iter i; loff_t *ppos = &iocb->ki_pos; @@ -872,9 +923,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - pinned[0] = NULL; - pinned[1] = NULL; - start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); @@ -962,32 +1010,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, first_index = pos >> PAGE_CACHE_SHIFT; last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; - /* - * there are lots of better ways to do this, but this code - * makes sure the first and last page in the file range are - * up to date and ready for cow - */ - if ((pos & (PAGE_CACHE_SIZE - 1))) { - pinned[0] = grab_cache_page(inode->i_mapping, first_index); - if (!PageUptodate(pinned[0])) { - ret = btrfs_readpage(NULL, pinned[0]); - BUG_ON(ret); - wait_on_page_locked(pinned[0]); - } else { - unlock_page(pinned[0]); - } - } - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { - pinned[1] = grab_cache_page(inode->i_mapping, last_index); - if (!PageUptodate(pinned[1])) { - ret = btrfs_readpage(NULL, pinned[1]); - BUG_ON(ret); - wait_on_page_locked(pinned[1]); - } else { - unlock_page(pinned[1]); - } - } - while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); size_t write_bytes = min(iov_iter_count(&i), @@ -1024,8 +1046,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, copied = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, &i); - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + + /* + * if we have trouble faulting in the pages, fall + * back to one page at a time + */ + if (copied < write_bytes) + nrptrs = 1; + + if (copied == 0) + dirty_pages = 0; + else + dirty_pages = (copied + offset + + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; if (num_pages > dirty_pages) { if (copied > 0) @@ -1069,10 +1103,6 @@ out: err = ret; kfree(pages); - if (pinned[0]) - page_cache_release(pinned[0]); - if (pinned[1]) - page_cache_release(pinned[1]); *ppos = pos; /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fb9bd78..4a0107e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start) private = 0; if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, - (u64)-1, 1, EXTENT_DIRTY)) { + (u64)-1, 1, EXTENT_DIRTY, 0)) { ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start, &private_failure); if (ret == 0) { @@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, int err; int drop_inode = 0; - if (inode->i_nlink == 0) - return -ENOENT; - /* do not allow sys_link's with other subvols of the same device */ if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; @@ -4821,10 +4818,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; /* - * 1 item for inode ref + * 2 items for inode and inode ref * 2 items for dir items + * 1 item for parent inode */ - trans = btrfs_start_transaction(root, 3); + trans = btrfs_start_transaction(root, 5); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto fail; @@ -5280,6 +5278,128 @@ out: return em; } +struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + struct extent_map *em; + struct extent_map *hole_em = NULL; + u64 range_start = start; + u64 end; + u64 found; + u64 found_end; + int err = 0; + + em = btrfs_get_extent(inode, page, pg_offset, start, len, create); + if (IS_ERR(em)) + return em; + if (em) { + /* + * if our em maps to a hole, there might + * actually be delalloc bytes behind it + */ + if (em->block_start != EXTENT_MAP_HOLE) + return em; + else + hole_em = em; + } + + /* check to see if we've wrapped (len == -1 or similar) */ + end = start + len; + if (end < start) + end = (u64)-1; + else + end -= 1; + + em = NULL; + + /* ok, we didn't find anything, lets look for delalloc */ + found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start, + end, len, EXTENT_DELALLOC, 1); + found_end = range_start + found; + if (found_end < range_start) + found_end = (u64)-1; + + /* + * we didn't find anything useful, return + * the original results from get_extent() + */ + if (range_start > end || found_end <= start) { + em = hole_em; + hole_em = NULL; + goto out; + } + + /* adjust the range_start to make sure it doesn't + * go backwards from the start they passed in + */ + range_start = max(start,range_start); + found = found_end - range_start; + + if (found > 0) { + u64 hole_start = start; + u64 hole_len = len; + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + /* + * when btrfs_get_extent can't find anything it + * returns one huge hole + * + * make sure what it found really fits our range, and + * adjust to make sure it is based on the start from + * the caller + */ + if (hole_em) { + u64 calc_end = extent_map_end(hole_em); + + if (calc_end <= start || (hole_em->start > end)) { + free_extent_map(hole_em); + hole_em = NULL; + } else { + hole_start = max(hole_em->start, start); + hole_len = calc_end - hole_start; + } + } + em->bdev = NULL; + if (hole_em && range_start > hole_start) { + /* our hole starts before our delalloc, so we + * have to return just the parts of the hole + * that go until the delalloc starts + */ + em->len = min(hole_len, + range_start - hole_start); + em->start = hole_start; + em->orig_start = hole_start; + /* + * don't adjust block start at all, + * it is fixed at EXTENT_MAP_HOLE + */ + em->block_start = hole_em->block_start; + em->block_len = hole_len; + } else { + em->start = range_start; + em->len = found; + em->orig_start = range_start; + em->block_start = EXTENT_MAP_DELALLOC; + em->block_len = found; + } + } else if (hole_em) { + return hole_em; + } +out: + + free_extent_map(hole_em); + if (err) { + free_extent_map(em); + return ERR_PTR(err); + } + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -5934,6 +6054,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, if (!skip_sum) { dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); if (!dip->csums) { + kfree(dip); ret = -ENOMEM; goto free_ordered; } @@ -6102,7 +6223,7 @@ out: static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); + return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap); } int btrfs_readpage(struct file *file, struct page *page) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index be2d4f6..5fdb2ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (copy_from_user(&flags, arg, sizeof(flags))) return -EFAULT; - if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) + if (flags & BTRFS_SUBVOL_CREATE_ASYNC) return -EINVAL; if (flags & ~BTRFS_SUBVOL_RDONLY) return -EOPNOTSUPP; + if (!is_owner_or_cap(inode)) + return -EACCES; + down_write(&root->fs_info->subvol_sem); /* nothing to do */ @@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_reset; } - ret = btrfs_update_root(trans, root, + ret = btrfs_update_root(trans, root->fs_info->tree_root, &root->root_key, &root->root_item); btrfs_commit_transaction(trans, root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index cc9b450..a178f5e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws, unsigned long tot_out; unsigned long tot_len; char *buf; + bool may_late_unmap, need_unmap; data_in = kmap(pages_in[0]); tot_len = read_compress_length(data_in); @@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws, tot_in += in_len; working_bytes = in_len; + may_late_unmap = need_unmap = false; /* fast path: avoid using the working buffer */ if (in_page_bytes_left >= in_len) { buf = data_in + in_offset; bytes = in_len; + may_late_unmap = true; goto cont; } @@ -329,14 +332,17 @@ cont: if (working_bytes == 0 && tot_in >= tot_len) break; - kunmap(pages_in[page_in_index]); - page_in_index++; - if (page_in_index >= total_pages_in) { + if (page_in_index + 1 >= total_pages_in) { ret = -1; - data_in = NULL; goto done; } - data_in = kmap(pages_in[page_in_index]); + + if (may_late_unmap) + need_unmap = true; + else + kunmap(pages_in[page_in_index]); + + data_in = kmap(pages_in[++page_in_index]); in_page_bytes_left = PAGE_CACHE_SIZE; in_offset = 0; @@ -346,6 +352,8 @@ cont: out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, &out_len); + if (need_unmap) + kunmap(pages_in[page_in_index - 1]); if (ret != LZO_E_OK) { printk(KERN_WARNING "btrfs decompress failed\n"); ret = -1; @@ -363,8 +371,7 @@ cont: break; } done: - if (data_in) - kunmap(pages_in[page_in_index]); + kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 0825e4e..31ade58 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3654,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) u32 item_size; int ret; int err = 0; + int progress = 0; path = btrfs_alloc_path(); if (!path) @@ -3666,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } while (1) { + progress++; trans = btrfs_start_transaction(rc->extent_root, 0); BUG_ON(IS_ERR(trans)); - +restart: if (update_backref_cache(trans, &rc->backref_cache)) { btrfs_end_transaction(trans, rc->extent_root); continue; @@ -3781,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } } } + if (trans && progress && err == -ENOSPC) { + ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + rc->block_group->flags); + if (ret == 0) { + err = 0; + progress = 0; + goto restart; + } + } btrfs_release_path(rc->extent_root, path); clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a004008..d39a989 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -155,7 +155,8 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, + Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, + Opt_enospc_debug, Opt_err, }; static match_table_t tokens = { @@ -184,6 +185,7 @@ static match_table_t tokens = { {Opt_space_cache, "space_cache"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, + {Opt_enospc_debug, "enospc_debug"}, {Opt_err, NULL}, }; @@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_user_subvol_rm_allowed: btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); break; + case Opt_enospc_debug: + btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + break; case Opt_err: printk(KERN_INFO "btrfs: unrecognized mount option " "'%s'\n", p); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index af7dbca..dd13eb8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1338,11 +1338,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) ret = btrfs_shrink_device(device, 0); if (ret) - goto error_brelse; + goto error_undo; ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); if (ret) - goto error_brelse; + goto error_undo; device->in_fs_metadata = 0; @@ -1416,6 +1416,13 @@ out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); return ret; +error_undo: + if (device->writeable) { + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->rw_devices++; + } + goto error_brelse; } /* @@ -1633,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; - device->mode = 0; + device->mode = FMODE_EXCL; set_blocksize(device->bdev, 4096); if (seeding_dev) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f0aef78..ebafa65 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -60,7 +60,6 @@ int ceph_init_dentry(struct dentry *dentry) } di->dentry = dentry; di->lease_session = NULL; - di->parent_inode = igrab(dentry->d_parent->d_inode); dentry->d_fsdata = di; dentry->d_time = jiffies; ceph_dentry_lru_add(dentry); @@ -410,7 +409,7 @@ more: spin_lock(&inode->i_lock); if (ci->i_release_count == fi->dir_release_count) { dout(" marking %p complete\n", inode); - ci->i_ceph_flags |= CEPH_I_COMPLETE; + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ ci->i_max_offset = filp->f_pos; } spin_unlock(&inode->i_lock); @@ -497,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, /* .snap dir? */ if (err == -ENOENT && + ceph_snap(parent) == CEPH_NOSNAP && strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { struct inode *inode = ceph_get_snapdir(parent); @@ -993,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *dir; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; dir = dentry->d_parent->d_inode; @@ -1030,28 +1030,8 @@ out_touch: static void ceph_dentry_release(struct dentry *dentry) { struct ceph_dentry_info *di = ceph_dentry(dentry); - struct inode *parent_inode = NULL; - u64 snapid = CEPH_NOSNAP; - if (!IS_ROOT(dentry)) { - parent_inode = di->parent_inode; - if (parent_inode) - snapid = ceph_snap(parent_inode); - } - dout("dentry_release %p parent %p\n", dentry, parent_inode); - if (parent_inode && snapid != CEPH_SNAPDIR) { - struct ceph_inode_info *ci = ceph_inode(parent_inode); - - spin_lock(&parent_inode->i_lock); - if (ci->i_shared_gen == di->lease_shared_gen || - snapid <= CEPH_MAXSNAP) { - dout(" clearing %p complete (d_release)\n", - parent_inode); - ci->i_ceph_flags &= ~CEPH_I_COMPLETE; - ci->i_release_count++; - } - spin_unlock(&parent_inode->i_lock); - } + dout("dentry_release %p\n", dentry); if (di) { ceph_dentry_lru_del(dentry); if (di->lease_session) @@ -1059,8 +1039,6 @@ static void ceph_dentry_release(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); dentry->d_fsdata = NULL; } - if (parent_inode) - iput(parent_inode); } static int ceph_snapdir_d_revalidate(struct dentry *dentry, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5625463..193bfa5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode, (issued & CEPH_CAP_FILE_EXCL) == 0 && (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); - ci->i_ceph_flags |= CEPH_I_COMPLETE; + /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */ ci->i_max_offset = 2; } break; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 88fcaa2..20b907d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -207,7 +207,6 @@ struct ceph_dentry_info { struct dentry *dentry; u64 time; u64 offset; - struct inode *parent_inode; }; struct ceph_inode_xattrs_info { diff --git a/fs/compat.c b/fs/compat.c index f6fd0a0..c6d31a3 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs * */ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - path_put(&path); - } + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) { - struct file * file; struct kstatfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + int error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); - fput(file); -out: return error; } @@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct path path; + struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct kstatfs tmp; - error = vfs_statfs(&path, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - path_put(&path); - } + error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); return error; } asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) { - struct file * file; struct kstatfs tmp; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = vfs_statfs(&file->f_path, &tmp); + error = fd_statfs(fd, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); - fput(file); -out: return error; } @@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_readv(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PREAD) + ret = compat_readv(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec, file = fget_light(fd, &fput_needed); if (!file) return -EBADF; - ret = compat_writev(file, vec, vlen, &pos); + ret = -ESPIPE; + if (file->f_mode & FMODE_PWRITE) + ret = compat_writev(file, vec, vlen, &pos); fput_light(file, fput_needed); return ret; } @@ -2308,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd, } #endif /* CONFIG_TIMERFD */ + +#ifdef CONFIG_FHANDLE +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. + */ +asmlinkage long +compat_sys_open_by_handle_at(int mountdirfd, + struct file_handle __user *handle, int flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/dcache.c b/fs/dcache.c index 2a6bd9a..a39fe47 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) __releases(parent->d_lock) __releases(dentry->d_inode->i_lock) { - dentry->d_parent = NULL; list_del(&dentry->d_u.d_child); + /* + * Inform try_to_ascend() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DISCONNECTED; if (parent) spin_unlock(&parent->d_lock); dentry_iput(dentry); @@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb) } /* + * This tries to ascend one level of parenthood, but + * we can race with renaming, so we need to re-check + * the parenthood after dropping the lock and check + * that the sequence number still matches. + */ +static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq) +{ + struct dentry *new = old->d_parent; + + rcu_read_lock(); + spin_unlock(&old->d_lock); + spin_lock(&new->d_lock); + + /* + * might go back up the wrong parent if we have had a rename + * or deletion + */ + if (new != old->d_parent || + (old->d_flags & DCACHE_DISCONNECTED) || + (!locked && read_seqretry(&rename_lock, seq))) { + spin_unlock(&new->d_lock); + new = NULL; + } + rcu_read_unlock(); + return new; +} + + +/* * Search for at least 1 mount point in the dentry's subdirs. * We descend to the next level whenever the d_subdirs * list is non-empty and continue searching. @@ -1066,24 +1099,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1181,24 +1200,10 @@ resume: * All done at this level ... ascend and resume the search. */ if (this_parent != parent) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + struct dentry *child = this_parent; + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } @@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode) } EXPORT_SYMBOL(d_alloc_root); +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (list_empty(&inode->i_dentry)) + return NULL; + alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); + __dget(alias); + return alias; +} + +static struct dentry * d_find_any_alias(struct inode *inode) +{ + struct dentry *de; + + spin_lock(&inode->i_lock); + de = __d_find_any_alias(inode); + spin_unlock(&inode->i_lock); + return de; +} + + /** * d_obtain_alias - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for @@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode) if (IS_ERR(inode)) return ERR_CAST(inode); - res = d_find_alias(inode); + res = d_find_any_alias(inode); if (res) goto out_iput; @@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode) spin_lock(&inode->i_lock); - res = __d_find_alias(inode, 0); + res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(tmp); @@ -2920,28 +2947,14 @@ resume: spin_unlock(&dentry->d_lock); } if (this_parent != root) { - struct dentry *tmp; - struct dentry *child; - - tmp = this_parent->d_parent; + struct dentry *child = this_parent; if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { this_parent->d_flags |= DCACHE_GENOCIDE; this_parent->d_count--; } - rcu_read_lock(); - spin_unlock(&this_parent->d_lock); - child = this_parent; - this_parent = tmp; - spin_lock(&this_parent->d_lock); - /* might go back up the wrong parent if we have had a rename - * or deletion */ - if (this_parent != child->d_parent || - (!locked && read_seqretry(&rename_lock, seq))) { - spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); + this_parent = try_to_ascend(this_parent, locked, seq); + if (!this_parent) goto rename_retry; - } - rcu_read_unlock(); next = child->d_u.d_child.next; goto resume; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 267d0ad..4a09af9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -63,6 +63,13 @@ * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). + * It is also acquired when inserting an epoll fd onto another epoll + * fd. We do this so that we walk the epoll tree and ensure that this + * insertion does not create a cycle of epoll file descriptors, which + * could lead to deadlock. We need a global mutex to prevent two + * simultaneous inserts (A into B and B into A) from racing and + * constructing a cycle without either insert observing that it is + * going to. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. @@ -224,6 +231,9 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex); +/* Used to check for epoll file descriptor inclusion loops */ +static struct nested_calls poll_loop_ncalls; + /* Used for safe wake up implementation */ static struct nested_calls poll_safewake_ncalls; @@ -1198,6 +1208,62 @@ retry: return res; } +/** + * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested() + * API, to verify that adding an epoll file inside another + * epoll structure, does not violate the constraints, in + * terms of closed loops, or too deep chains (which can + * result in excessive stack usage). + * + * @priv: Pointer to the epoll file to be currently checked. + * @cookie: Original cookie for this call. This is the top-of-the-chain epoll + * data structure pointer. + * @call_nests: Current dept of the @ep_call_nested() call stack. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct eventpoll *ep = file->private_data; + struct rb_node *rbp; + struct epitem *epi; + + mutex_lock(&ep->mtx); + for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + epi = rb_entry(rbp, struct epitem, rbn); + if (unlikely(is_file_epoll(epi->ffd.file))) { + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, epi->ffd.file, + epi->ffd.file->private_data, current); + if (error != 0) + break; + } + } + mutex_unlock(&ep->mtx); + + return error; +} + +/** + * ep_loop_check - Performs a check to verify that adding an epoll file (@file) + * another epoll file (represented by @ep) does not create + * closed loops or too deep chains. + * + * @ep: Pointer to the epoll private data structure. + * @file: Pointer to the epoll file to be checked. + * + * Returns: Returns zero if adding the epoll @file inside current epoll + * structure @ep does not violate the constraints, or -1 otherwise. + */ +static int ep_loop_check(struct eventpoll *ep, struct file *file) +{ + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + ep_loop_check_proc, file, ep, current); +} + /* * Open an eventpoll file descriptor. */ @@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { int error; + int did_lock_epmutex = 0; struct file *file, *tfile; struct eventpoll *ep; struct epitem *epi; @@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, */ ep = file->private_data; + /* + * When we insert an epoll file descriptor, inside another epoll file + * descriptor, there is the change of creating closed loops, which are + * better be handled here, than in more critical paths. + * + * We hold epmutex across the loop check and the insert in this case, in + * order to prevent two separate inserts from racing and each doing the + * insert "at the same time" such that ep_loop_check passes on both + * before either one does the insert, thereby creating a cycle. + */ + if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + mutex_lock(&epmutex); + did_lock_epmutex = 1; + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } + + mutex_lock(&ep->mtx); /* @@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: + if (unlikely(did_lock_epmutex)) + mutex_unlock(&epmutex); + fput(tfile); error_fput: fput(file); @@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void) EP_ITEM_COST; BUG_ON(max_user_watches < 0); + /* + * Initialize the structure used to perform epoll file descriptor + * inclusion loops checks. + */ + ep_nested_calls_init(&poll_loop_ncalls); + /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); @@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) struct file *file; char *tmp = getname(library); int error = PTR_ERR(tmp); + static const struct open_flags uselib_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; if (IS_ERR(tmp)) goto out; - file = do_filp_open(AT_FDCWD, tmp, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_READ | MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) @@ -721,10 +724,13 @@ struct file *open_exec(const char *name) { struct file *file; int err; + static const struct open_flags open_exec_flags = { + .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, + .acc_mode = MAY_EXEC | MAY_OPEN, + .intent = LOOKUP_OPEN + }; - file = do_filp_open(AT_FDCWD, name, - O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0, - MAY_EXEC | MAY_OPEN); + file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW); if (IS_ERR(file)) goto out; diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 264e95d..4d70db1 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = exofs_find_entry(new_dir, new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); err = exofs_set_link(new_dir, new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME; if (dir_de) @@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= EXOFS_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = exofs_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_ctime = CURRENT_TIME; exofs_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 4b68257..b05acb7 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid, struct inode * inode = dentry->d_inode; int len = *max_len; int type = FILEID_INO32_GEN; - - if (len < 2 || (connectable && len < 4)) + + if (connectable && (len < 4)) { + *max_len = 4; + return 255; + } else if (len < 2) { + *max_len = 2; return 255; + } len = 2; fid->i32.ino = inode->i_ino; @@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, /* * Try to get any dentry for the given file handle from the filesystem. */ + if (!nop || !nop->fh_to_dentry) + return ERR_PTR(-ESTALE); result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); if (!result) result = ERR_PTR(-ESTALE); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 2e1d834..adb9185 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); ext2_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (new_dir->i_nlink >= EXT2_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = ext2_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(old_inode); ext2_delete_entry (old_de, old_page); - inode_dec_link_count(old_inode); if (dir_de) { if (old_dir != new_dir) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index b27ba71..561f692 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 85c8cc8..9cc19a1d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->s_qcop = &ext3_qctl_operations; sb->dq_op = &ext3_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5485390..e781b7e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry, dquot_initialize(dir); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - */ - if (inode->i_nlink == 0) - return -ENOENT; - retry: handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f6a318f..5977b35 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &ext4_qctl_operations; sb->dq_op = &ext4_quota_operations; #endif + memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); mutex_init(&sbi->s_resize_lock); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 86753fe..0e277ec 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable) struct inode *inode = de->d_inode; u32 ipos_h, ipos_m, ipos_l; - if (len < 5) + if (len < 5) { + *lenp = 5; return 255; /* no room */ + } ipos_h = MSDOS_I(inode)->i_pos >> 8; ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index f88f752..adae3fb 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* This is not negative dentry. Always valid. */ @@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* @@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; - struct file *file = fget(fildes); + struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd(); @@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, return err; } +static int check_fcntl_cmd(unsigned cmd) +{ + switch (cmd) { + case F_DUPFD: + case F_DUPFD_CLOEXEC: + case F_GETFD: + case F_SETFD: + case F_GETFL: + return 1; + } + return 0; +} + SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct file *filp; long err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, long err; err = -EBADF; - filp = fget(fd); + filp = fget_raw(fd); if (!filp) goto out; + if (unlikely(filp->f_mode & FMODE_PATH)) { + if (!check_fcntl_cmd(cmd)) { + fput(filp); + goto out; + } + } + err = security_file_fcntl(filp, cmd, arg); if (err) { fput(filp); @@ -808,14 +835,14 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( + BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | O_APPEND | /* O_NONBLOCK | */ __O_SYNC | O_DSYNC | FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC + __FMODE_EXEC | O_PATH )); fasync_cache = kmem_cache_create("fasync_cache", diff --git a/fs/fhandle.c b/fs/fhandle.c new file mode 100644 index 0000000..bf93ad2 --- /dev/null +++ b/fs/fhandle.c @@ -0,0 +1,265 @@ +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/exportfs.h> +#include <linux/fs_struct.h> +#include <linux/fsnotify.h> +#include <asm/uaccess.h> +#include "internal.h" + +static long do_sys_name_to_handle(struct path *path, + struct file_handle __user *ufh, + int __user *mnt_id) +{ + long retval; + struct file_handle f_handle; + int handle_dwords, handle_bytes; + struct file_handle *handle = NULL; + + /* + * We need t make sure wether the file system + * support decoding of the file handle + */ + if (!path->mnt->mnt_sb->s_export_op || + !path->mnt->mnt_sb->s_export_op->fh_to_dentry) + return -EOPNOTSUPP; + + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) + return -EFAULT; + + if (f_handle.handle_bytes > MAX_HANDLE_SZ) + return -EINVAL; + + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) + return -ENOMEM; + + /* convert handle size to multiple of sizeof(u32) */ + handle_dwords = f_handle.handle_bytes >> 2; + + /* we ask for a non connected handle */ + retval = exportfs_encode_fh(path->dentry, + (struct fid *)handle->f_handle, + &handle_dwords, 0); + handle->handle_type = retval; + /* convert handle size to bytes */ + handle_bytes = handle_dwords * sizeof(u32); + handle->handle_bytes = handle_bytes; + if ((handle->handle_bytes > f_handle.handle_bytes) || + (retval == 255) || (retval == -ENOSPC)) { + /* As per old exportfs_encode_fh documentation + * we could return ENOSPC to indicate overflow + * But file system returned 255 always. So handle + * both the values + */ + /* + * set the handle size to zero so we copy only + * non variable part of the file_handle + */ + handle_bytes = 0; + retval = -EOVERFLOW; + } else + retval = 0; + /* copy the mount id */ + if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) || + copy_to_user(ufh, handle, + sizeof(struct file_handle) + handle_bytes)) + retval = -EFAULT; + kfree(handle); + return retval; +} + +/** + * sys_name_to_handle_at: convert name to handle + * @dfd: directory relative to which name is interpreted if not absolute + * @name: name that should be converted to handle. + * @handle: resulting file handle + * @mnt_id: mount id of the file system containing the file + * @flag: flag value to indicate whether to follow symlink or not + * + * @handle->handle_size indicate the space available to store the + * variable part of the file handle in bytes. If there is not + * enough space, the field is updated to return the minimum + * value required. + */ +SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, + struct file_handle __user *, handle, int __user *, mnt_id, + int, flag) +{ + struct path path; + int lookup_flags; + int err; + + if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) + return -EINVAL; + + lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + err = user_path_at(dfd, name, lookup_flags, &path); + if (!err) { + err = do_sys_name_to_handle(&path, handle, mnt_id); + path_put(&path); + } + return err; +} + +static struct vfsmount *get_vfsmount_from_fd(int fd) +{ + struct path path; + + if (fd == AT_FDCWD) { + struct fs_struct *fs = current->fs; + spin_lock(&fs->lock); + path = fs->pwd; + mntget(path.mnt); + spin_unlock(&fs->lock); + } else { + int fput_needed; + struct file *file = fget_light(fd, &fput_needed); + if (!file) + return ERR_PTR(-EBADF); + path = file->f_path; + mntget(path.mnt); + fput_light(file, fput_needed); + } + return path.mnt; +} + +static int vfs_dentry_acceptable(void *context, struct dentry *dentry) +{ + return 1; +} + +static int do_handle_to_path(int mountdirfd, struct file_handle *handle, + struct path *path) +{ + int retval = 0; + int handle_dwords; + + path->mnt = get_vfsmount_from_fd(mountdirfd); + if (IS_ERR(path->mnt)) { + retval = PTR_ERR(path->mnt); + goto out_err; + } + /* change the handle size to multiple of sizeof(u32) */ + handle_dwords = handle->handle_bytes >> 2; + path->dentry = exportfs_decode_fh(path->mnt, + (struct fid *)handle->f_handle, + handle_dwords, handle->handle_type, + vfs_dentry_acceptable, NULL); + if (IS_ERR(path->dentry)) { + retval = PTR_ERR(path->dentry); + goto out_mnt; + } + return 0; +out_mnt: + mntput(path->mnt); +out_err: + return retval; +} + +static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, + struct path *path) +{ + int retval = 0; + struct file_handle f_handle; + struct file_handle *handle = NULL; + + /* + * With handle we don't look at the execute bit on the + * the directory. Ideally we would like CAP_DAC_SEARCH. + * But we don't have that + */ + if (!capable(CAP_DAC_READ_SEARCH)) { + retval = -EPERM; + goto out_err; + } + if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { + retval = -EFAULT; + goto out_err; + } + if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || + (f_handle.handle_bytes == 0)) { + retval = -EINVAL; + goto out_err; + } + handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + GFP_KERNEL); + if (!handle) { + retval = -ENOMEM; + goto out_err; + } + /* copy the full handle */ + if (copy_from_user(handle, ufh, + sizeof(struct file_handle) + + f_handle.handle_bytes)) { + retval = -EFAULT; + goto out_handle; + } + + retval = do_handle_to_path(mountdirfd, handle, path); + +out_handle: + kfree(handle); +out_err: + return retval; +} + +long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag) +{ + long retval = 0; + struct path path; + struct file *file; + int fd; + + retval = handle_to_path(mountdirfd, ufh, &path); + if (retval) + return retval; + + fd = get_unused_fd_flags(open_flag); + if (fd < 0) { + path_put(&path); + return fd; + } + file = file_open_root(path.dentry, path.mnt, "", open_flag); + if (IS_ERR(file)) { + put_unused_fd(fd); + retval = PTR_ERR(file); + } else { + retval = fd; + fsnotify_open(file); + fd_install(fd, file); + } + path_put(&path); + return retval; +} + +/** + * sys_open_by_handle_at: Open the file handle + * @mountdirfd: directory file descriptor + * @handle: file handle to be opened + * @flag: open flags. + * + * @mountdirfd indicate the directory file descriptor + * of the mount point. file handle is decoded relative + * to the vfsmount pointed by the @mountdirfd. @flags + * value is same as the open(2) flags. + */ +SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, + int, flags) +{ + long ret; + + if (force_o_largefile()) + flags |= O_LARGEFILE; + + ret = do_handle_open(mountdirfd, handle, flags); + return ret; +} diff --git a/fs/file_table.c b/fs/file_table.c index eb36b6b..74a9544 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -276,11 +276,10 @@ struct file *fget(unsigned int fd) rcu_read_lock(); file = fcheck_files(files, fd); if (file) { - if (!atomic_long_inc_not_zero(&file->f_count)) { - /* File object ref couldn't be taken */ - rcu_read_unlock(); - return NULL; - } + /* File object ref couldn't be taken */ + if (file->f_mode & FMODE_PATH || + !atomic_long_inc_not_zero(&file->f_count)) + file = NULL; } rcu_read_unlock(); @@ -289,6 +288,25 @@ struct file *fget(unsigned int fd) EXPORT_SYMBOL(fget); +struct file *fget_raw(unsigned int fd) +{ + struct file *file; + struct files_struct *files = current->files; + + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + /* File object ref couldn't be taken */ + if (!atomic_long_inc_not_zero(&file->f_count)) + file = NULL; + } + rcu_read_unlock(); + + return file; +} + +EXPORT_SYMBOL(fget_raw); + /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * @@ -313,6 +331,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed) *fput_needed = 0; if (atomic_read(&files->count) == 1) { file = fcheck_files(files, fd); + if (file && (file->f_mode & FMODE_PATH)) + file = NULL; + } else { + rcu_read_lock(); + file = fcheck_files(files, fd); + if (file) { + if (!(file->f_mode & FMODE_PATH) && + atomic_long_inc_not_zero(&file->f_count)) + *fput_needed = 1; + else + /* Didn't get the reference, someone's freed */ + file = NULL; + } + rcu_read_unlock(); + } + + return file; +} + +struct file *fget_raw_light(unsigned int fd, int *fput_needed) +{ + struct file *file; + struct files_struct *files = current->files; + + *fput_needed = 0; + if (atomic_read(&files->count) == 1) { + file = fcheck_files(files, fd); } else { rcu_read_lock(); file = fcheck_files(files, fd); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index bfed844..8bd0ef9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) { struct inode *inode; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = entry->d_inode; @@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (err) return err; - if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) - return 0; + if (attr->ia_valid & ATTR_OPEN) { + if (fc->atomic_o_trunc) + return 0; + file = NULL; + } if (attr->ia_valid & ATTR_SIZE) is_truncate = true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 95da1bc..9e0832d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } +static void fuse_release_async(struct work_struct *work) +{ + struct fuse_req *req; + struct fuse_conn *fc; + struct path path; + + req = container_of(work, struct fuse_req, misc.release.work); + path = req->misc.release.path; + fc = get_fuse_conn(path.dentry->d_inode); + + fuse_put_request(fc, req); + path_put(&path); +} + static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - path_put(&req->misc.release.path); + if (fc->destroy_req) { + /* + * If this is a fuseblk mount, then it's possible that + * releasing the path will result in releasing the + * super block and sending the DESTROY request. If + * the server is single threaded, this would hang. + * For this reason do the path_put() in a separate + * thread. + */ + atomic_inc(&req->count); + INIT_WORK(&req->misc.release.work, fuse_release_async); + schedule_work(&req->misc.release.work); + } else { + path_put(&req->misc.release.path); + } } -static void fuse_file_put(struct fuse_file *ff) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (atomic_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - req->end = fuse_release_end; - fuse_request_send_background(ff->fc, req); + if (sync) { + fuse_request_send(ff->fc, req); + path_put(&req->misc.release.path); + fuse_put_request(ff->fc, req); + } else { + req->end = fuse_release_end; + fuse_request_send_background(ff->fc, req); + } kfree(ff); } } @@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode) * Normally this will send the RELEASE request, however if * some asynchronous READ or WRITE requests are outstanding, * the sending will be delayed. + * + * Make the release synchronous if this is a fuseblk mount, + * synchronous RELEASE is allowed (and desirable) in this case + * because the server can be trusted not to screw up. */ - fuse_file_put(ff); + fuse_file_put(ff, ff->fc->destroy_req != NULL); } static int fuse_open(struct inode *inode, struct file *file) @@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) page_cache_release(page); } if (req->ff) - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_send_readpages(struct fuse_req *req, struct file *file) @@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); - fuse_file_put(req->ff); + fuse_file_put(req->ff, false); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ae5744a..d428694 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -21,6 +21,7 @@ #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/poll.h> +#include <linux/workqueue.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -262,7 +263,10 @@ struct fuse_req { /** Data for asynchronous requests */ union { struct { - struct fuse_release_in in; + union { + struct fuse_release_in in; + struct work_struct work; + }; struct path path; } release; struct fuse_init_in init_in; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 9e3f68c..051b1a0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, u64 nodeid; u32 generation; - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } nodeid = get_fuse_inode(inode)->nodeid; generation = inode->i_generation; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 4a45633..0da8da2 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd) int error; int had_lock = 0; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; parent = dget_parent(dentry); diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9023db8..b5a5e60 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len, struct super_block *sb = inode->i_sb; struct gfs2_inode *ip = GFS2_I(inode); - if (*len < GFS2_SMALL_FH_SIZE || - (connectable && *len < GFS2_LARGE_FH_SIZE)) + if (connectable && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; return 255; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return 255; + } fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 85ba027..72c31a3 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo) struct address_space *mapping = (struct address_space *)(gl + 1); gfs2_init_glock_once(gl); - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + address_space_init_once(mapping); } /** diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index afa66aa..b4d70b1 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) } /* - * hfs_unlink() + * hfs_remove() * - * This is the unlink() entry in the inode_operations structure for - * regular HFS directories. The purpose is to delete an existing - * file, given the inode for the parent directory and the name - * (and its length) of the existing file. - */ -static int hfs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode; - int res; - - inode = dentry->d_inode; - res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); - if (res) - return res; - - drop_nlink(inode); - hfs_delete_inode(inode); - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - - return res; -} - -/* - * hfs_rmdir() + * This serves as both unlink() and rmdir() in the inode_operations + * structure for regular HFS directories. The purpose is to delete + * an existing child, given the inode for the parent directory and + * the name (and its length) of the existing directory. * - * This is the rmdir() entry in the inode_operations structure for - * regular HFS directories. The purpose is to delete an existing - * directory, given the inode for the parent directory and the name - * (and its length) of the existing directory. + * HFS does not have hardlinks, so both rmdir and unlink set the + * link count to 0. The only difference is the emptiness check. */ -static int hfs_rmdir(struct inode *dir, struct dentry *dentry) +static int hfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode; + struct inode *inode = dentry->d_inode; int res; - inode = dentry->d_inode; - if (inode->i_size != 2) + if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); if (res) @@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* Unlink destination if it already exists */ if (new_dentry->d_inode) { - res = hfs_unlink(new_dir, new_dentry); + res = hfs_remove(new_dir, new_dentry); if (res) return res; } @@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = { const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, - .unlink = hfs_unlink, + .unlink = hfs_remove, .mkdir = hfs_mkdir, - .rmdir = hfs_rmdir, + .rmdir = hfs_remove, .rename = hfs_rename, .setattr = hfs_inode_setattr, }; @@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, i_callback); } +void address_space_init_once(struct address_space *mapping) +{ + memset(mapping, 0, sizeof(*mapping)); + INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); + spin_lock_init(&mapping->tree_lock); + spin_lock_init(&mapping->i_mmap_lock); + INIT_LIST_HEAD(&mapping->private_list); + spin_lock_init(&mapping->private_lock); + INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); + INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); + mutex_init(&mapping->unmap_mutex); +} +EXPORT_SYMBOL(address_space_init_once); + /* * These are initializations that only need to be done * once, because the fields are idempotent across use @@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.tree_lock); - spin_lock_init(&inode->i_data.i_mmap_lock); - INIT_LIST_HEAD(&inode->i_data.private_list); - spin_lock_init(&inode->i_data.private_lock); - INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); + address_space_init_once(&inode->i_data); i_size_ordered_init(inode); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); @@ -540,11 +548,14 @@ void evict_inodes(struct super_block *sb) /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on + * @kill_dirty: flag to guide handling of dirty inodes * * Attempts to free all inodes for a given superblock. If there were any * busy inodes return a non-zero value, else zero. + * If @kill_dirty is set, discard dirty inodes too, otherwise treat + * them as busy. */ -int invalidate_inodes(struct super_block *sb) +int invalidate_inodes(struct super_block *sb, bool kill_dirty) { int busy = 0; struct inode *inode, *next; @@ -556,6 +567,10 @@ int invalidate_inodes(struct super_block *sb) list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) continue; + if (inode->i_state & I_DIRTY && !kill_dirty) { + busy = 1; + continue; + } if (atomic_read(&inode->i_count)) { busy = 1; continue; diff --git a/fs/internal.h b/fs/internal.h index 0663568..f3d15de 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -106,10 +106,23 @@ extern void put_super(struct super_block *sb); struct nameidata; extern struct file *nameidata_to_filp(struct nameidata *); extern void release_open_intent(struct nameidata *); +struct open_flags { + int open_flag; + int mode; + int acc_mode; + int intent; +}; +extern struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int lookup_flags); +extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, + const char *, const struct open_flags *, int lookup_flags); + +extern long do_handle_open(int mountdirfd, + struct file_handle __user *ufh, int open_flag); /* * inode.c */ extern int get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); -extern int invalidate_inodes(struct super_block *); +extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index ed752cb..dd4687f 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry, * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ - - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *max_len = 5; + return 255; + } else if (len < 3) { + *max_len = 3; return 255; + } len = 3; fh32[0] = ei->i_iget5_block; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 81ead85..3f04a18 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry, if (ip->i_nlink == JFS_LINK_MAX) return -EMLINK; - if (ip->i_nlink == 0) - return -ENOENT; - dquot_initialize(dir); tid = txBegin(ip->i_sb, 0); @@ -1600,7 +1597,7 @@ out: static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; /* * This is not negative dentry. Always valid. diff --git a/fs/minix/namei.c b/fs/minix/namei.c index ce7337d..6e6777f 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, new_de = minix_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); minix_set_link(new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= info->s_link_max) goto out_dir; } - inode_inc_link_count(old_inode); err = minix_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } minix_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { minix_set_link(dir_de, dir_page, new_dir); @@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page) return retval; } -char * getname(const char __user * filename) +static char *getname_flags(const char __user * filename, int flags) { char *tmp, *result; @@ -147,14 +147,21 @@ char * getname(const char __user * filename) result = tmp; if (retval < 0) { - __putname(tmp); - result = ERR_PTR(retval); + if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { + __putname(tmp); + result = ERR_PTR(retval); + } } } audit_getname(result); return result; } +char *getname(const char __user * filename) +{ + return getname_flags(filename, 0); +} + #ifdef CONFIG_AUDITSYSCALL void putname(const char *name) { @@ -401,9 +408,11 @@ static int nameidata_drop_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; struct dentry *dentry = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -414,7 +423,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) goto err; BUG_ON(nd->inode != dentry->d_inode); spin_unlock(&dentry->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -427,7 +436,7 @@ static int nameidata_drop_rcu(struct nameidata *nd) err: spin_unlock(&dentry->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -454,9 +463,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry { struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; + int want_root = 0; BUG_ON(!(nd->flags & LOOKUP_RCU)); - if (nd->root.mnt) { + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + want_root = 1; spin_lock(&fs->lock); if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) @@ -476,7 +487,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry parent->d_count++; spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); - if (nd->root.mnt) { + if (want_root) { path_get(&nd->root); spin_unlock(&fs->lock); } @@ -490,7 +501,7 @@ err: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); err_root: - if (nd->root.mnt) + if (want_root) spin_unlock(&fs->lock); return -ECHILD; } @@ -498,8 +509,16 @@ err_root: /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) { - if (nd->flags & LOOKUP_RCU) - return nameidata_dentry_drop_rcu(nd, dentry); + if (nd->flags & LOOKUP_RCU) { + if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; + } + } return 0; } @@ -518,7 +537,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd) BUG_ON(!(nd->flags & LOOKUP_RCU)); nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; spin_lock(&dentry->d_lock); if (!__d_rcu_to_refcount(dentry, nd->seq)) goto err_unlock; @@ -539,14 +559,6 @@ err_unlock: return -ECHILD; } -/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ -static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd) -{ - if (likely(nd->flags & LOOKUP_RCU)) - return nameidata_drop_rcu_last(nd); - return 0; -} - /** * release_open_intent - free up open intent resources * @nd: pointer to nameidata @@ -590,42 +602,8 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd) return dentry; } -static inline struct dentry * -do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd) -{ - int status = d_revalidate(dentry, nd); - if (likely(status > 0)) - return dentry; - if (status == -ECHILD) { - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - return do_revalidate(dentry, nd); - } - if (status < 0) - return ERR_PTR(status); - /* Don't d_invalidate in rcu-walk mode */ - if (nameidata_dentry_drop_rcu(nd, dentry)) - return ERR_PTR(-ECHILD); - if (!d_invalidate(dentry)) { - dput(dentry); - dentry = NULL; - } - return dentry; -} - -static inline int need_reval_dot(struct dentry *dentry) -{ - if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) - return 0; - - if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) - return 0; - - return 1; -} - /* - * force_reval_path - force revalidation of a dentry + * handle_reval_path - force revalidation of a dentry * * In some situations the path walking code will trust dentries without * revalidating them. This causes problems for filesystems that depend on @@ -639,27 +617,28 @@ static inline int need_reval_dot(struct dentry *dentry) * invalidate the dentry. It's up to the caller to handle putting references * to the path if necessary. */ -static int -force_reval_path(struct path *path, struct nameidata *nd) +static inline int handle_reval_path(struct nameidata *nd) { + struct dentry *dentry = nd->path.dentry; int status; - struct dentry *dentry = path->dentry; - /* - * only check on filesystems where it's possible for the dentry to - * become stale. - */ - if (!need_reval_dot(dentry)) + if (likely(!(nd->flags & LOOKUP_JUMPED))) + return 0; + + if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE))) + return 0; + + if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))) return 0; + /* Note: we do not d_invalidate() */ status = d_revalidate(dentry, nd); if (status > 0) return 0; - if (!status) { - d_invalidate(dentry); + if (!status) status = -ESTALE; - } + return status; } @@ -728,6 +707,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); + nd->flags |= LOOKUP_JUMPED; } nd->inode = nd->path.dentry->d_inode; @@ -757,20 +737,44 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +{ + struct inode *inode = link->dentry->d_inode; + if (!IS_ERR(cookie) && inode->i_op->put_link) + inode->i_op->put_link(link->dentry, nd, cookie); + path_put(link); +} + static __always_inline int -__do_follow_link(const struct path *link, struct nameidata *nd, void **p) +follow_link(struct path *link, struct nameidata *nd, void **p) { int error; struct dentry *dentry = link->dentry; BUG_ON(nd->flags & LOOKUP_RCU); + if (unlikely(current->total_link_count >= 40)) { + *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */ + path_put_conditional(link, nd); + path_put(&nd->path); + return -ELOOP; + } + cond_resched(); + current->total_link_count++; + touch_atime(link->mnt, dentry); nd_set_link(nd, NULL); if (link->mnt == nd->path.mnt) mntget(link->mnt); + error = security_inode_follow_link(link->dentry, nd); + if (error) { + *p = ERR_PTR(error); /* no ->put_link(), please */ + path_put(&nd->path); + return error; + } + nd->last_type = LAST_BIND; *p = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(*p); @@ -780,56 +784,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p) if (s) error = __vfs_follow_link(nd, s); else if (nd->last_type == LAST_BIND) { - error = force_reval_path(&nd->path, nd); - if (error) + nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; + if (nd->inode->i_op->follow_link) { + /* stepped on a _really_ weird one */ path_put(&nd->path); + error = -ELOOP; + } } } return error; } -/* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd) -{ - void *cookie; - int err = -ELOOP; - - /* We drop rcu-walk here */ - if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) - return -ECHILD; - BUG_ON(inode != path->dentry->d_inode); - - if (current->link_count >= MAX_NESTED_LINKS) - goto loop; - if (current->total_link_count >= 40) - goto loop; - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - cond_resched(); - err = security_inode_follow_link(path->dentry, nd); - if (err) - goto loop; - current->link_count++; - current->total_link_count++; - nd->depth++; - err = __do_follow_link(path, nd, &cookie); - if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link) - path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie); - path_put(path); - current->link_count--; - nd->depth--; - return err; -loop: - path_put_conditional(path, nd); - path_put(&nd->path); - return err; -} - static int follow_up_rcu(struct path *path) { struct vfsmount *parent; @@ -1068,7 +1034,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) - return -ECHILD; + goto failed; inode = parent->d_inode; nd->path.dentry = parent; nd->seq = seq; @@ -1081,8 +1047,15 @@ static int follow_dotdot_rcu(struct nameidata *nd) } __follow_mount_rcu(nd, &nd->path, &inode, true); nd->inode = inode; - return 0; + +failed: + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); + return -ECHILD; } /* @@ -1216,68 +1189,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; - struct inode *dir; + int need_reval = 1; + int status = 1; int err; /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - err = parent->d_op->d_hash(parent, nd->inode, name); - if (err < 0) - return err; - } - - /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, we're going to * do the non-racy lookup, below. */ if (nd->flags & LOOKUP_RCU) { unsigned seq; - *inode = nd->inode; dentry = __d_lookup_rcu(parent, name, &seq, inode); - if (!dentry) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto need_lookup; - } + if (!dentry) + goto unlazy; + /* Memory barrier in read_seqcount_begin of child is enough */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate_rcu(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; - if (!(nd->flags & LOOKUP_RCU)) - goto done; + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status != -ECHILD) + need_reval = 0; + goto unlazy; + } } path->mnt = mnt; path->dentry = dentry; if (likely(__follow_mount_rcu(nd, path, inode, false))) return 0; - if (nameidata_drop_rcu(nd)) - return -ECHILD; - /* fallthru */ +unlazy: + if (dentry) { + if (nameidata_dentry_drop_rcu(nd, dentry)) + return -ECHILD; + } else { + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + } else { + dentry = __d_lookup(parent, name); } - dentry = __d_lookup(parent, name); - if (!dentry) - goto need_lookup; -found: - if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { - dentry = do_revalidate(dentry, nd); - if (!dentry) - goto need_lookup; - if (IS_ERR(dentry)) - goto fail; + +retry: + if (unlikely(!dentry)) { + struct inode *dir = parent->d_inode; + BUG_ON(nd->inode != dir); + + mutex_lock(&dir->i_mutex); + dentry = d_lookup(parent, name); + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); + if (IS_ERR(dentry)) { + mutex_unlock(&dir->i_mutex); + return PTR_ERR(dentry); + } + /* known good */ + need_reval = 0; + status = 1; + } + mutex_unlock(&dir->i_mutex); } -done: + if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) + status = d_revalidate(dentry, nd); + if (unlikely(status <= 0)) { + if (status < 0) { + dput(dentry); + return status; + } + if (!d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + need_reval = 1; + goto retry; + } + } + path->mnt = mnt; path->dentry = dentry; err = follow_managed(path, nd->flags); @@ -1287,39 +1277,113 @@ done: } *inode = path->dentry->d_inode; return 0; +} -need_lookup: - dir = parent->d_inode; - BUG_ON(nd->inode != dir); +static inline int may_lookup(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + int err = exec_permission(nd->inode, IPERM_FLAG_RCU); + if (err != -ECHILD) + return err; + if (nameidata_drop_rcu(nd)) + return -ECHILD; + } + return exec_permission(nd->inode, 0); +} - mutex_lock(&dir->i_mutex); - /* - * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore, or the first - * lookup failed due to an unrelated rename. - * - * This could use version numbering or similar to avoid unnecessary - * cache lookups, but then we'd have to do the first lookup in the - * non-racy way. However in the common case here, everything should - * be hot in cache, so would it be a big win? - */ - dentry = d_lookup(parent, name); - if (likely(!dentry)) { - dentry = d_alloc_and_lookup(parent, name, nd); - mutex_unlock(&dir->i_mutex); - if (IS_ERR(dentry)) - goto fail; - goto done; +static inline int handle_dots(struct nameidata *nd, int type) +{ + if (type == LAST_DOTDOT) { + if (nd->flags & LOOKUP_RCU) { + if (follow_dotdot_rcu(nd)) + return -ECHILD; + } else + follow_dotdot(nd); + } + return 0; +} + +static void terminate_walk(struct nameidata *nd) +{ + if (!(nd->flags & LOOKUP_RCU)) { + path_put(&nd->path); + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + br_read_unlock(vfsmount_lock); } +} + +static inline int walk_component(struct nameidata *nd, struct path *path, + struct qstr *name, int type, int follow) +{ + struct inode *inode; + int err; /* - * Uhhuh! Nasty case: the cache was re-populated while - * we waited on the semaphore. Need to revalidate. + * "." and ".." are special - ".." especially so because it has + * to be able to know about the current root directory and + * parent relationships. */ - mutex_unlock(&dir->i_mutex); - goto found; + if (unlikely(type != LAST_NORM)) + return handle_dots(nd, type); + err = do_lookup(nd, name, path, &inode); + if (unlikely(err)) { + terminate_walk(nd); + return err; + } + if (!inode) { + path_to_nameidata(path, nd); + terminate_walk(nd); + return -ENOENT; + } + if (unlikely(inode->i_op->follow_link) && follow) { + if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry)) + return -ECHILD; + BUG_ON(inode != path->dentry->d_inode); + return 1; + } + path_to_nameidata(path, nd); + nd->inode = inode; + return 0; +} -fail: - return PTR_ERR(dentry); +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. + */ +static inline int nested_symlink(struct path *path, struct nameidata *nd) +{ + int res; + + BUG_ON(nd->depth >= MAX_NESTED_LINKS); + if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { + path_put_conditional(path, nd); + path_put(&nd->path); + return -ELOOP; + } + + nd->depth++; + current->link_count++; + + do { + struct path link = *path; + void *cookie; + + res = follow_link(&link, nd, &cookie); + if (!res) + res = walk_component(nd, path, &nd->last, + nd->last_type, LOOKUP_FOLLOW); + put_link(nd, &link, cookie); + } while (res > 0); + + current->link_count--; + nd->depth--; + return res; } /* @@ -1339,30 +1403,18 @@ static int link_path_walk(const char *name, struct nameidata *nd) while (*name=='/') name++; if (!*name) - goto return_reval; - - if (nd->depth) - lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); + return 0; /* At this point we know we have a real path component. */ for(;;) { - struct inode *inode; unsigned long hash; struct qstr this; unsigned int c; + int type; nd->flags |= LOOKUP_CONTINUE; - if (nd->flags & LOOKUP_RCU) { - err = exec_permission(nd->inode, IPERM_FLAG_RCU); - if (err == -ECHILD) { - if (nameidata_drop_rcu(nd)) - return -ECHILD; - goto exec_again; - } - } else { -exec_again: - err = exec_permission(nd->inode, 0); - } + + err = may_lookup(nd); if (err) break; @@ -1378,52 +1430,43 @@ exec_again: this.len = name - (const char *) this.name; this.hash = end_name_hash(hash); + type = LAST_NORM; + if (this.name[0] == '.') switch (this.len) { + case 2: + if (this.name[1] == '.') { + type = LAST_DOTDOT; + nd->flags |= LOOKUP_JUMPED; + } + break; + case 1: + type = LAST_DOT; + } + if (likely(type == LAST_NORM)) { + struct dentry *parent = nd->path.dentry; + nd->flags &= ~LOOKUP_JUMPED; + if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + err = parent->d_op->d_hash(parent, nd->inode, + &this); + if (err < 0) + break; + } + } + /* remove trailing slashes? */ if (!c) goto last_component; while (*++name == '/'); if (!*name) - goto last_with_slashes; + goto last_component; - /* - * "." and ".." are special - ".." especially so because it has - * to be able to know about the current root directory and - * parent relationships. - */ - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - continue; - } - /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - err = -ENOENT; - if (!inode) - goto out_dput; + err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW); + if (err < 0) + return err; - if (inode->i_op->follow_link) { - err = do_follow_link(inode, &next, nd); + if (err) { + err = nested_symlink(&next, nd); if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - err = -ENOENT; - if (!nd->inode) - break; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; + return err; } err = -ENOTDIR; if (!nd->inode->i_op->lookup) @@ -1431,209 +1474,109 @@ exec_again: continue; /* here ends the main loop */ -last_with_slashes: - lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: /* Clear LOOKUP_CONTINUE iff it was previously unset */ nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; - if (lookup_flags & LOOKUP_PARENT) - goto lookup_parent; - if (this.name[0] == '.') switch (this.len) { - default: - break; - case 2: - if (this.name[1] != '.') - break; - if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; - } else - follow_dotdot(nd); - /* fallthrough */ - case 1: - goto return_reval; - } - err = do_lookup(nd, &this, &next, &inode); - if (err) - break; - if (inode && unlikely(inode->i_op->follow_link) && - (lookup_flags & LOOKUP_FOLLOW)) { - err = do_follow_link(inode, &next, nd); - if (err) - goto return_err; - nd->inode = nd->path.dentry->d_inode; - } else { - path_to_nameidata(&next, nd); - nd->inode = inode; - } - err = -ENOENT; - if (!nd->inode) - break; - if (lookup_flags & LOOKUP_DIRECTORY) { - err = -ENOTDIR; - if (!nd->inode->i_op->lookup) - break; - } - goto return_base; -lookup_parent: nd->last = this; - nd->last_type = LAST_NORM; - if (this.name[0] != '.') - goto return_base; - if (this.len == 1) - nd->last_type = LAST_DOT; - else if (this.len == 2 && this.name[1] == '.') - nd->last_type = LAST_DOTDOT; - else - goto return_base; -return_reval: - /* - * We bypassed the ordinary revalidation routines. - * We may need to check the cached dentry for staleness. - */ - if (need_reval_dot(nd->path.dentry)) { - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; - /* Note: we do not d_invalidate() */ - err = d_revalidate(nd->path.dentry, nd); - if (!err) - err = -ESTALE; - if (err < 0) - break; - return 0; - } -return_base: - if (nameidata_drop_rcu_last_maybe(nd)) - return -ECHILD; + nd->last_type = type; return 0; -out_dput: - if (!(nd->flags & LOOKUP_RCU)) - path_put_conditional(&next, nd); - break; } - if (!(nd->flags & LOOKUP_RCU)) - path_put(&nd->path); -return_err: + terminate_walk(nd); return err; } -static inline int path_walk_rcu(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static inline int path_walk_simple(const char *name, struct nameidata *nd) -{ - current->total_link_count = 0; - - return link_path_walk(name, nd); -} - -static int path_walk(const char *name, struct nameidata *nd) -{ - struct path save = nd->path; - int result; - - current->total_link_count = 0; - - /* make sure the stuff we saved doesn't go away */ - path_get(&save); - - result = link_path_walk(name, nd); - if (result == -ESTALE) { - /* nd->path had been dropped */ - current->total_link_count = 0; - nd->path = save; - path_get(&nd->path); - nd->flags |= LOOKUP_REVAL; - result = link_path_walk(name, nd); - } - - path_put(&save); - - return result; -} - -static void path_finish_rcu(struct nameidata *nd) -{ - if (nd->flags & LOOKUP_RCU) { - /* RCU dangling. Cancel it. */ - nd->flags &= ~LOOKUP_RCU; - nd->root.mnt = NULL; - rcu_read_unlock(); - br_read_unlock(vfsmount_lock); - } - if (nd->file) - fput(nd->file); -} - -static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static int path_init(int dfd, const char *name, unsigned int flags, + struct nameidata *nd, struct file **fp) { int retval = 0; int fput_needed; struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags | LOOKUP_RCU; + nd->flags = flags | LOOKUP_JUMPED; nd->depth = 0; + if (flags & LOOKUP_ROOT) { + struct inode *inode = nd->root.dentry->d_inode; + if (*name) { + if (!inode->i_op->lookup) + return -ENOTDIR; + retval = inode_permission(inode, MAY_EXEC); + if (retval) + return retval; + } + nd->path = nd->root; + nd->inode = inode; + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } else { + path_get(&nd->path); + } + return 0; + } + nd->root.mnt = NULL; - nd->file = NULL; if (*name=='/') { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); - - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->path = nd->root; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - + if (flags & LOOKUP_RCU) { + br_read_lock(vfsmount_lock); + rcu_read_lock(); + set_root_rcu(nd); + } else { + set_root(nd); + path_get(&nd->root); + } + nd->path = nd->root; } else if (dfd == AT_FDCWD) { - struct fs_struct *fs = current->fs; - unsigned seq; - - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + struct fs_struct *fs = current->fs; + unsigned seq; - do { - seq = read_seqcount_begin(&fs->seq); - nd->path = fs->pwd; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + do { + seq = read_seqcount_begin(&fs->seq); + nd->path = fs->pwd; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_pwd(current->fs, &nd->path); + } } else { struct dentry *dentry; - file = fget_light(dfd, &fput_needed); + file = fget_raw_light(dfd, &fput_needed); retval = -EBADF; if (!file) goto out_fail; dentry = file->f_path.dentry; - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; + if (*name) { + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_fail; - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; + retval = file_permission(file, MAY_EXEC); + if (retval) + goto fput_fail; + } nd->path = file->f_path; - if (fput_needed) - nd->file = file; - - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); - br_read_lock(vfsmount_lock); - rcu_read_lock(); + if (flags & LOOKUP_RCU) { + if (fput_needed) + *fp = file; + nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + br_read_lock(vfsmount_lock); + rcu_read_lock(); + } else { + path_get(&file->f_path); + fput_light(file, fput_needed); + } } + nd->inode = nd->path.dentry->d_inode; return 0; @@ -1643,60 +1586,23 @@ out_fail: return retval; } -static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) +static inline int lookup_last(struct nameidata *nd, struct path *path) { - int retval = 0; - int fput_needed; - struct file *file; + if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - nd->last_type = LAST_ROOT; /* if there are only slashes... */ - nd->flags = flags; - nd->depth = 0; - nd->root.mnt = NULL; - - if (*name=='/') { - set_root(nd); - nd->path = nd->root; - path_get(&nd->root); - } else if (dfd == AT_FDCWD) { - get_fs_pwd(current->fs, &nd->path); - } else { - struct dentry *dentry; - - file = fget_light(dfd, &fput_needed); - retval = -EBADF; - if (!file) - goto out_fail; - - dentry = file->f_path.dentry; - - retval = -ENOTDIR; - if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_fail; - - retval = file_permission(file, MAY_EXEC); - if (retval) - goto fput_fail; - - nd->path = file->f_path; - path_get(&file->f_path); - - fput_light(file, fput_needed); - } - nd->inode = nd->path.dentry->d_inode; - return 0; - -fput_fail: - fput_light(file, fput_needed); -out_fail: - return retval; + nd->flags &= ~LOOKUP_PARENT; + return walk_component(nd, path, &nd->last, nd->last_type, + nd->flags & LOOKUP_FOLLOW); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int do_path_lookup(int dfd, const char *name, +static int path_lookupat(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; + struct file *base = NULL; + struct path path; + int err; /* * Path walking is largely split up into 2 different synchronisation @@ -1712,44 +1618,75 @@ static int do_path_lookup(int dfd, const char *name, * be handled by restarting a traditional ref-walk (which will always * be able to complete). */ - retval = path_init_rcu(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk_rcu(name, nd); - path_finish_rcu(nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); + + if (unlikely(err)) + return err; + + current->total_link_count = 0; + err = link_path_walk(name, nd); + + if (!err && !(flags & LOOKUP_PARENT)) { + err = lookup_last(nd, &path); + while (err > 0) { + void *cookie; + struct path link = path; + nd->flags |= LOOKUP_PARENT; + err = follow_link(&link, nd, &cookie); + if (!err) + err = lookup_last(nd, &path); + put_link(nd, &link, cookie); + } } - if (unlikely(retval == -ECHILD || retval == -ESTALE)) { - /* slower, locked walk */ - if (retval == -ESTALE) - flags |= LOOKUP_REVAL; - retval = path_init(dfd, name, flags, nd); - if (unlikely(retval)) - return retval; - retval = path_walk(name, nd); - if (nd->root.mnt) { - path_put(&nd->root); - nd->root.mnt = NULL; + if (nd->flags & LOOKUP_RCU) { + /* went all way through without dropping RCU */ + BUG_ON(err); + if (nameidata_drop_rcu_last(nd)) + err = -ECHILD; + } + + if (!err) + err = handle_reval_path(nd); + + if (!err && nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) { + path_put(&nd->path); + return -ENOTDIR; } } + if (base) + fput(base); + + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + return err; +} + +static int do_path_lookup(int dfd, const char *name, + unsigned int flags, struct nameidata *nd) +{ + int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + if (unlikely(retval == -ECHILD)) + retval = path_lookupat(dfd, name, flags, nd); + if (unlikely(retval == -ESTALE)) + retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + if (likely(!retval)) { if (unlikely(!audit_dummy_context())) { if (nd->path.dentry && nd->inode) audit_inode(name, nd->path.dentry); } } - return retval; } -int path_lookup(const char *name, unsigned int flags, - struct nameidata *nd) +int kern_path_parent(const char *name, struct nameidata *nd) { - return do_path_lookup(AT_FDCWD, name, flags, nd); + return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd); } int kern_path(const char *name, unsigned int flags, struct path *path) @@ -1773,29 +1710,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct nameidata *nd) { - int retval; - - /* same as do_path_lookup */ - nd->last_type = LAST_ROOT; - nd->flags = flags; - nd->depth = 0; - - nd->path.dentry = dentry; - nd->path.mnt = mnt; - path_get(&nd->path); - nd->root = nd->path; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - - retval = path_walk(name, nd); - if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry && - nd->inode)) - audit_inode(name, nd->path.dentry); - - path_put(&nd->root); - nd->root.mnt = NULL; - - return retval; + nd->root.dentry = dentry; + nd->root.mnt = mnt; + /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ + return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd); } static struct dentry *__lookup_hash(struct qstr *name, @@ -1810,17 +1728,6 @@ static struct dentry *__lookup_hash(struct qstr *name, return ERR_PTR(err); /* - * See if the low-level filesystem might want - * to use its own hash.. - */ - if (base->d_flags & DCACHE_OP_HASH) { - err = base->d_op->d_hash(base, inode, name); - dentry = ERR_PTR(err); - if (err < 0) - goto out; - } - - /* * Don't bother with __d_lookup: callers are for creat as * well as unlink, so a lot of the time it would cost * a double lookup. @@ -1832,7 +1739,7 @@ static struct dentry *__lookup_hash(struct qstr *name, if (!dentry) dentry = d_alloc_and_lookup(base, name, nd); -out: + return dentry; } @@ -1846,28 +1753,6 @@ static struct dentry *lookup_hash(struct nameidata *nd) return __lookup_hash(&nd->last, nd->path.dentry, nd); } -static int __lookup_one_len(const char *name, struct qstr *this, - struct dentry *base, int len) -{ - unsigned long hash; - unsigned int c; - - this->name = name; - this->len = len; - if (!len) - return -EACCES; - - hash = init_name_hash(); - while (len--) { - c = *(const unsigned char *)name++; - if (c == '/' || c == '\0') - return -EACCES; - hash = partial_name_hash(c, hash); - } - this->hash = end_name_hash(hash); - return 0; -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -1881,14 +1766,34 @@ static int __lookup_one_len(const char *name, struct qstr *this, */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { - int err; struct qstr this; + unsigned long hash; + unsigned int c; WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); - err = __lookup_one_len(name, &this, base, len); - if (err) - return ERR_PTR(err); + this.name = name; + this.len = len; + if (!len) + return ERR_PTR(-EACCES); + + hash = init_name_hash(); + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + hash = partial_name_hash(c, hash); + } + this.hash = end_name_hash(hash); + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, base->d_inode, &this); + if (err < 0) + return ERR_PTR(err); + } return __lookup_hash(&this, base, NULL); } @@ -1897,7 +1802,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct nameidata nd; - char *tmp = getname(name); + char *tmp = getname_flags(name, flags); int err = PTR_ERR(tmp); if (!IS_ERR(tmp)) { @@ -2077,12 +1982,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, return error; } -int may_open(struct path *path, int acc_mode, int flag) +static int may_open(struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; + /* O_PATH? */ + if (!acc_mode) + return 0; + if (!inode) return -ENOENT; @@ -2151,34 +2060,6 @@ static int handle_truncate(struct file *filp) } /* - * Be careful about ever adding any more callers of this - * function. Its flags must be in the namei format, not - * what get passed to sys_open(). - */ -static int __open_namei_create(struct nameidata *nd, struct path *path, - int open_flag, int mode) -{ - int error; - struct dentry *dir = nd->path.dentry; - - if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current_umask(); - error = security_path_mknod(&nd->path, path->dentry, mode, 0); - if (error) - goto out_unlock; - error = vfs_create(dir->d_inode, path->dentry, mode, nd); -out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); - dput(nd->path.dentry); - nd->path.dentry = path->dentry; - - if (error) - return error; - /* Don't check for write permission, don't truncate */ - return may_open(&nd->path, 0, open_flag & ~O_TRUNC); -} - -/* * Note that while the flag value (low two bits) for sys_open means: * 00 - read-only * 01 - write-only @@ -2202,126 +2083,115 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int open_will_truncate(int flag, struct inode *inode) -{ - /* - * We'll never write to the fs underlying - * a device file. - */ - if (special_file(inode->i_mode)) - return 0; - return (flag & O_TRUNC); -} - -static struct file *finish_open(struct nameidata *nd, - int open_flag, int acc_mode) -{ - struct file *filp; - int will_truncate; - int error; - - will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode); - if (will_truncate) { - error = mnt_want_write(nd->path.mnt); - if (error) - goto exit; - } - error = may_open(&nd->path, acc_mode, open_flag); - if (error) { - if (will_truncate) - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - if (!IS_ERR(filp)) { - if (will_truncate) { - error = handle_truncate(filp); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - } - /* - * It is now safe to drop the mnt write - * because the filp has had a write taken - * on its behalf. - */ - if (will_truncate) - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - return filp; - -exit: - path_put(&nd->path); - return ERR_PTR(error); -} - /* - * Handle O_CREAT case for do_filp_open + * Handle the last step of open() */ static struct file *do_last(struct nameidata *nd, struct path *path, - int open_flag, int acc_mode, - int mode, const char *pathname) + const struct open_flags *op, const char *pathname) { struct dentry *dir = nd->path.dentry; + struct dentry *dentry; + int open_flag = op->open_flag; + int will_truncate = open_flag & O_TRUNC; + int want_write = 0; + int acc_mode = op->acc_mode; struct file *filp; - int error = -EISDIR; + int error; + + nd->flags &= ~LOOKUP_PARENT; + nd->flags |= op->intent; switch (nd->last_type) { case LAST_DOTDOT: - follow_dotdot(nd); - dir = nd->path.dentry; case LAST_DOT: - if (need_reval_dot(dir)) { - int status = d_revalidate(nd->path.dentry, nd); - if (!status) - status = -ESTALE; - if (status < 0) { - error = status; - goto exit; - } - } + error = handle_dots(nd, nd->last_type); + if (error) + return ERR_PTR(error); /* fallthrough */ case LAST_ROOT: - goto exit; + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + error = handle_reval_path(nd); + if (error) + goto exit; + audit_inode(pathname, nd->path.dentry); + if (open_flag & O_CREAT) { + error = -EISDIR; + goto exit; + } + goto ok; case LAST_BIND: + /* can't be RCU mode here */ + error = handle_reval_path(nd); + if (error) + goto exit; audit_inode(pathname, dir); goto ok; } + if (!(open_flag & O_CREAT)) { + int symlink_ok = 0; + if (nd->last.name[nd->last.len]) + nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; + if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) + symlink_ok = 1; + /* we _can_ be in RCU mode here */ + error = walk_component(nd, path, &nd->last, LAST_NORM, + !symlink_ok); + if (error < 0) + return ERR_PTR(error); + if (error) /* symlink */ + return NULL; + /* sayonara */ + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + error = -ENOTDIR; + if (nd->flags & LOOKUP_DIRECTORY) { + if (!nd->inode->i_op->lookup) + goto exit; + } + audit_inode(pathname, nd->path.dentry); + goto ok; + } + + /* create side of things */ + + if (nd->flags & LOOKUP_RCU) { + if (nameidata_drop_rcu_last(nd)) + return ERR_PTR(-ECHILD); + } + + audit_inode(pathname, dir); + error = -EISDIR; /* trailing slashes? */ if (nd->last.name[nd->last.len]) goto exit; mutex_lock(&dir->d_inode->i_mutex); - path->dentry = lookup_hash(nd); - path->mnt = nd->path.mnt; - - error = PTR_ERR(path->dentry); - if (IS_ERR(path->dentry)) { + dentry = lookup_hash(nd); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); goto exit; } - if (IS_ERR(nd->intent.open.file)) { - error = PTR_ERR(nd->intent.open.file); - goto exit_mutex_unlock; - } + path->dentry = dentry; + path->mnt = nd->path.mnt; /* Negative dentry, just create the file */ - if (!path->dentry->d_inode) { + if (!dentry->d_inode) { + int mode = op->mode; + if (!IS_POSIXACL(dir->d_inode)) + mode &= ~current_umask(); /* * This write is needed to ensure that a - * ro->rw transition does not occur between + * rw->ro transition does not occur between * the time when the file is created and when * a permanent write count is taken through * the 'struct file' in nameidata_to_filp(). @@ -2329,22 +2199,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path, error = mnt_want_write(nd->path.mnt); if (error) goto exit_mutex_unlock; - error = __open_namei_create(nd, path, open_flag, mode); - if (error) { - mnt_drop_write(nd->path.mnt); - goto exit; - } - filp = nameidata_to_filp(nd); - mnt_drop_write(nd->path.mnt); - path_put(&nd->path); - if (!IS_ERR(filp)) { - error = ima_file_check(filp, acc_mode); - if (error) { - fput(filp); - filp = ERR_PTR(error); - } - } - return filp; + want_write = 1; + /* Don't check for write permission, don't truncate */ + open_flag &= ~O_TRUNC; + will_truncate = 0; + acc_mode = MAY_OPEN; + error = security_path_mknod(&nd->path, dentry, mode, 0); + if (error) + goto exit_mutex_unlock; + error = vfs_create(dir->d_inode, dentry, mode, nd); + if (error) + goto exit_mutex_unlock; + mutex_unlock(&dir->d_inode->i_mutex); + dput(nd->path.dentry); + nd->path.dentry = dentry; + goto common; } /* @@ -2374,7 +2243,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path, if (S_ISDIR(nd->inode->i_mode)) goto exit; ok: - filp = finish_open(nd, open_flag, acc_mode); + if (!S_ISREG(nd->inode->i_mode)) + will_truncate = 0; + + if (will_truncate) { + error = mnt_want_write(nd->path.mnt); + if (error) + goto exit; + want_write = 1; + } +common: + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto exit; + filp = nameidata_to_filp(nd); + if (!IS_ERR(filp)) { + error = ima_file_check(filp, op->acc_mode); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + if (!IS_ERR(filp)) { + if (will_truncate) { + error = handle_truncate(filp); + if (error) { + fput(filp); + filp = ERR_PTR(error); + } + } + } +out: + if (want_write) + mnt_drop_write(nd->path.mnt); + path_put(&nd->path); return filp; exit_mutex_unlock: @@ -2382,197 +2284,103 @@ exit_mutex_unlock: exit_dput: path_put_conditional(path, nd); exit: - path_put(&nd->path); - return ERR_PTR(error); + filp = ERR_PTR(error); + goto out; } -/* - * Note that the low bits of the passed in "open_flag" - * are not the same as in the local variable "flag". See - * open_to_namei_flags() for more details. - */ -struct file *do_filp_open(int dfd, const char *pathname, - int open_flag, int mode, int acc_mode) +static struct file *path_openat(int dfd, const char *pathname, + struct nameidata *nd, const struct open_flags *op, int flags) { + struct file *base = NULL; struct file *filp; - struct nameidata nd; - int error; struct path path; - int count = 0; - int flag = open_to_namei_flags(open_flag); - int flags; - - if (!(open_flag & O_CREAT)) - mode = 0; - - /* Must never be set by userspace */ - open_flag &= ~FMODE_NONOTIFY; - - /* - * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only - * check for O_DSYNC if the need any syncing at all we enforce it's - * always set instead of having to deal with possibly weird behaviour - * for malicious applications setting only __O_SYNC. - */ - if (open_flag & __O_SYNC) - open_flag |= O_DSYNC; - - if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(open_flag); - - /* O_TRUNC implies we need access checks for write permissions */ - if (open_flag & O_TRUNC) - acc_mode |= MAY_WRITE; - - /* Allow the LSM permission hook to distinguish append - access from general write access. */ - if (open_flag & O_APPEND) - acc_mode |= MAY_APPEND; - - flags = LOOKUP_OPEN; - if (open_flag & O_CREAT) { - flags |= LOOKUP_CREATE; - if (open_flag & O_EXCL) - flags |= LOOKUP_EXCL; - } - if (open_flag & O_DIRECTORY) - flags |= LOOKUP_DIRECTORY; - if (!(open_flag & O_NOFOLLOW)) - flags |= LOOKUP_FOLLOW; + int error; filp = get_empty_filp(); if (!filp) return ERR_PTR(-ENFILE); - filp->f_flags = open_flag; - nd.intent.open.file = filp; - nd.intent.open.flags = flag; - nd.intent.open.create_mode = mode; - - if (open_flag & O_CREAT) - goto creat; + filp->f_flags = op->open_flag; + nd->intent.open.file = filp; + nd->intent.open.flags = open_to_namei_flags(op->open_flag); + nd->intent.open.create_mode = op->mode; - /* !O_CREAT, simple open */ - error = do_path_lookup(dfd, pathname, flags, &nd); + error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base); if (unlikely(error)) goto out_filp; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) { - if (nd.inode->i_op->follow_link) - goto out_path; - } - error = -ENOTDIR; - if (nd.flags & LOOKUP_DIRECTORY) { - if (!nd.inode->i_op->lookup) - goto out_path; - } - audit_inode(pathname, nd.path.dentry); - filp = finish_open(&nd, open_flag, acc_mode); - release_open_intent(&nd); - return filp; -creat: - /* OK, have to create the file. Find the parent. */ - error = path_init_rcu(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - error = path_walk_rcu(pathname, &nd); - path_finish_rcu(&nd); - if (unlikely(error == -ECHILD || error == -ESTALE)) { - /* slower, locked walk */ - if (error == -ESTALE) { -reval: - flags |= LOOKUP_REVAL; - } - error = path_init(dfd, pathname, - LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd); - if (error) - goto out_filp; - - error = path_walk_simple(pathname, &nd); - } + current->total_link_count = 0; + error = link_path_walk(pathname, nd); if (unlikely(error)) goto out_filp; - if (unlikely(!audit_dummy_context())) - audit_inode(pathname, nd.path.dentry); - /* - * We have the parent and last component. - */ - nd.flags = flags; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); + filp = do_last(nd, &path, op, pathname); while (unlikely(!filp)) { /* trailing symlink */ struct path link = path; - struct inode *linki = link.dentry->d_inode; void *cookie; - error = -ELOOP; - if (!(nd.flags & LOOKUP_FOLLOW)) - goto exit_dput; - if (count++ == 32) - goto exit_dput; - /* - * This is subtle. Instead of calling do_follow_link() we do - * the thing by hands. The reason is that this way we have zero - * link_count and path_walk() (called from ->follow_link) - * honoring LOOKUP_PARENT. After that we have the parent and - * last component, i.e. we are in the same situation as after - * the first path_walk(). Well, almost - if the last component - * is normal we get its copy stored in nd->last.name and we will - * have to putname() it when we are done. Procfs-like symlinks - * just set LAST_BIND. - */ - nd.flags |= LOOKUP_PARENT; - error = security_inode_follow_link(link.dentry, &nd); - if (error) - goto exit_dput; - error = __do_follow_link(&link, &nd, &cookie); - if (unlikely(error)) { - if (!IS_ERR(cookie) && linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - /* nd.path had been dropped */ - nd.path = link; - goto out_path; + if (!(nd->flags & LOOKUP_FOLLOW)) { + path_put_conditional(&path, nd); + path_put(&nd->path); + filp = ERR_PTR(-ELOOP); + break; } - nd.flags &= ~LOOKUP_PARENT; - filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); - if (linki->i_op->put_link) - linki->i_op->put_link(link.dentry, &nd, cookie); - path_put(&link); + nd->flags |= LOOKUP_PARENT; + nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); + error = follow_link(&link, nd, &cookie); + if (unlikely(error)) + filp = ERR_PTR(error); + else + filp = do_last(nd, &path, op, pathname); + put_link(nd, &link, cookie); } out: - if (nd.root.mnt) - path_put(&nd.root); - if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) - goto reval; - release_open_intent(&nd); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) + path_put(&nd->root); + if (base) + fput(base); + release_open_intent(nd); return filp; -exit_dput: - path_put_conditional(&path, &nd); -out_path: - path_put(&nd.path); out_filp: filp = ERR_PTR(error); goto out; } -/** - * filp_open - open file and return file pointer - * - * @filename: path to open - * @flags: open flags as per the open(2) second argument - * @mode: mode for the new file if O_CREAT is set, else ignored - * - * This is the helper to open a file from kernelspace if you really - * have to. But in generally you should not do this, so please move - * along, nothing to see here.. - */ -struct file *filp_open(const char *filename, int flags, int mode) +struct file *do_filp_open(int dfd, const char *pathname, + const struct open_flags *op, int flags) +{ + struct nameidata nd; + struct file *filp; + + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + if (unlikely(filp == ERR_PTR(-ECHILD))) + filp = path_openat(dfd, pathname, &nd, op, flags); + if (unlikely(filp == ERR_PTR(-ESTALE))) + filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + return filp; +} + +struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *name, const struct open_flags *op, int flags) { - return do_filp_open(AT_FDCWD, filename, flags, mode, 0); + struct nameidata nd; + struct file *file; + + nd.root.mnt = mnt; + nd.root.dentry = dentry; + + flags |= LOOKUP_ROOT; + + if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN) + return ERR_PTR(-ELOOP); + + file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU); + if (unlikely(file == ERR_PTR(-ECHILD))) + file = path_openat(-1, name, &nd, op, flags); + if (unlikely(file == ERR_PTR(-ESTALE))) + file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL); + return file; } -EXPORT_SYMBOL(filp_open); /** * lookup_create - lookup a dentry, creating it if it doesn't exist @@ -3111,7 +2919,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de return error; mutex_lock(&inode->i_mutex); - error = dir->i_op->link(old_dentry, dir, new_dentry); + /* Make sure we don't allow creating hardlink to an unlinked file */ + if (inode->i_nlink == 0) + error = -ENOENT; + else + error = dir->i_op->link(old_dentry, dir, new_dentry); mutex_unlock(&inode->i_mutex); if (!error) fsnotify_link(dir, inode, new_dentry); @@ -3133,15 +2945,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, struct dentry *new_dentry; struct nameidata nd; struct path old_path; + int how = 0; int error; char *to; - if ((flags & ~AT_SYMLINK_FOLLOW) != 0) + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; + /* + * To use null names we require CAP_DAC_READ_SEARCH + * This ensures that not everyone will be able to create + * handlink using the passed filedescriptor. + */ + if (flags & AT_EMPTY_PATH) { + if (!capable(CAP_DAC_READ_SEARCH)) + return -ENOENT; + how = LOOKUP_EMPTY; + } + + if (flags & AT_SYMLINK_FOLLOW) + how |= LOOKUP_FOLLOW; - error = user_path_at(olddfd, oldname, - flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0, - &old_path); + error = user_path_at(olddfd, oldname, how, &old_path); if (error) return error; @@ -3578,7 +3402,7 @@ EXPORT_SYMBOL(page_readlink); EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(path_lookup); +EXPORT_SYMBOL(kern_path_parent); EXPORT_SYMBOL(kern_path); EXPORT_SYMBOL(vfs_path_lookup); EXPORT_SYMBOL(inode_permission); diff --git a/fs/namespace.c b/fs/namespace.c index 7b0b953..dffe6f4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = { .show = show_vfsmnt }; +static int uuid_is_nil(u8 *uuid) +{ + int i; + u8 *cp = (u8 *)uuid; + + for (i = 0; i < 16; i++) { + if (*cp++) + return 0; + } + return 1; +} + static int show_mountinfo(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; @@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v) if (IS_MNT_UNBINDABLE(mnt)) seq_puts(m, " unbindable"); + if (!uuid_is_nil(mnt->mnt_sb->s_uuid)) + /* print the uuid */ + seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid); + /* Filesystem specific data */ seq_puts(m, " - "); show_type(m, sb); @@ -1244,7 +1260,7 @@ static int do_umount(struct vfsmount *mnt, int flags) */ br_write_lock(vfsmount_lock); if (mnt_get_count(mnt) != 2) { - br_write_lock(vfsmount_lock); + br_write_unlock(vfsmount_lock); return -EBUSY; } br_write_unlock(vfsmount_lock); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1cc600e..2f8e618 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -37,6 +37,7 @@ #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> +#include <linux/compat.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word) */ u64 nfs_compat_user_ino64(u64 fileid) { - int ino; +#ifdef CONFIG_COMPAT + compat_ulong_t ino; +#else + unsigned long ino; +#endif if (enable_ino64) return fileid; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7a74740..1be36cf 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -298,6 +298,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp); #if defined(CONFIG_NFS_V4_1) struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); +extern void nfs4_schedule_session_recovery(struct nfs4_session *); +#else +static inline void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ +} #endif /* CONFIG_NFS_V4_1 */ extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); @@ -307,10 +312,9 @@ extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); -extern void nfs4_schedule_state_recovery(struct nfs_client *); +extern void nfs4_schedule_lease_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); -extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); -extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); +extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c index f5c9b12..b73c343 100644 --- a/fs/nfs/nfs4filelayoutdev.c +++ b/fs/nfs/nfs4filelayoutdev.c @@ -219,6 +219,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode) goto out_err; } buf = kmalloc(rlen + 1, GFP_KERNEL); + if (!buf) { + dprintk("%s: Not enough memory\n", __func__); + goto out_err; + } buf[rlen] = '\0'; memcpy(buf, r_addr, rlen); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 78936a8..0a07e35 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -256,12 +256,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -272,7 +273,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR: %d Reset session\n", __func__, errorcode); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); exception->retry = 1; break; #endif /* defined(CONFIG_NFS_V4_1) */ @@ -295,8 +296,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } /* We failed to handle the error */ return nfs4_map_errors(ret); -do_state_recovery: - nfs4_schedule_state_recovery(clp); +wait_on_recovery: ret = nfs4_wait_clnt_recover(clp); if (ret == 0) exception->retry = 1; @@ -435,8 +435,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res * clp = res->sr_session->clp; do_renew_lease(clp, timestamp); /* Check sequence flags */ - if (atomic_read(&clp->cl_count) > 1) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (res->sr_status_flags != 0) + nfs4_schedule_lease_recovery(clp); break; case -NFS4ERR_DELAY: /* The server detected a resend of the RPC call and @@ -1255,14 +1255,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery( - server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_lease_recovery(server->nfs_client); goto out; case -ERESTARTSYS: /* @@ -1271,7 +1270,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); case -EKEYEXPIRED: /* * User RPCSEC_GSS context has expired. @@ -1587,7 +1586,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server) if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); ret = -EIO; } return ret; @@ -3178,7 +3177,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) if (task->tk_status < 0) { /* Unless we're shutting down, schedule state recovery! */ if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); return; } do_renew_lease(clp, timestamp); @@ -3252,6 +3251,35 @@ static void buf_to_pages(const void *buf, size_t buflen, } } +static int buf_to_pages_noslab(const void *buf, size_t buflen, + struct page **pages, unsigned int *pgbase) +{ + struct page *newpage, **spages; + int rc = 0; + size_t len; + spages = pages; + + do { + len = min_t(size_t, PAGE_CACHE_SIZE, buflen); + newpage = alloc_page(GFP_KERNEL); + + if (newpage == NULL) + goto unwind; + memcpy(page_address(newpage), buf, len); + buf += len; + buflen -= len; + *pages++ = newpage; + rc++; + } while (buflen != 0); + + return rc; + +unwind: + for(; rc > 0; rc--) + __free_page(spages[rc-1]); + return -ENOMEM; +} + struct nfs4_cached_acl { int cached; size_t len; @@ -3420,13 +3448,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl .rpc_argp = &arg, .rpc_resp = &res, }; - int ret; + int ret, i; if (!nfs4_server_supports_acls(server)) return -EOPNOTSUPP; + i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase); + if (i < 0) + return i; nfs_inode_return_delegation(inode); - buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); ret = nfs4_call_sync(server, &msg, &arg, &res, 1); + + /* + * Free each page after tx, so the only ref left is + * held by the network stack + */ + for (; i > 0; i--) + put_page(pages[i-1]); + /* * Acl update can result in inode attribute update. * so mark the attribute cache invalid. @@ -3464,12 +3502,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_OPENMODE: if (state == NULL) break; - nfs4_state_mark_reclaim_nograce(clp, state); - goto do_state_recovery; + nfs4_schedule_stateid_recovery(server, state); + goto wait_on_recovery; case -NFS4ERR_STALE_STATEID: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - goto do_state_recovery; + nfs4_schedule_lease_recovery(clp); + goto wait_on_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -3480,7 +3519,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, case -NFS4ERR_SEQ_MISORDERED: dprintk("%s ERROR %d, Reset session\n", __func__, task->tk_status); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_session_recovery(clp->cl_session); task->tk_status = 0; return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ @@ -3497,9 +3536,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; -do_state_recovery: +wait_on_recovery: rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; @@ -4110,7 +4148,7 @@ static void nfs4_lock_release(void *calldata) task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, data->arg.lock_seqid); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); dprintk("%s: cancelling lock!\n", __func__); } else nfs_free_seqid(data->arg.lock_seqid); @@ -4134,23 +4172,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = { static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) { - struct nfs_client *clp = server->nfs_client; - struct nfs4_state *state = lsp->ls_state; - switch (error) { case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; if (new_lock_owner != 0 || (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_nograce(clp, state); - lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + nfs4_schedule_stateid_recovery(server, lsp->ls_state); break; case -NFS4ERR_STALE_STATEID: - if (new_lock_owner != 0 || - (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) - nfs4_state_mark_reclaim_reboot(clp, state); lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + case -NFS4ERR_EXPIRED: + nfs4_schedule_lease_recovery(server->nfs_client); }; } @@ -4366,12 +4399,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_EXPIRED: case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + nfs4_schedule_lease_recovery(server->nfs_client); + goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: - nfs4_schedule_state_recovery(server->nfs_client); + nfs4_schedule_session_recovery(server->nfs_client->cl_session); goto out; case -ERESTARTSYS: /* @@ -4381,7 +4416,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_OPENMODE: - nfs4_state_mark_reclaim_nograce(server->nfs_client, state); + nfs4_schedule_stateid_recovery(server, state); err = 0; goto out; case -EKEYEXPIRED: @@ -4988,10 +5023,20 @@ int nfs4_proc_create_session(struct nfs_client *clp) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + long timeout = 0; + int err; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); - status = _nfs4_proc_create_session(clp); + do { + status = _nfs4_proc_create_session(clp); + if (status == -NFS4ERR_DELAY) { + err = nfs4_delay(clp->cl_rpcclient, &timeout); + if (err) + status = err; + } + } while (status == -NFS4ERR_DELAY); + if (status) goto out; @@ -5100,7 +5145,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5187,7 +5232,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if (IS_ERR(task)) ret = PTR_ERR(task); else - rpc_put_task(task); + rpc_put_task_async(task); dprintk("<-- %s status=%d\n", __func__, ret); return ret; } @@ -5203,8 +5248,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) + if (!ret) { + struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; + + if (task->tk_status == 0) + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); ret = task->tk_status; + } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -5241,7 +5291,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf rpc_delay(task, NFS4_POLL_RETRY_MAX); return -EAGAIN; default: - nfs4_schedule_state_recovery(clp); + nfs4_schedule_lease_recovery(clp); } return 0; } @@ -5309,6 +5359,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) status = PTR_ERR(task); goto out; } + status = nfs4_wait_for_completion_rpc_task(task); + if (status == 0) + status = task->tk_status; rpc_put_task(task); return 0; out: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e6742b5..0592288 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1007,9 +1007,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) } /* - * Schedule a state recovery attempt + * Schedule a lease recovery attempt */ -void nfs4_schedule_state_recovery(struct nfs_client *clp) +void nfs4_schedule_lease_recovery(struct nfs_client *clp) { if (!clp) return; @@ -1018,7 +1018,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1032,7 +1032,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st return 1; } -int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) +static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); @@ -1041,6 +1041,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s return 1; } +void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state) +{ + struct nfs_client *clp = server->nfs_client; + + nfs4_state_mark_reclaim_nograce(clp, state); + nfs4_schedule_state_manager(clp); +} + static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) { struct inode *inode = state->inode; @@ -1436,10 +1444,15 @@ static int nfs4_reclaim_lease(struct nfs_client *clp) } #ifdef CONFIG_NFS_V4_1 +void nfs4_schedule_session_recovery(struct nfs4_session *session) +{ + nfs4_schedule_lease_recovery(session->clp); +} + void nfs41_handle_recall_slot(struct nfs_client *clp) { set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } static void nfs4_reset_all_state(struct nfs_client *clp) @@ -1447,7 +1460,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { clp->cl_boot_time = CURRENT_TIME; nfs4_state_start_reclaim_nograce(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1455,7 +1468,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) { if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { nfs4_state_start_reclaim_reboot(clp); - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } } @@ -1475,7 +1488,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp) { nfs_expire_all_delegations(clp); if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_recovery(clp); + nfs4_schedule_state_manager(clp); } void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 4e2c168..94d50e8 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1660,7 +1660,7 @@ static void encode_create_session(struct xdr_stream *xdr, p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); *p++ = cpu_to_be32(OP_CREATE_SESSION); - p = xdr_encode_hyper(p, clp->cl_ex_clid); + p = xdr_encode_hyper(p, clp->cl_clientid); *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ *p++ = cpu_to_be32(args->flags); /*flags */ @@ -4694,7 +4694,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - xdr_decode_hyper(p, &clp->cl_ex_clid); + xdr_decode_hyper(p, &clp->cl_clientid); p = xdr_inline_decode(xdr, 12); if (unlikely(!p)) goto out_overflow; diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 903908a..c541093 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -86,11 +86,14 @@ /* Default path we try to mount. "%s" gets replaced by our IP address */ #define NFS_ROOT "/tftpboot/%s" +/* Default NFSROOT mount options. */ +#define NFS_DEF_OPTIONS "udp" + /* Parameters passed from the kernel command line */ static char nfs_root_parms[256] __initdata = ""; /* Text-based mount options passed to super.c */ -static char nfs_root_options[256] __initdata = ""; +static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS; /* Address of NFS server */ static __be32 servaddr __initdata = htonl(INADDR_NONE); @@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src, } static int __init root_nfs_cat(char *dest, const char *src, - const size_t destlen) + const size_t destlen) { + size_t len = strlen(dest); + + if (len && dest[len - 1] != ',') + if (strlcat(dest, ",", destlen) > destlen) + return -1; + if (strlcat(dest, src, destlen) > destlen) return -1; return 0; @@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, if (root_nfs_cat(nfs_root_options, incoming, sizeof(nfs_root_options))) return -1; - - /* - * Possibly prepare for more options to be appended - */ - if (nfs_root_options[0] != '\0' && - nfs_root_options[strlen(nfs_root_options)] != ',') - if (root_nfs_cat(nfs_root_options, ",", - sizeof(nfs_root_options))) - return -1; - return 0; } @@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath, */ static int __init root_nfs_data(char *cmdline) { - char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; + char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; int len, retval = -1; char *tmp = NULL; const size_t tmplen = sizeof(nfs_export_path); @@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline) * Append mandatory options for nfsroot so they override * what has come before */ - snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", + snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4", &servaddr); - if (root_nfs_cat(nfs_root_options, addr_option, + if (root_nfs_cat(nfs_root_options, mand_options, sizeof(nfs_root_options))) goto out_optionstoolong; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index e313a51..6481d53 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -180,7 +180,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n task_setup_data.rpc_client = NFS_CLIENT(dir); task = rpc_run_task(&task_setup_data); if (!IS_ERR(task)) - rpc_put_task(task); + rpc_put_task_async(task); return 1; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c8278f4..42b92d7 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } diff --git a/fs/nfsctl.c b/fs/nfsctl.c index bf9cbd2..124e8fc 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -22,30 +22,17 @@ static struct file *do_open(char *name, int flags) { - struct nameidata nd; struct vfsmount *mnt; - int error; + struct file *file; mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); if (IS_ERR(mnt)) return (struct file *)mnt; - error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); - mntput(mnt); /* drop do_kern_mount reference */ - if (error) - return ERR_PTR(error); - - if (flags == O_RDWR) - error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags); - else - error = may_open(&nd.path, MAY_WRITE, flags); + file = file_open_root(mnt->mnt_root, mnt, name, flags); - if (!error) - return dentry_open(nd.path.dentry, nd.path.mnt, flags, - current_cred()); - - path_put(&nd.path); - return ERR_PTR(error); + mntput(mnt); /* drop do_kern_mount reference */ + return file; } static struct { diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index cde36cb..02eb4ed 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr, * If the server returns different values for sessionID, slotID or * sequence number, the server is looney tunes. */ - p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); + p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4); if (unlikely(p == NULL)) goto out_overflow; memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 54b60bf..7b566ec 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2445,15 +2445,16 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags) static struct nfs4_delegation * find_delegation_file(struct nfs4_file *fp, stateid_t *stid) { - struct nfs4_delegation *dp = NULL; + struct nfs4_delegation *dp; spin_lock(&recall_lock); - list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { - if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) - break; - } + list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) + if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) { + spin_unlock(&recall_lock); + return dp; + } spin_unlock(&recall_lock); - return dp; + return NULL; } int share_access_to_flags(u32 share_access) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 1275b86..615f0a9 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, u32 dummy; char *machine_name; - int i; + int i, j; int nr_secflavs; READ_BUF(16); @@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, READ_BUF(4); READ32(dummy); READ_BUF(dummy * 4); - for (i = 0; i < dummy; ++i) + for (j = 0; j < dummy; ++j) READ32(dummy); break; case RPC_AUTH_GSS: diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 388e9e8..85f7baa 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -35,11 +35,6 @@ #include "btnode.h" -void nilfs_btnode_cache_init_once(struct address_space *btnc) -{ - nilfs_mapping_init_once(btnc); -} - static const struct address_space_operations def_btnode_aops = { .sync_page = block_sync_page, }; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 7903749..1b8ebd8 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init_once(struct address_space *); void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 6a0e2a1..a0babd2 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); - nilfs_mapping_init_once(&shadow->frozen_data); + address_space_init_once(&shadow->frozen_data); nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); - nilfs_mapping_init_once(&shadow->frozen_btnodes); + address_space_init_once(&shadow->frozen_btnodes); nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); mi->mi_shadow = shadow; return 0; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 9803427..161791d 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inc_nlink(old_inode); nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); new_inode->i_ctime = CURRENT_TIME; @@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= NILFS_LINK_MAX) goto out_dir; } - inc_nlink(old_inode); err = nilfs_add_link(new_dentry, old_inode); - if (err) { - drop_nlink(old_inode); - nilfs_mark_inode_dirty(old_inode); + if (err) goto out_dir; - } if (dir_de) { inc_nlink(new_dir); nilfs_mark_inode_dirty(new_dir); @@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_inode->i_ctime = CURRENT_TIME; nilfs_delete_entry(old_de, old_page); - drop_nlink(old_inode); if (dir_de) { nilfs_set_link(old_inode, dir_de, dir_page, new_dir); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 0c43241..a585b35 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init_once(struct address_space *mapping) -{ - memset(mapping, 0, sizeof(*mapping)); - INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); - spin_lock_init(&mapping->tree_lock); - INIT_LIST_HEAD(&mapping->private_list); - spin_lock_init(&mapping->private_lock); - - spin_lock_init(&mapping->i_mmap_lock); - INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); -} - void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 622df27..2a00953 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init_once(struct address_space *mapping); void nilfs_mapping_init(struct address_space *mapping, struct backing_dev_info *bdi, const struct address_space_operations *aops); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 55ebae5..2de9f63 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -430,7 +430,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci, nilfs_segctor_map_segsum_entry( sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); - if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) + if (NILFS_I(inode)->i_root && + !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); /* skip finfo */ } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 58fd707..1673b3d 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj) #ifdef CONFIG_NILFS_XATTR init_rwsem(&ii->xattr_sem); #endif - nilfs_btnode_cache_init_once(&ii->i_btnode_cache); + address_space_init_once(&ii->i_btnode_cache); ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 6d80ecc..7eb9040 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, int ret = 0; /* if all else fails, just return false */ struct ocfs2_super *osb; - if (nd->flags & LOOKUP_RCU) + if (nd && nd->flags & LOOKUP_RCU) return -ECHILD; inode = dentry->d_inode; diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 5dbc306..254652a 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len, dentry->d_name.len, dentry->d_name.name, fh, len, connectable); - if (len < 3 || (connectable && len < 6)) { - mlog(ML_ERROR, "fh buffer is too small for encoding\n"); + if (connectable && (len < 6)) { + *max_len = 6; + type = 255; + goto bail; + } else if (len < 3) { + *max_len = 3; type = 255; goto bail; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 43e56b9..6180da1 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb) ocfs2_quota_trans_credits(sb); } -/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + - * bitmap block for the new bit) dx_root update for free list */ -#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) +/* data block for new dir/symlink, allocation of directory block, dx_root + * update for free list */ +#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1) static inline int ocfs2_add_dir_index_credits(struct super_block *sb) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index b5f9160..29623da 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, u32 num_clusters, unsigned int e_flags) { int ret, delete, index, credits = 0; - u32 new_bit, new_len; + u32 new_bit, new_len, orig_num_clusters; unsigned int set_len; struct ocfs2_super *osb = OCFS2_SB(sb); handle_t *handle; @@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, goto out; } + orig_num_clusters = num_clusters; + while (num_clusters) { ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, p_cluster, num_clusters, @@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb, * in write-back mode. */ if (context->get_clusters == ocfs2_di_get_clusters) { - ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); + ret = ocfs2_cow_sync_writeback(sb, context, cpos, + orig_num_clusters); if (ret) mlog_errno(ret); } @@ -4376,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path, if (IS_ERR(s)) return PTR_ERR(s); - error = path_lookup(s, LOOKUP_PARENT, nd); + error = kern_path_parent(s, nd); if (error) putname(s); else diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 38f986d..36c423f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb, struct mount_options *mopt, int is_remount) { - int status; + int status, user_stack = 0; char *p; u32 tmp; @@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb, memcpy(mopt->cluster_stack, args[0].from, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; + /* + * Open code the memcmp here as we don't have + * an osb to pass to + * ocfs2_userspace_stack(). + */ + if (memcmp(mopt->cluster_stack, + OCFS2_CLASSIC_CLUSTER_STACK, + OCFS2_STACK_LABEL_LEN)) + user_stack = 1; break; case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; @@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb, } } - /* Ensure only one heartbeat mode */ - tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | - OCFS2_MOUNT_HB_NONE); - if (hweight32(tmp) != 1) { - mlog(ML_ERROR, "Invalid heartbeat mount options\n"); - status = 0; - goto bail; + if (user_stack == 0) { + /* Ensure only one heartbeat mode */ + tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | + OCFS2_MOUNT_HB_GLOBAL | + OCFS2_MOUNT_HB_NONE); + if (hweight32(tmp) != 1) { + mlog(ML_ERROR, "Invalid heartbeat mount options\n"); + status = 0; + goto bail; + } } status = 1; @@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(file->f_mode & FMODE_WRITE)) return -EBADF; + + /* It's not possible punch hole on append only file */ + if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode)) + return -EPERM; + + if (IS_IMMUTABLE(inode)) + return -EPERM; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. @@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, { struct path path; int error = -EINVAL; - int follow; + int lookup_flags; - if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) goto out; - follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; - error = user_path_at(dfd, filename, follow, &path); + lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + error = user_path_at(dfd, filename, lookup_flags, &path); if (error) goto out; error = mnt_want_write(path.mnt); @@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int (*open)(struct inode *, struct file *), const struct cred *cred) { + static const struct file_operations empty_fops = {}; struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + + if (unlikely(f->f_flags & O_PATH)) + f->f_mode = FMODE_PATH; + inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { error = __get_file_write_access(inode, mnt); @@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_path.dentry = dentry; f->f_path.mnt = mnt; f->f_pos = 0; - f->f_op = fops_get(inode->i_fop); file_sb_list_add(f, inode->i_sb); + if (unlikely(f->f_mode & FMODE_PATH)) { + f->f_op = &empty_fops; + return f; + } + + f->f_op = fops_get(inode->i_fop); + error = security_dentry_open(f, cred); if (error) goto cleanup_all; @@ -882,15 +903,110 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +static inline int build_open_flags(int flags, int mode, struct open_flags *op) +{ + int lookup_flags = 0; + int acc_mode; + + if (!(flags & O_CREAT)) + mode = 0; + op->mode = mode; + + /* Must never be set by userspace */ + flags &= ~FMODE_NONOTIFY; + + /* + * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only + * check for O_DSYNC if the need any syncing at all we enforce it's + * always set instead of having to deal with possibly weird behaviour + * for malicious applications setting only __O_SYNC. + */ + if (flags & __O_SYNC) + flags |= O_DSYNC; + + /* + * If we have O_PATH in the open flag. Then we + * cannot have anything other than the below set of flags + */ + if (flags & O_PATH) { + flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; + acc_mode = 0; + } else { + acc_mode = MAY_OPEN | ACC_MODE(flags); + } + + op->open_flag = flags; + + /* O_TRUNC implies we need access checks for write permissions */ + if (flags & O_TRUNC) + acc_mode |= MAY_WRITE; + + /* Allow the LSM permission hook to distinguish append + access from general write access. */ + if (flags & O_APPEND) + acc_mode |= MAY_APPEND; + + op->acc_mode = acc_mode; + + op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN; + + if (flags & O_CREAT) { + op->intent |= LOOKUP_CREATE; + if (flags & O_EXCL) + op->intent |= LOOKUP_EXCL; + } + + if (flags & O_DIRECTORY) + lookup_flags |= LOOKUP_DIRECTORY; + if (!(flags & O_NOFOLLOW)) + lookup_flags |= LOOKUP_FOLLOW; + return lookup_flags; +} + +/** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); + return do_filp_open(AT_FDCWD, filename, &op, lookup); +} +EXPORT_SYMBOL(filp_open); + +struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, + const char *filename, int flags) +{ + struct open_flags op; + int lookup = build_open_flags(flags, 0, &op); + if (flags & O_CREAT) + return ERR_PTR(-EINVAL); + if (!filename && (flags & O_DIRECTORY)) + if (!dentry->d_inode->i_op->lookup) + return ERR_PTR(-ENOTDIR); + return do_file_open_root(dentry, mnt, filename, &op, lookup); +} +EXPORT_SYMBOL(file_open_root); + long do_sys_open(int dfd, const char __user *filename, int flags, int mode) { + struct open_flags op; + int lookup = build_open_flags(flags, mode, &op); char *tmp = getname(filename); int fd = PTR_ERR(tmp); if (!IS_ERR(tmp)) { fd = get_unused_fd_flags(flags); if (fd >= 0) { - struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); + struct file *f = do_filp_open(dfd, tmp, &op, lookup); if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); @@ -960,8 +1076,10 @@ int filp_close(struct file *filp, fl_owner_t id) if (filp->f_op && filp->f_op->flush) retval = filp->f_op->flush(filp, id); - dnotify_flush(filp, id); - locks_remove_posix(filp, id); + if (likely(!(filp->f_mode & FMODE_PATH))) { + dnotify_flush(filp, id); + locks_remove_posix(filp, id); + } fput(filp); return retval; } diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index 789c625..b10e354 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) } vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + vm->vblk_offset = get_unaligned_be32(data + 0x0C); vm->last_vblk_seq = get_unaligned_be32(data + 0x04); diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c index 48cec7c..be03a0b 100644 --- a/fs/partitions/osf.c +++ b/fs/partitions/osf.c @@ -10,10 +10,13 @@ #include "check.h" #include "osf.h" +#define MAX_OSF_PARTITIONS 8 + int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; + unsigned int npartitions; Sector sect; unsigned char *data; struct disklabel { @@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state) u8 p_fstype; u8 p_frag; __le16 p_cpg; - } d_partitions[8]; + } d_partitions[MAX_OSF_PARTITIONS]; } * label; struct d_partition * partition; @@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state) put_dev_sector(sect); return 0; } - for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { if (slot == state->limit) break; if (le32_to_cpu(partition->p_size)) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d096e8..d49c4b5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = { &proc_self_inode_operations, NULL, {}), }; -/* - * Exceptional case: normally we are not allowed to unhash a busy - * directory. In this case, however, we can do it - no aliasing problems - * due to the way we treat inodes. - */ -static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) -{ - struct inode *inode; - struct task_struct *task; - - if (nd->flags & LOOKUP_RCU) - return -ECHILD; - - inode = dentry->d_inode; - task = get_proc_task(inode); - if (task) { - put_task_struct(task); - return 1; - } - d_drop(dentry); - return 0; -} - -static const struct dentry_operations proc_base_dentry_operations = -{ - .d_revalidate = proc_base_revalidate, - .d_delete = pid_delete_dentry, -}; - static struct dentry *proc_base_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { @@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (p->fop) inode->i_fop = p->fop; ei->op = p->op; - d_set_d_op(dentry, &proc_base_dentry_operations); d_add(dentry, inode); error = NULL; out: diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 176ce4c..d6a7ca1 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -27,6 +27,7 @@ static void proc_evict_inode(struct inode *inode) { struct proc_dir_entry *de; + struct ctl_table_header *head; truncate_inode_pages(&inode->i_data, 0); end_writeback(inode); @@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode) de = PROC_I(inode)->pde; if (de) pde_put(de); - if (PROC_I(inode)->sysctl) - sysctl_head_put(PROC_I(inode)->sysctl); + head = PROC_I(inode)->sysctl; + if (head) { + rcu_assign_pointer(PROC_I(inode)->sysctl, NULL); + sysctl_head_put(head); + } } struct vfsmount *proc_mnt; diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index d9396a4..927cbd1 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -233,7 +233,7 @@ void __init proc_device_tree_init(void) return; root = of_find_node_by_path("/"); if (root == NULL) { - printk(KERN_ERR "/proc/device-tree: can't find root\n"); + pr_debug("/proc/device-tree: can't find root\n"); return; } proc_device_tree_add_node(root, proc_device_tree); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 09a1f92..8eb2522 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -408,15 +408,18 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry, const struct inode *inode, unsigned int len, const char *str, const struct qstr *name) { + struct ctl_table_header *head; /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ + /* AV: can it, indeed? */ if (!inode) - return 0; + return 1; if (name->len != len) return 1; if (memcmp(name->name, str, len)) return 1; - return !sysctl_is_seen(PROC_I(inode)->sysctl); + head = rcu_dereference(PROC_I(inode)->sysctl); + return !head || !sysctl_is_seen(head); } static const struct dentry_operations proc_sys_dentry_operations = { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 0bae036..1bba24b 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp, struct inode *inode = dentry->d_inode; int maxlen = *lenp; - if (maxlen < 3) + if (need_parent && (maxlen < 5)) { + *lenp = 5; return 255; + } else if (maxlen < 3) { + *lenp = 3; + return 255; + } data[0] = inode->i_ino; data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index ba5f51e..4b2eb56 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, dentry, inode, &security); if (retval) { - dir->i_nlink--; + DEC_DIR_INODE_NLINK(dir) goto out_failed; } @@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, reiserfs_write_unlock(dir->i_sb); return -EMLINK; } - if (inode->i_nlink == 0) { - reiserfs_write_unlock(dir->i_sb); - return -ENOENT; - } /* inc before scheduling so reiserfs_unlink knows we are here */ inc_nlink(inode); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 3cfb2e9..5c11ca8 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags) static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { - if (nd->flags & LOOKUP_RCU) - return -ECHILD; return -EPERM; } @@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int error = -EINVAL; int lookup_flags = 0; - if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) + if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | + AT_EMPTY_PATH)) != 0) goto out; if (!(flag & AT_SYMLINK_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; if (flag & AT_NO_AUTOMOUNT) lookup_flags |= LOOKUP_NO_AUTOMOUNT; + if (flag & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; error = user_path_at(dfd, filename, lookup_flags, &path); if (error) @@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, if (bufsiz <= 0) return -EINVAL; - error = user_path_at(dfd, pathname, 0, &path); + error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path); if (!error) { struct inode *inode = path.dentry->d_inode; diff --git a/fs/statfs.c b/fs/statfs.c index 30ea8c8..8244924 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf) } EXPORT_SYMBOL(vfs_statfs); -static int do_statfs_native(struct path *path, struct statfs *buf) +int user_statfs(const char __user *pathname, struct kstatfs *st) { - struct kstatfs st; - int retval; + struct path path; + int error = user_path(pathname, &path); + if (!error) { + error = vfs_statfs(&path, st); + path_put(&path); + } + return error; +} - retval = vfs_statfs(path, &st); - if (retval) - return retval; +int fd_statfs(int fd, struct kstatfs *st) +{ + struct file *file = fget(fd); + int error = -EBADF; + if (file) { + error = vfs_statfs(&file->f_path, st); + fput(file); + } + return error; +} - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); +static int do_statfs_native(struct kstatfs *st, struct statfs __user *p) +{ + struct statfs buf; + + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - if (sizeof buf->f_blocks == 4) { - if ((st.f_blocks | st.f_bfree | st.f_bavail | - st.f_bsize | st.f_frsize) & + if (sizeof buf.f_blocks == 4) { + if ((st->f_blocks | st->f_bfree | st->f_bavail | + st->f_bsize | st->f_frsize) & 0xffffffff00000000ULL) return -EOVERFLOW; /* * f_files and f_ffree may be -1; it's okay to stuff * that into 32 bits */ - if (st.f_files != -1 && - (st.f_files & 0xffffffff00000000ULL)) + if (st->f_files != -1 && + (st->f_files & 0xffffffff00000000ULL)) return -EOVERFLOW; - if (st.f_ffree != -1 && - (st.f_ffree & 0xffffffff00000000ULL)) + if (st->f_ffree != -1 && + (st->f_ffree & 0xffffffff00000000ULL)) return -EOVERFLOW; } - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } -static int do_statfs64(struct path *path, struct statfs64 *buf) +static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p) { - struct kstatfs st; - int retval; - - retval = vfs_statfs(path, &st); - if (retval) - return retval; - - if (sizeof(*buf) == sizeof(st)) - memcpy(buf, &st, sizeof(st)); + struct statfs64 buf; + if (sizeof(buf) == sizeof(*st)) + memcpy(&buf, st, sizeof(*st)); else { - buf->f_type = st.f_type; - buf->f_bsize = st.f_bsize; - buf->f_blocks = st.f_blocks; - buf->f_bfree = st.f_bfree; - buf->f_bavail = st.f_bavail; - buf->f_files = st.f_files; - buf->f_ffree = st.f_ffree; - buf->f_fsid = st.f_fsid; - buf->f_namelen = st.f_namelen; - buf->f_frsize = st.f_frsize; - buf->f_flags = st.f_flags; - memset(buf->f_spare, 0, sizeof(buf->f_spare)); + buf.f_type = st->f_type; + buf.f_bsize = st->f_bsize; + buf.f_blocks = st->f_blocks; + buf.f_bfree = st->f_bfree; + buf.f_bavail = st->f_bavail; + buf.f_files = st->f_files; + buf.f_ffree = st->f_ffree; + buf.f_fsid = st->f_fsid; + buf.f_namelen = st->f_namelen; + buf.f_frsize = st->f_frsize; + buf.f_flags = st->f_flags; + memset(buf.f_spare, 0, sizeof(buf.f_spare)); } + if (copy_to_user(p, &buf, sizeof(buf))) + return -EFAULT; return 0; } SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) { - struct path path; - int error; - - error = user_path(pathname, &path); - if (!error) { - struct statfs tmp; - error = do_statfs_native(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + struct kstatfs st; + int error = user_statfs(pathname, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) { - struct path path; - long error; - + struct kstatfs st; + int error; if (sz != sizeof(*buf)) return -EINVAL; - error = user_path(pathname, &path); - if (!error) { - struct statfs64 tmp; - error = do_statfs64(&path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - path_put(&path); - } + error = user_statfs(pathname, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) { - struct file *file; - struct statfs tmp; - int error; - - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs_native(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + struct kstatfs st; + int error = fd_statfs(fd, &st); + if (!error) + error = do_statfs_native(&st, buf); return error; } SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) { - struct file *file; - struct statfs64 tmp; + struct kstatfs st; int error; if (sz != sizeof(*buf)) return -EINVAL; - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - error = do_statfs64(&file->f_path, &tmp); - if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) - error = -EFAULT; - fput(file); -out: + error = fd_statfs(fd, &st); + if (!error) + error = do_statfs64(&st, buf); return error; } diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index b427b12..e474fbc 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, new_de = sysv_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); sysv_set_link(new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) goto out_dir; } - inode_inc_link_count(old_inode); err = sysv_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } sysv_delete_entry(old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { sysv_set_link(dir_de, dir_page, new_dir); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 14f64b6..7217d67 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - /* - * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing - * otherwise has the potential to corrupt the orphan inode list. - * - * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and - * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not - * lock 'dirA->i_mutex', so this is possible. Both of the functions - * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes - * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this - * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' - * to the list of orphans. After this, 'vfs_link()' will link - * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, - * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode - * to the list of orphans. - */ - if (inode->i_nlink == 0) - return -ENOENT; - err = dbg_check_synced_i_size(inode); if (err) return err; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 2be0f9e..f1dce84 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -32,6 +32,8 @@ #include <linux/crc-itu-t.h> #include <linux/exportfs.h> +enum { UDF_MAX_LINKS = 0xffff }; + static inline int udf_match(int len1, const unsigned char *name1, int len2, const unsigned char *name2) { @@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode) struct udf_inode_info *iinfo; err = -EMLINK; - if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) + if (dir->i_nlink >= UDF_MAX_LINKS) goto out; err = -EIO; @@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, struct fileIdentDesc cfi, *fi; int err; - if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { + if (inode->i_nlink >= UDF_MAX_LINKS) return -EMLINK; - } fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (!fi) { @@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; retval = -EMLINK; - if (!new_inode && - new_dir->i_nlink >= - (256 << sizeof(new_dir->i_nlink)) - 1) + if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS) goto end_rename; } if (!nfi) { @@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp, struct fid *fid = (struct fid *)fh; int type = FILEID_UDF_WITHOUT_PARENT; - if (len < 3 || (connectable && len < 5)) + if (connectable && (len < 5)) { + *lenp = 5; return 255; + } else if (len < 3) { + *lenp = 3; + return 255; + } *lenp = 3; fid->udf.block = location.logicalBlockNum; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 12f39b9..d6f6815 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -306,7 +306,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - inode_inc_link_count(old_inode); ufs_set_link(new_dir, new_de, new_page, old_inode); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) @@ -318,12 +317,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, if (new_dir->i_nlink >= UFS_LINK_MAX) goto out_dir; } - inode_inc_link_count(old_inode); err = ufs_add_link(new_dentry, old_inode); - if (err) { - inode_dec_link_count(old_inode); + if (err) goto out_dir; - } if (dir_de) inode_inc_link_count(new_dir); } @@ -331,12 +327,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, /* * Like most other Unix systems, set the ctime for inodes on a * rename. - * inode_dec_link_count() will mark the inode dirty. */ old_inode->i_ctime = CURRENT_TIME_SEC; ufs_delete_entry(old_dir, old_de, old_page); - inode_dec_link_count(old_inode); + mark_inode_dirty(old_inode); if (dir_de) { ufs_set_link(old_inode, dir_de, dir_page, new_dir); diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c index 05201ae..d61611c 100644 --- a/fs/xfs/linux-2.6/xfs_discard.c +++ b/fs/xfs/linux-2.6/xfs_discard.c @@ -152,6 +152,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -XFS_ERROR(EPERM); + if (!blk_queue_discard(q)) + return -XFS_ERROR(EOPNOTSUPP); if (copy_from_user(&range, urange, sizeof(range))) return -XFS_ERROR(EFAULT); diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index fc0114d..f4f878f 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -89,8 +89,10 @@ xfs_fs_encode_fh( * seven combinations work. The real answer is "don't use v2". */ len = xfs_fileid_length(fileid_type); - if (*max_len < len) + if (*max_len < len) { + *max_len = len; return 255; + } *max_len = len; switch (fileid_type) { diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index f5e2a19..0ca0e3c 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1( xfs_mount_t *mp, void __user *arg) { - xfs_fsop_geom_v1_t fsgeo; + xfs_fsop_geom_t fsgeo; int error; - error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); + error = xfs_fs_geometry(mp, &fsgeo, 3); if (error) return -error; - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) return -XFS_ERROR(EFAULT); return 0; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cec89dd..85668ef 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -53,6 +53,9 @@ xfs_fs_geometry( xfs_fsop_geom_t *geo, int new_version) { + + memset(geo, 0, sizeof(*geo)); + geo->blocksize = mp->m_sb.sb_blocksize; geo->rtextsize = mp->m_sb.sb_rextsize; geo->agblocks = mp->m_sb.sb_agblocks; |