diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/afs/flock.c | 1 | ||||
-rw-r--r-- | fs/aio.c | 24 | ||||
-rw-r--r-- | fs/binfmt_elf.c | 9 | ||||
-rw-r--r-- | fs/bio-integrity.c | 170 | ||||
-rw-r--r-- | fs/bio.c | 11 | ||||
-rw-r--r-- | fs/btrfs/async-thread.c | 2 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 3 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 566 | ||||
-rw-r--r-- | fs/btrfs/file.c | 5 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 25 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 6 | ||||
-rw-r--r-- | fs/btrfs/relocation.c | 5 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 4 | ||||
-rw-r--r-- | fs/compat.c | 4 | ||||
-rw-r--r-- | fs/eventfd.c | 122 | ||||
-rw-r--r-- | fs/exec.c | 4 | ||||
-rw-r--r-- | fs/ext2/namei.c | 12 | ||||
-rw-r--r-- | fs/fuse/dev.c | 83 | ||||
-rw-r--r-- | fs/fuse/dir.c | 57 | ||||
-rw-r--r-- | fs/fuse/file.c | 2 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 27 | ||||
-rw-r--r-- | fs/fuse/inode.c | 68 | ||||
-rw-r--r-- | fs/hostfs/hostfs_kern.c | 1 | ||||
-rw-r--r-- | fs/jffs2/scan.c | 4 | ||||
-rw-r--r-- | fs/nfsd/vfs.c | 3 | ||||
-rw-r--r-- | fs/notify/inotify/inotify_user.c | 3 | ||||
-rw-r--r-- | fs/sync.c | 5 |
27 files changed, 933 insertions, 293 deletions
diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 210acaf..3ff8bdd 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -432,7 +432,6 @@ vfs_rejected_lock: list_del_init(&fl->fl_u.afs.link); if (list_empty(&vnode->granted_locks)) afs_defer_unlock(vnode, key); - spin_unlock(&vnode->lock); goto abort_attempt; } @@ -485,6 +485,8 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) { assert_spin_locked(&ctx->ctx_lock); + if (req->ki_eventfd != NULL) + eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); if (req->ki_iovec != &req->ki_inline_vec) @@ -509,8 +511,6 @@ static void aio_fput_routine(struct work_struct *data) /* Complete the fput(s) */ if (req->ki_filp != NULL) __fput(req->ki_filp); - if (req->ki_eventfd != NULL) - __fput(req->ki_eventfd); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -528,8 +528,6 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { - int schedule_putreq = 0; - dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", req, atomic_long_read(&req->ki_filp->f_count)); @@ -549,24 +547,16 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) * we would not be holding the last reference to the file*, so * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) - schedule_putreq++; - else - req->ki_filp = NULL; - if (req->ki_eventfd != NULL) { - if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count))) - schedule_putreq++; - else - req->ki_eventfd = NULL; - } - if (unlikely(schedule_putreq)) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); spin_unlock(&fput_lock); queue_work(aio_wq, &fput_work); - } else + } else { + req->ki_filp = NULL; really_put_req(ctx, req); + } return 1; } @@ -1622,7 +1612,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, * an eventfd() fd, and will be signaled for each completed * event using the eventfd_signal() function. */ - req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); + req->ki_eventfd = eventfd_ctx_fdget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); req->ki_eventfd = NULL; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 9fa212b..b7c1603 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1522,11 +1522,11 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread = NULL; psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); - if (psinfo == NULL) return 0; + fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + /* * Figure out how many notes we're going to need for each thread. */ @@ -1929,7 +1929,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un elf = kmalloc(sizeof(*elf), GFP_KERNEL); if (!elf) goto out; - + /* + * The number of segs are recored into ELF header as 16bit value. + * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. + */ segs = current->mm->map_count; #ifdef ELF_CORE_EXTRA_PHDRS segs += ELF_CORE_EXTRA_PHDRS; diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 31c46a2..49a34e7 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -1,7 +1,7 @@ /* * bio-integrity.c - bio data integrity extensions * - * Copyright (C) 2007, 2008 Oracle Corporation + * Copyright (C) 2007, 2008, 2009 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> * * This program is free software; you can redistribute it and/or @@ -25,63 +25,121 @@ #include <linux/bio.h> #include <linux/workqueue.h> -static struct kmem_cache *bio_integrity_slab __read_mostly; -static mempool_t *bio_integrity_pool; -static struct bio_set *integrity_bio_set; +struct integrity_slab { + struct kmem_cache *slab; + unsigned short nr_vecs; + char name[8]; +}; + +#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } +struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { + IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), +}; +#undef IS + static struct workqueue_struct *kintegrityd_wq; +static inline unsigned int vecs_to_idx(unsigned int nr) +{ + switch (nr) { + case 1: + return 0; + case 2 ... 4: + return 1; + case 5 ... 16: + return 2; + case 17 ... 64: + return 3; + case 65 ... 128: + return 4; + case 129 ... BIO_MAX_PAGES: + return 5; + default: + BUG(); + } +} + +static inline int use_bip_pool(unsigned int idx) +{ + if (idx == BIOVEC_NR_POOLS) + return 1; + + return 0; +} + /** - * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements + * @bs: bio_set to allocate from * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached. */ -struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, - gfp_t gfp_mask, - unsigned int nr_vecs) +struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs, + struct bio_set *bs) { struct bio_integrity_payload *bip; - struct bio_vec *iv; - unsigned long idx; + unsigned int idx = vecs_to_idx(nr_vecs); BUG_ON(bio == NULL); + bip = NULL; - bip = mempool_alloc(bio_integrity_pool, gfp_mask); - if (unlikely(bip == NULL)) { - printk(KERN_ERR "%s: could not alloc bip\n", __func__); - return NULL; - } + /* Lower order allocations come straight from slab */ + if (!use_bip_pool(idx)) + bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); - memset(bip, 0, sizeof(*bip)); + /* Use mempool if lower order alloc failed or max vecs were requested */ + if (bip == NULL) { + bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set); - if (unlikely(iv == NULL)) { - printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__); - mempool_free(bip, bio_integrity_pool); - return NULL; + if (unlikely(bip == NULL)) { + printk(KERN_ERR "%s: could not alloc bip\n", __func__); + return NULL; + } } - bip->bip_pool = idx; - bip->bip_vec = iv; + memset(bip, 0, sizeof(*bip)); + + bip->bip_slab = idx; bip->bip_bio = bio; bio->bi_integrity = bip; return bip; } +EXPORT_SYMBOL(bio_integrity_alloc_bioset); + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set); +} EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed + * @bs: bio_set this bio was allocated from * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ -void bio_integrity_free(struct bio *bio) +void bio_integrity_free(struct bio *bio, struct bio_set *bs) { struct bio_integrity_payload *bip = bio->bi_integrity; @@ -92,8 +150,10 @@ void bio_integrity_free(struct bio *bio) && bip->bip_buf != NULL) kfree(bip->bip_buf); - bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool); - mempool_free(bip, bio_integrity_pool); + if (use_bip_pool(bip->bip_slab)) + mempool_free(bip, bs->bio_integrity_pool); + else + kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); bio->bi_integrity = NULL; } @@ -114,7 +174,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_vec *iv; - if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) { + if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } @@ -647,8 +707,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->iv1 = bip->bip_vec[0]; bp->iv2 = bip->bip_vec[0]; - bp->bip1.bip_vec = &bp->iv1; - bp->bip2.bip_vec = &bp->iv2; + bp->bip1.bip_vec[0] = bp->iv1; + bp->bip2.bip_vec[0] = bp->iv2; bp->iv1.bv_len = sectors * bi->tuple_size; bp->iv2.bv_offset += sectors * bi->tuple_size; @@ -667,17 +727,19 @@ EXPORT_SYMBOL(bio_integrity_split); * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask + * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ -int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; @@ -693,25 +755,43 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_integrity_clone); -static int __init bio_integrity_init(void) +int bioset_integrity_create(struct bio_set *bs, int pool_size) { - kintegrityd_wq = create_workqueue("kintegrityd"); + unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); + + bs->bio_integrity_pool = + mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + if (!bs->bio_integrity_pool) + return -1; + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + if (bs->bio_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); +} +EXPORT_SYMBOL(bioset_integrity_free); + +void __init bio_integrity_init(void) +{ + unsigned int i; + + kintegrityd_wq = create_workqueue("kintegrityd"); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); - bio_integrity_slab = KMEM_CACHE(bio_integrity_payload, - SLAB_HWCACHE_ALIGN|SLAB_PANIC); + for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { + unsigned int size; - bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE, - bio_integrity_slab); - if (!bio_integrity_pool) - panic("bio_integrity: can't allocate bip pool\n"); + size = sizeof(struct bio_integrity_payload) + + bip_slab[i].nr_vecs * sizeof(struct bio_vec); - integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0); - if (!integrity_bio_set) - panic("bio_integrity: can't allocate bio_set\n"); - - return 0; + bip_slab[i].slab = + kmem_cache_create(bip_slab[i].name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } } -subsys_initcall(bio_integrity_init); @@ -238,7 +238,7 @@ void bio_free(struct bio *bio, struct bio_set *bs) bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); if (bio_integrity(bio)) - bio_integrity_free(bio); + bio_integrity_free(bio, bs); /* * If we have front padding, adjust the bio pointer before freeing @@ -341,7 +341,7 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) static void bio_kmalloc_destructor(struct bio *bio) { if (bio_integrity(bio)) - bio_integrity_free(bio); + bio_integrity_free(bio, fs_bio_set); kfree(bio); } @@ -472,7 +472,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, gfp_mask); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) { bio_put(b); @@ -1539,6 +1539,7 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + bioset_integrity_free(bs); biovec_free_pools(bs); bio_put_slab(bs); @@ -1579,6 +1580,9 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; + if (bioset_integrity_create(bs, pool_size)) + goto bad; + if (!biovec_create_pools(bs, pool_size)) return bs; @@ -1616,6 +1620,7 @@ static int __init init_bio(void) if (!bio_slabs) panic("bio: can't allocate bios\n"); + bio_integrity_init(); biovec_init_slabs(); fs_bio_set = bioset_create(BIO_POOL_SIZE, 0); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 7f88628..6e4f6c5 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -299,8 +299,8 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) "btrfs-%s-%d", workers->name, workers->num_workers + i); if (IS_ERR(worker->task)) { - kfree(worker); ret = PTR_ERR(worker->task); + kfree(worker); goto fail; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 2779c2f..98a8738 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2074,8 +2074,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root); +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index edc7d20..a5aca39 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -990,15 +990,13 @@ static inline int extent_ref_type(u64 parent, u64 owner) return type; } -static int find_next_key(struct btrfs_path *path, struct btrfs_key *key) +static int find_next_key(struct btrfs_path *path, int level, + struct btrfs_key *key) { - int level; - BUG_ON(!path->keep_locks); - for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + for (; level < BTRFS_MAX_LEVEL; level++) { if (!path->nodes[level]) break; - btrfs_assert_tree_locked(path->nodes[level]); if (path->slots[level] + 1 >= btrfs_header_nritems(path->nodes[level])) continue; @@ -1158,7 +1156,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, * For simplicity, we just do not add new inline back * ref if there is any kind of item for this block */ - if (find_next_key(path, &key) == 0 && key.objectid == bytenr && + if (find_next_key(path, 0, &key) == 0 && + key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { err = -EAGAIN; goto out; @@ -2697,7 +2696,7 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use" + "%llu bytes_pinned, %llu bytes_readonly, %llu may use " "%llu total\n", (unsigned long long)bytes, (unsigned long long)data_sinfo->bytes_delalloc, (unsigned long long)data_sinfo->bytes_used, @@ -4128,6 +4127,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return buf; } +#if 0 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { @@ -4171,8 +4171,6 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, return 0; } -#if 0 - static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_leaf_ref *ref) @@ -4553,262 +4551,471 @@ out: } #endif +struct walk_control { + u64 refs[BTRFS_MAX_LEVEL]; + u64 flags[BTRFS_MAX_LEVEL]; + struct btrfs_key update_progress; + int stage; + int level; + int shared_level; + int update_ref; + int keep_locks; +}; + +#define DROP_REFERENCE 1 +#define UPDATE_BACKREF 2 + /* - * helper function for drop_subtree, this function is similar to - * walk_down_tree. The main difference is that it checks reference - * counts while tree blocks are locked. + * hepler to process tree block while walking down the tree. + * + * when wc->stage == DROP_REFERENCE, this function checks + * reference count of the block. if the block is shared and + * we need update back refs for the subtree rooted at the + * block, this function changes wc->stage to UPDATE_BACKREF + * + * when wc->stage == UPDATE_BACKREF, this function updates + * back refs for pointers in the block. + * + * NOTE: return value 1 means we should stop walking down. */ -static noinline int walk_down_tree(struct btrfs_trans_handle *trans, +static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_path *path, int *level) + struct btrfs_path *path, + struct walk_control *wc) { - struct extent_buffer *next; - struct extent_buffer *cur; - struct extent_buffer *parent; - u64 bytenr; - u64 ptr_gen; - u64 refs; - u64 flags; - u32 blocksize; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + struct btrfs_key key; + u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; int ret; - cur = path->nodes[*level]; - ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len, - &refs, &flags); - BUG_ON(ret); - if (refs > 1) - goto out; + if (wc->stage == UPDATE_BACKREF && + btrfs_header_owner(eb) != root->root_key.objectid) + return 1; - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + /* + * when reference count of tree block is 1, it won't increase + * again. once full backref flag is set, we never clear it. + */ + if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || + (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) { + BUG_ON(!path->locks[level]); + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + } - while (*level >= 0) { - cur = path->nodes[*level]; - if (*level == 0) { - ret = btrfs_drop_leaf_ref(trans, root, cur); - BUG_ON(ret); - clean_tree_block(trans, root, cur); - break; - } - if (path->slots[*level] >= btrfs_header_nritems(cur)) { - clean_tree_block(trans, root, cur); - break; + if (wc->stage == DROP_REFERENCE && + wc->update_ref && wc->refs[level] > 1) { + BUG_ON(eb == root->node); + BUG_ON(path->slots[level] > 0); + if (level == 0) + btrfs_item_key_to_cpu(eb, &key, path->slots[level]); + else + btrfs_node_key_to_cpu(eb, &key, path->slots[level]); + if (btrfs_header_owner(eb) == root->root_key.objectid && + btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) { + wc->stage = UPDATE_BACKREF; + wc->shared_level = level; } + } - bytenr = btrfs_node_blockptr(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); - ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + if (wc->stage == DROP_REFERENCE) { + if (wc->refs[level] > 1) + return 1; - next = read_tree_block(root, bytenr, blocksize, ptr_gen); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + if (path->locks[level] && !wc->keep_locks) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; + } - ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize, - &refs, &flags); + /* wc->stage == UPDATE_BACKREF */ + if (!(wc->flags[level] & flag)) { + BUG_ON(!path->locks[level]); + ret = btrfs_inc_ref(trans, root, eb, 1); BUG_ON(ret); - if (refs > 1) { - parent = path->nodes[*level]; - ret = btrfs_free_extent(trans, root, bytenr, - blocksize, parent->start, - btrfs_header_owner(parent), - *level - 1, 0); + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + eb->len, flag, 0); + BUG_ON(ret); + wc->flags[level] |= flag; + } + + /* + * the block is shared by multiple trees, so it's not good to + * keep the tree lock + */ + if (path->locks[level] && level > 0) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + } + return 0; +} + +/* + * hepler to process tree block while walking up the tree. + * + * when wc->stage == DROP_REFERENCE, this function drops + * reference count on the block. + * + * when wc->stage == UPDATE_BACKREF, this function changes + * wc->stage back to DROP_REFERENCE if we changed wc->stage + * to UPDATE_BACKREF previously while processing the block. + * + * NOTE: return value 1 means we should stop walking up. + */ +static noinline int walk_up_proc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + int ret = 0; + int level = wc->level; + struct extent_buffer *eb = path->nodes[level]; + u64 parent = 0; + + if (wc->stage == UPDATE_BACKREF) { + BUG_ON(wc->shared_level < level); + if (level < wc->shared_level) + goto out; + + BUG_ON(wc->refs[level] <= 1); + ret = find_next_key(path, level + 1, &wc->update_progress); + if (ret > 0) + wc->update_ref = 0; + + wc->stage = DROP_REFERENCE; + wc->shared_level = -1; + path->slots[level] = 0; + + /* + * check reference count again if the block isn't locked. + * we should start walking down the tree again if reference + * count is one. + */ + if (!path->locks[level]) { + BUG_ON(level == 0); + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + + ret = btrfs_lookup_extent_info(trans, root, + eb->start, eb->len, + &wc->refs[level], + &wc->flags[level]); BUG_ON(ret); - path->slots[*level]++; - btrfs_tree_unlock(next); - free_extent_buffer(next); - continue; + BUG_ON(wc->refs[level] == 0); + if (wc->refs[level] == 1) { + btrfs_tree_unlock(eb); + path->locks[level] = 0; + return 1; + } + } else { + BUG_ON(level != 0); } + } - BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)); + /* wc->stage == DROP_REFERENCE */ + BUG_ON(wc->refs[level] > 1 && !path->locks[level]); - *level = btrfs_header_level(next); - path->nodes[*level] = next; - path->slots[*level] = 0; - path->locks[*level] = 1; - cond_resched(); + if (wc->refs[level] == 1) { + if (level == 0) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + ret = btrfs_dec_ref(trans, root, eb, 1); + else + ret = btrfs_dec_ref(trans, root, eb, 0); + BUG_ON(ret); + } + /* make block locked assertion in clean_tree_block happy */ + if (!path->locks[level] && + btrfs_header_generation(eb) == trans->transid) { + btrfs_tree_lock(eb); + btrfs_set_lock_blocking(eb); + path->locks[level] = 1; + } + clean_tree_block(trans, root, eb); + } + + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(eb)); + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; + else + BUG_ON(root->root_key.objectid != + btrfs_header_owner(path->nodes[level + 1])); } -out: - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - bytenr = path->nodes[*level]->start; - blocksize = path->nodes[*level]->len; - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start, - btrfs_header_owner(parent), *level, 0); + ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, + root->root_key.objectid, level, 0); BUG_ON(ret); +out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return ret; +} + +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct walk_control *wc) +{ + struct extent_buffer *next; + struct extent_buffer *cur; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + int level = wc->level; + int ret; + + while (level >= 0) { + cur = path->nodes[level]; + BUG_ON(path->slots[level] >= btrfs_header_nritems(cur)); - if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[*level]); - path->locks[*level] = 0; + ret = walk_down_proc(trans, root, path, wc); + if (ret > 0) + break; + + if (level == 0) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[level]); + blocksize = btrfs_level_size(root, level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]); + + next = read_tree_block(root, bytenr, blocksize, ptr_gen); + btrfs_tree_lock(next); + btrfs_set_lock_blocking(next); + + level--; + BUG_ON(level != btrfs_header_level(next)); + path->nodes[level] = next; + path->slots[level] = 0; + path->locks[level] = 1; + wc->level = level; } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level += 1; - cond_resched(); return 0; } -/* - * helper for dropping snapshots. This walks back up the tree in the path - * to find the first node higher up where we haven't yet gone through - * all the slots - */ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int *level, int max_level) + struct walk_control *wc, int max_level) { - struct btrfs_root_item *root_item = &root->root_item; - int i; - int slot; + int level = wc->level; int ret; - for (i = *level; i < max_level && path->nodes[i]; i++) { - slot = path->slots[i]; - if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { - /* - * there is more work to do in this level. - * Update the drop_progress marker to reflect - * the work we've done so far, and then bump - * the slot number - */ - path->slots[i]++; - WARN_ON(*level == 0); - if (max_level == BTRFS_MAX_LEVEL) { - btrfs_node_key(path->nodes[i], - &root_item->drop_progress, - path->slots[i]); - root_item->drop_level = i; - } - *level = i; + path->slots[level] = btrfs_header_nritems(path->nodes[level]); + while (level < max_level && path->nodes[level]) { + wc->level = level; + if (path->slots[level] + 1 < + btrfs_header_nritems(path->nodes[level])) { + path->slots[level]++; return 0; } else { - struct extent_buffer *parent; - - /* - * this whole node is done, free our reference - * on it and go up one level - */ - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; - clean_tree_block(trans, root, path->nodes[i]); - ret = btrfs_free_extent(trans, root, - path->nodes[i]->start, - path->nodes[i]->len, - parent->start, - btrfs_header_owner(parent), - *level, 0); - BUG_ON(ret); - if (path->locks[*level]) { - btrfs_tree_unlock(path->nodes[i]); - path->locks[i] = 0; + if (path->locks[level]) { + btrfs_tree_unlock(path->nodes[level]); + path->locks[level] = 0; } - free_extent_buffer(path->nodes[i]); - path->nodes[i] = NULL; - *level = i + 1; + free_extent_buffer(path->nodes[level]); + path->nodes[level] = NULL; + level++; } } return 1; } /* - * drop the reference count on the tree rooted at 'snap'. This traverses - * the tree freeing any blocks that have a ref count of zero after being - * decremented. + * drop a subvolume tree. + * + * this function traverses the tree freeing any blocks that only + * referenced by the tree. + * + * when a shared tree block is found. this function decreases its + * reference count by one. if update_ref is true, this function + * also make sure backrefs for the shared block and all lower level + * blocks are properly updated. */ -int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root - *root) +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) { - int ret = 0; - int wret; - int level; struct btrfs_path *path; - int update_count; + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = root->fs_info->tree_root; struct btrfs_root_item *root_item = &root->root_item; + struct walk_control *wc; + struct btrfs_key key; + int err = 0; + int ret; + int level; path = btrfs_alloc_path(); BUG_ON(!path); - level = btrfs_header_level(root->node); + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + + trans = btrfs_start_transaction(tree_root, 1); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); btrfs_set_lock_blocking(path->nodes[level]); path->slots[level] = 0; path->locks[level] = 1; + memset(&wc->update_progress, 0, + sizeof(wc->update_progress)); } else { - struct btrfs_key key; - struct btrfs_disk_key found_key; - struct extent_buffer *node; - btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + memcpy(&wc->update_progress, &key, + sizeof(wc->update_progress)); + level = root_item->drop_level; + BUG_ON(level == 0); path->lowest_level = level; - wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (wret < 0) { - ret = wret; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->lowest_level = 0; + if (ret < 0) { + err = ret; goto out; } - node = path->nodes[level]; - btrfs_node_key(node, &found_key, path->slots[level]); - WARN_ON(memcmp(&found_key, &root_item->drop_progress, - sizeof(found_key))); + btrfs_node_key_to_cpu(path->nodes[level], &key, + path->slots[level]); + WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key))); + /* * unlock our path, this is safe because only this * function is allowed to delete this snapshot */ btrfs_unlock_up_safe(path, 0); + + level = btrfs_header_level(root->node); + while (1) { + btrfs_tree_lock(path->nodes[level]); + btrfs_set_lock_blocking(path->nodes[level]); + + ret = btrfs_lookup_extent_info(trans, root, + path->nodes[level]->start, + path->nodes[level]->len, + &wc->refs[level], + &wc->flags[level]); + BUG_ON(ret); + BUG_ON(wc->refs[level] == 0); + + if (level == root_item->drop_level) + break; + + btrfs_tree_unlock(path->nodes[level]); + WARN_ON(wc->refs[level] != 1); + level--; + } } + + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = update_ref; + wc->keep_locks = 0; + while (1) { - unsigned long update; - wret = walk_down_tree(trans, root, path, &level); - if (wret > 0) + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { + err = ret; break; - if (wret < 0) - ret = wret; + } - wret = walk_up_tree(trans, root, path, &level, - BTRFS_MAX_LEVEL); - if (wret > 0) + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { + err = ret; break; - if (wret < 0) - ret = wret; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { - ret = -EAGAIN; + } + + if (ret > 0) { + BUG_ON(wc->stage != DROP_REFERENCE); break; } - for (update_count = 0; update_count < 16; update_count++) { + + if (wc->stage == DROP_REFERENCE) { + level = wc->level; + btrfs_node_key(path->nodes[level], + &root_item->drop_progress, + path->slots[level]); + root_item->drop_level = level; + } + + BUG_ON(wc->level == 0); + if (trans->transaction->in_commit || + trans->transaction->delayed_refs.flushing) { + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + root_item); + BUG_ON(ret); + + btrfs_end_transaction(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 1); + } else { + unsigned long update; update = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (update) - btrfs_run_delayed_refs(trans, root, update); - else - break; + btrfs_run_delayed_refs(trans, tree_root, + update); } } + btrfs_release_path(root, path); + BUG_ON(err); + + ret = btrfs_del_root(trans, tree_root, &root->root_key); + BUG_ON(ret); + + free_extent_buffer(root->node); + free_extent_buffer(root->commit_root); + kfree(root); out: + btrfs_end_transaction(trans, tree_root); + kfree(wc); btrfs_free_path(path); - return ret; + return err; } +/* + * drop subtree rooted at tree block 'node'. + * + * NOTE: this function will unlock and release tree block 'node' + */ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent) { struct btrfs_path *path; + struct walk_control *wc; int level; int parent_level; int ret = 0; int wret; + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + path = btrfs_alloc_path(); BUG_ON(!path); + wc = kzalloc(sizeof(*wc), GFP_NOFS); + BUG_ON(!wc); + btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); extent_buffer_get(parent); @@ -4817,24 +5024,33 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(node); level = btrfs_header_level(node); - extent_buffer_get(node); path->nodes[level] = node; path->slots[level] = 0; + path->locks[level] = 1; + + wc->refs[parent_level] = 1; + wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; + wc->level = level; + wc->shared_level = -1; + wc->stage = DROP_REFERENCE; + wc->update_ref = 0; + wc->keep_locks = 1; while (1) { - wret = walk_down_tree(trans, root, path, &level); - if (wret < 0) + wret = walk_down_tree(trans, root, path, wc); + if (wret < 0) { ret = wret; - if (wret != 0) break; + } - wret = walk_up_tree(trans, root, path, &level, parent_level); + wret = walk_up_tree(trans, root, path, wc, parent_level); if (wret < 0) ret = wret; if (wret != 0) break; } + kfree(wc); btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 126477e..7c3cd24 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -151,7 +151,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, } if (end_pos > isize) { i_size_write(inode, end_pos); - btrfs_update_inode(trans, root, inode); + /* we've only changed i_size in ram, and we haven't updated + * the disk i_size. There is no need to log the inode + * at this time. + */ } err = btrfs_end_transaction(trans, root); out_unlock: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dbe1aab..7ffa3d3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3580,12 +3580,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, owner = 1; BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_hint, owner); - if ((mode & S_IFREG)) { - if (btrfs_test_opt(root, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(root, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - } key[0].objectid = objectid; btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); @@ -3640,6 +3634,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_inherit_iflags(inode, dir); + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(root, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + } + insert_inode_hash(inode); inode_tree_add(inode); return inode; @@ -5082,6 +5083,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; struct btrfs_trans_handle *trans; + struct btrfs_root *root; int ret; alloc_start = offset & ~mask; @@ -5100,6 +5102,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + root = BTRFS_I(inode)->root; + + ret = btrfs_check_data_free_space(root, inode, + alloc_end - alloc_start); + if (ret) + goto out; + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; @@ -5107,7 +5116,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); if (!trans) { ret = -EIO; - goto out; + goto out_free; } /* the extent lock is ordered inside the running @@ -5168,6 +5177,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, GFP_NOFS); btrfs_end_transaction(trans, BTRFS_I(inode)->root); +out_free: + btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index eff18f5..9f4db84 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1028,7 +1028,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, struct btrfs_file_extent_item); comp = btrfs_file_extent_compression(leaf, extent); type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG) { + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { disko = btrfs_file_extent_disk_bytenr(leaf, extent); diskl = btrfs_file_extent_disk_num_bytes(leaf, @@ -1051,7 +1052,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.objectid = inode->i_ino; new_key.offset = key.offset + destoff - off; - if (type == BTRFS_FILE_EXTENT_REG) { + if (type == BTRFS_FILE_EXTENT_REG || + type == BTRFS_FILE_EXTENT_PREALLOC) { ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); if (ret) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b23dc20..0083979 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1788,7 +1788,7 @@ static void merge_func(struct btrfs_work *work) btrfs_end_transaction(trans, root); } - btrfs_drop_dead_root(reloc_root); + btrfs_drop_snapshot(reloc_root, 0); if (atomic_dec_and_test(async->num_pending)) complete(async->done); @@ -2075,9 +2075,6 @@ static int do_relocation(struct btrfs_trans_handle *trans, ret = btrfs_drop_subtree(trans, root, eb, upper->eb); BUG_ON(ret); - - btrfs_tree_unlock(eb); - free_extent_buffer(eb); } if (!lowest) { btrfs_tree_unlock(upper->eb); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4e83457..2dbf1c1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -593,6 +593,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) return 0; } +#if 0 /* * when dropping snapshots, we generate a ton of delayed refs, and it makes * sense not to join the transaction while it is trying to flush the current @@ -681,6 +682,7 @@ int btrfs_drop_dead_root(struct btrfs_root *root) btrfs_btree_balance_dirty(tree_root, nr); return ret; } +#endif /* * new snapshots need to be created at a very specific time in the @@ -1081,7 +1083,7 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) while (!list_empty(&list)) { root = list_entry(list.next, struct btrfs_root, root_list); list_del_init(&root->root_list); - btrfs_drop_dead_root(root); + btrfs_drop_snapshot(root, 0); } return 0; } diff --git a/fs/compat.c b/fs/compat.c index cdd51a3..fbadb94 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1486,8 +1486,8 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) goto out_free; current->in_execve = 1; diff --git a/fs/eventfd.c b/fs/eventfd.c index 3f0e197..31d12de 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -14,35 +14,44 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/anon_inodes.h> -#include <linux/eventfd.h> #include <linux/syscalls.h> #include <linux/module.h> +#include <linux/kref.h> +#include <linux/eventfd.h> struct eventfd_ctx { + struct kref kref; wait_queue_head_t wqh; /* * Every time that a write(2) is performed on an eventfd, the * value of the __u64 being written is added to "count" and a * wakeup is performed on "wqh". A read(2) will return the "count" * value to userspace, and will reset "count" to zero. The kernel - * size eventfd_signal() also, adds to the "count" counter and + * side eventfd_signal() also, adds to the "count" counter and * issue a wakeup. */ __u64 count; unsigned int flags; }; -/* - * Adds "n" to the eventfd counter "count". Returns "n" in case of - * success, or a value lower then "n" in case of coutner overflow. - * This function is supposed to be called by the kernel in paths - * that do not allow sleeping. In this function we allow the counter - * to reach the ULLONG_MAX value, and we signal this as overflow - * condition by returining a POLLERR to poll(2). +/** + * eventfd_signal - Adds @n to the eventfd counter. + * @ctx: [in] Pointer to the eventfd context. + * @n: [in] Value of the counter to be added to the eventfd internal counter. + * The value cannot be negative. + * + * This function is supposed to be called by the kernel in paths that do not + * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX + * value, and we signal this as overflow condition by returining a POLLERR + * to poll(2). + * + * Returns @n in case of success, a non-negative number lower than @n in case + * of overflow, or the following error codes: + * + * -EINVAL : The value of @n is negative. */ -int eventfd_signal(struct file *file, int n) +int eventfd_signal(struct eventfd_ctx *ctx, int n) { - struct eventfd_ctx *ctx = file->private_data; unsigned long flags; if (n < 0) @@ -59,9 +68,45 @@ int eventfd_signal(struct file *file, int n) } EXPORT_SYMBOL_GPL(eventfd_signal); +static void eventfd_free(struct kref *kref) +{ + struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); + + kfree(ctx); +} + +/** + * eventfd_ctx_get - Acquires a reference to the internal eventfd context. + * @ctx: [in] Pointer to the eventfd context. + * + * Returns: In case of success, returns a pointer to the eventfd context. + */ +struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx) +{ + kref_get(&ctx->kref); + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_get); + +/** + * eventfd_ctx_put - Releases a reference to the internal eventfd context. + * @ctx: [in] Pointer to eventfd context. + * + * The eventfd context reference must have been previously acquired either + * with eventfd_ctx_get() or eventfd_ctx_fdget()). + */ +void eventfd_ctx_put(struct eventfd_ctx *ctx) +{ + kref_put(&ctx->kref, eventfd_free); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_put); + static int eventfd_release(struct inode *inode, struct file *file) { - kfree(file->private_data); + struct eventfd_ctx *ctx = file->private_data; + + wake_up_poll(&ctx->wqh, POLLHUP); + eventfd_ctx_put(ctx); return 0; } @@ -185,6 +230,16 @@ static const struct file_operations eventfd_fops = { .write = eventfd_write, }; +/** + * eventfd_fget - Acquire a reference of an eventfd file descriptor. + * @fd: [in] Eventfd file descriptor. + * + * Returns a pointer to the eventfd file structure in case of success, or the + * following error pointer: + * + * -EBADF : Invalid @fd file descriptor. + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ struct file *eventfd_fget(int fd) { struct file *file; @@ -201,6 +256,48 @@ struct file *eventfd_fget(int fd) } EXPORT_SYMBOL_GPL(eventfd_fget); +/** + * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. + * @fd: [in] Eventfd file descriptor. + * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointers returned by the following functions: + * + * eventfd_fget + */ +struct eventfd_ctx *eventfd_ctx_fdget(int fd) +{ + struct file *file; + struct eventfd_ctx *ctx; + + file = eventfd_fget(fd); + if (IS_ERR(file)) + return (struct eventfd_ctx *) file; + ctx = eventfd_ctx_get(file->private_data); + fput(file); + + return ctx; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); + +/** + * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. + * @file: [in] Eventfd file pointer. + * + * Returns a pointer to the internal eventfd context, otherwise the error + * pointer: + * + * -EINVAL : The @fd file descriptor is not an eventfd file. + */ +struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) +{ + if (file->f_op != &eventfd_fops) + return ERR_PTR(-EINVAL); + + return eventfd_ctx_get(file->private_data); +} +EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); + SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { int fd; @@ -217,6 +314,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) if (!ctx) return -ENOMEM; + kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; @@ -1277,8 +1277,8 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_guard_mutex); - if (retval < 0) + retval = -ERESTARTNOINTR; + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) goto out_free; current->in_execve = 1; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 6524eca..e1dedb0 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -66,8 +66,16 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, str inode = NULL; if (ino) { inode = ext2_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + if (unlikely(IS_ERR(inode))) { + if (PTR_ERR(inode) == -ESTALE) { + ext2_error(dir->i_sb, __func__, + "deleted inode referenced: %lu", + ino); + return ERR_PTR(-EIO); + } else { + return ERR_CAST(inode); + } + } } return d_splice_alias(inode, dentry); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8fed2ed..f58ecbc 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -849,6 +849,81 @@ err: return err; } +static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_inode_out outarg; + int err = -EINVAL; + + if (size != sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (!fc->sb) + goto err_unlock; + + err = fuse_reverse_inval_inode(fc->sb, outarg.ino, + outarg.off, outarg.len); + +err_unlock: + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + +static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_inval_entry_out outarg; + int err = -EINVAL; + char buf[FUSE_NAME_MAX+1]; + struct qstr name; + + if (size < sizeof(outarg)) + goto err; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto err; + + err = -ENAMETOOLONG; + if (outarg.namelen > FUSE_NAME_MAX) + goto err; + + name.name = buf; + name.len = outarg.namelen; + err = fuse_copy_one(cs, buf, outarg.namelen + 1); + if (err) + goto err; + fuse_copy_finish(cs); + buf[outarg.namelen] = 0; + name.hash = full_name_hash(name.name, name.len); + + down_read(&fc->killsb); + err = -ENOENT; + if (!fc->sb) + goto err_unlock; + + err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); + +err_unlock: + up_read(&fc->killsb); + return err; + +err: + fuse_copy_finish(cs); + return err; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -856,6 +931,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_POLL: return fuse_notify_poll(fc, size, cs); + case FUSE_NOTIFY_INVAL_INODE: + return fuse_notify_inval_inode(fc, size, cs); + + case FUSE_NOTIFY_INVAL_ENTRY: + return fuse_notify_inval_entry(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; @@ -910,7 +991,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { int err; - unsigned nbytes = iov_length(iov, nr_segs); + size_t nbytes = iov_length(iov, nr_segs); struct fuse_req *req; struct fuse_out_header oh; struct fuse_copy_state cs; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b3089a0..e703654 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -375,7 +375,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_req *req; struct fuse_req *forget_req; - struct fuse_open_in inarg; + struct fuse_create_in inarg; struct fuse_open_out outopen; struct fuse_entry_out outentry; struct fuse_file *ff; @@ -399,15 +399,20 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, if (!ff) goto out_put_request; + if (!fc->dont_mask) + mode &= ~current_umask(); + flags &= ~O_NOCTTY; memset(&inarg, 0, sizeof(inarg)); memset(&outentry, 0, sizeof(outentry)); inarg.flags = flags; inarg.mode = mode; + inarg.umask = current_umask(); req->in.h.opcode = FUSE_CREATE; req->in.h.nodeid = get_node_id(dir); req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : + sizeof(inarg); req->in.args[0].value = &inarg; req->in.args[1].size = entry->d_name.len + 1; req->in.args[1].value = entry->d_name.name; @@ -546,12 +551,17 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, if (IS_ERR(req)) return PTR_ERR(req); + if (!fc->dont_mask) + mode &= ~current_umask(); + memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); + inarg.umask = current_umask(); req->in.h.opcode = FUSE_MKNOD; req->in.numargs = 2; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : + sizeof(inarg); req->in.args[0].value = &inarg; req->in.args[1].size = entry->d_name.len + 1; req->in.args[1].value = entry->d_name.name; @@ -578,8 +588,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) if (IS_ERR(req)) return PTR_ERR(req); + if (!fc->dont_mask) + mode &= ~current_umask(); + memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; + inarg.umask = current_umask(); req->in.h.opcode = FUSE_MKDIR; req->in.numargs = 2; req->in.args[0].size = sizeof(inarg); @@ -845,6 +859,43 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat, return err; } +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + struct qstr *name) +{ + int err = -ENOTDIR; + struct inode *parent; + struct dentry *dir; + struct dentry *entry; + + parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + if (!parent) + return -ENOENT; + + mutex_lock(&parent->i_mutex); + if (!S_ISDIR(parent->i_mode)) + goto unlock; + + err = -ENOENT; + dir = d_find_alias(parent); + if (!dir) + goto unlock; + + entry = d_lookup(dir, name); + dput(dir); + if (!entry) + goto unlock; + + fuse_invalidate_attr(parent); + fuse_invalidate_entry(entry); + dput(entry); + err = 0; + + unlock: + mutex_unlock(&parent->i_mutex); + iput(parent); + return err; +} + /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the requester process. This diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fce6ce6..cbc4640 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1922,7 +1922,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) req = fuse_get_req(fc); if (IS_ERR(req)) - return PTR_ERR(req); + return POLLERR; req->in.h.opcode = FUSE_POLL; req->in.h.nodeid = ff->nodeid; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index aaf2f9f..52b641f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -446,6 +446,9 @@ struct fuse_conn { /** Do multi-page cached writes */ unsigned big_writes:1; + /** Don't apply umask to creation modes */ + unsigned dont_mask:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -481,6 +484,12 @@ struct fuse_conn { /** Called on final put */ void (*release)(struct fuse_conn *); + + /** Super block for this connection. */ + struct super_block *sb; + + /** Read/write semaphore to hold when accessing sb. */ + struct rw_semaphore killsb; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -509,6 +518,11 @@ extern const struct file_operations fuse_dev_operations; extern const struct dentry_operations fuse_dentry_operations; /** + * Inode to nodeid comparison. + */ +int fuse_inode_eq(struct inode *inode, void *_nodeidp); + +/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, @@ -708,6 +722,19 @@ void fuse_release_nowrite(struct inode *inode); u64 fuse_get_attr_version(struct fuse_conn *fc); +/** + * File-system tells the kernel to invalidate cache for the given node id. + */ +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len); + +/** + * File-system tells the kernel to invalidate parent attributes and + * the dentry matching parent/name. + */ +int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, + struct qstr *name); + int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, bool isdir); ssize_t fuse_direct_io(struct file *file, const char __user *buf, diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d8673cc..f91ccc4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -206,7 +206,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) BUG(); } -static int fuse_inode_eq(struct inode *inode, void *_nodeidp) +int fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) @@ -257,6 +257,31 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return inode; } +int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, + loff_t offset, loff_t len) +{ + struct inode *inode; + pgoff_t pg_start; + pgoff_t pg_end; + + inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + return -ENOENT; + + fuse_invalidate_attr(inode); + if (offset >= 0) { + pg_start = offset >> PAGE_CACHE_SHIFT; + if (len <= 0) + pg_end = -1; + else + pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT; + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + } + iput(inode); + return 0; +} + static void fuse_umount_begin(struct super_block *sb) { fuse_abort_conn(get_fuse_conn_super(sb)); @@ -480,6 +505,7 @@ void fuse_conn_init(struct fuse_conn *fc) memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); mutex_init(&fc->inst_mutex); + init_rwsem(&fc->killsb); atomic_set(&fc->count, 1); init_waitqueue_head(&fc->waitq); init_waitqueue_head(&fc->blocked_waitq); @@ -725,6 +751,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) } if (arg->flags & FUSE_BIG_WRITES) fc->big_writes = 1; + if (arg->flags & FUSE_DONT_MASK) + fc->dont_mask = 1; } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; @@ -748,7 +776,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->minor = FUSE_KERNEL_MINOR_VERSION; arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | - FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES; + FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -860,10 +888,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_conn_init(fc); fc->dev = sb->s_dev; + fc->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_put_conn; + /* Handle umasking inside the fuse code */ + if (sb->s_flags & MS_POSIXACL) + fc->dont_mask = 1; + sb->s_flags |= MS_POSIXACL; + fc->release = fuse_free_conn; fc->flags = d.flags; fc->user_id = d.user_id; @@ -941,12 +975,25 @@ static int fuse_get_sb(struct file_system_type *fs_type, return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); } +static void fuse_kill_sb_anon(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_anon_super(sb); +} + static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE, .get_sb = fuse_get_sb, - .kill_sb = kill_anon_super, + .kill_sb = fuse_kill_sb_anon, }; #ifdef CONFIG_BLOCK @@ -958,11 +1005,24 @@ static int fuse_get_sb_blk(struct file_system_type *fs_type, mnt); } +static void fuse_kill_sb_blk(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (fc) { + down_write(&fc->killsb); + fc->sb = NULL; + up_write(&fc->killsb); + } + + kill_block_super(sb); +} + static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", .get_sb = fuse_get_sb_blk, - .kill_sb = kill_block_super, + .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fe02ad4..032604e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -972,6 +972,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) sb->s_blocksize_bits = 10; sb->s_magic = HOSTFS_SUPER_MAGIC; sb->s_op = &hostfs_sbops; + sb->s_maxbytes = MAX_LFS_FILESIZE; /* NULL is printed as <NULL> by sprintf: avoid that. */ if (req_root == NULL) diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 7515e73..696686c 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -130,9 +130,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) if (jffs2_sum_active()) { s = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL); if (!s) { - kfree(flashbuf); JFFS2_WARNING("Can't allocate memory for summary\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 4145083..23341c1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -678,7 +678,6 @@ __be32 nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access, struct file **filp) { - const struct cred *cred = current_cred(); struct dentry *dentry; struct inode *inode; int flags = O_RDONLY|O_LARGEFILE; @@ -733,7 +732,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, vfs_dq_init(inode); } *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), - flags, cred); + flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); else diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff231ad..ff27a29 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -296,12 +296,15 @@ static int inotify_fasync(int fd, struct file *file, int on) static int inotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct user_struct *user = group->inotify_data.user; fsnotify_clear_marks_by_group(group); /* free this group, matching get was inotify_init->fsnotify_obtain_group */ fsnotify_put_group(group); + atomic_dec(&user->inotify_devs); + return 0; } @@ -112,8 +112,13 @@ restart: mutex_unlock(&mutex); } +/* + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. + */ SYSCALL_DEFINE0(sync) { + wakeup_pdflush(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) |