diff options
31 files changed, 5066 insertions, 2740 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 462859a..7ec1409 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -377,6 +377,7 @@ again: if (!list_empty(&worker->pending) || !list_empty(&worker->prio_pending)) { spin_unlock_irq(&worker->lock); + set_current_state(TASK_RUNNING); goto again; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 7a4dee1..6ad63f1 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -137,8 +137,8 @@ struct btrfs_inode { * of extent items we've reserved metadata for. */ spinlock_t accounting_lock; + atomic_t outstanding_extents; int reserved_extents; - int outstanding_extents; /* * ordered_data_close is set by truncate when a file that used @@ -151,6 +151,7 @@ struct btrfs_inode { * of these. */ unsigned ordered_data_close:1; + unsigned orphan_meta_reserved:1; unsigned dummy_inode:1; /* diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6795a71..0d1d966 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow) + struct extent_buffer *cow, + int *last_ref) { u64 refs; u64 owner; @@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, BUG_ON(ret); } clean_tree_block(trans, root, buf); + *last_ref = 1; } return 0; } @@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level; + int last_ref = 0; int unlock_orig = 0; u64 parent_start; @@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, (unsigned long)btrfs_header_fsid(cow), BTRFS_FSID_SIZE); - update_ref_for_cow(trans, root, buf, cow); + update_ref_for_cow(trans, root, buf, cow, &last_ref); + + if (root->ref_cows) + btrfs_reloc_cow_block(trans, root, buf, cow); if (buf == root->node) { WARN_ON(parent && parent != buf); @@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_tree_block(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, level); + btrfs_free_tree_block(trans, root, buf, parent_start, + last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, return bin_search(eb, key, level, slot); } +static void root_add_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) + size); + spin_unlock(&root->accounting_lock); +} + +static void root_sub_used(struct btrfs_root *root, u32 size) +{ + spin_lock(&root->accounting_lock); + btrfs_set_root_used(&root->root_item, + btrfs_root_used(&root->root_item) - size); + spin_unlock(&root->accounting_lock); +} + /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). * NULL is returned on error. @@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking(child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(child); + free_extent_buffer(child); + goto enospc; + } spin_lock(&root->node_lock); root->node = child; @@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level); + + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); /* once for the root ptr */ free_extent_buffer(mid); - return ret; + return 0; } if (btrfs_header_nritems(mid) > BTRFS_NODEPTRS_PER_BLOCK(root) / 4) @@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - u64 bytenr = right->start; - u32 blocksize = right->len; - clean_tree_block(trans, root, right); btrfs_tree_unlock(right); - free_extent_buffer(right); - right = NULL; wret = del_ptr(trans, root, path, level + 1, pslot + 1); if (wret) ret = wret; - wret = btrfs_free_tree_block(trans, root, - bytenr, blocksize, 0, - root->root_key.objectid, - level); - if (wret) - ret = wret; + root_sub_used(root, right->len); + btrfs_free_tree_block(trans, root, right, 0, 1); + free_extent_buffer(right); + right = NULL; } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); @@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - /* we've managed to empty the middle node, drop it */ - u64 bytenr = mid->start; - u32 blocksize = mid->len; - clean_tree_block(trans, root, mid); btrfs_tree_unlock(mid); - free_extent_buffer(mid); - mid = NULL; wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, level); - if (wret) - ret = wret; + root_sub_used(root, mid->len); + btrfs_free_tree_block(trans, root, mid, 0, 1); + free_extent_buffer(mid); + mid = NULL; } else { /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; @@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_release_path(NULL, p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, gen); + tmp = read_tree_block(root, blocknr, blocksize, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, @@ -1740,7 +1754,6 @@ again: p->nodes[level + 1], p->slots[level + 1], &b); if (err) { - free_extent_buffer(b); ret = err; goto done; } @@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (IS_ERR(c)) return PTR_ERR(c); + root_add_used(root, root->nodesize); + memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); @@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root int nritems; BUG_ON(!path->nodes[level]); + btrfs_assert_tree_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, if (IS_ERR(split)) return PTR_ERR(split); + root_add_used(root, root->nodesize); + memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); @@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); + else + clean_tree_block(trans, root, left); + btrfs_mark_buffer_dirty(right); btrfs_item_key(right, &disk_key, 0); @@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(left); if (right_nritems) btrfs_mark_buffer_dirty(right); + else + clean_tree_block(trans, root, right); btrfs_item_key(right, &disk_key, 0); wret = fixup_low_keys(trans, root, path, &disk_key, 1); @@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { path->slots[0] += old_left_nritems; - if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = left; @@ -2932,10 +2953,10 @@ again: right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, root->root_key.objectid, &disk_key, 0, l->start, 0); - if (IS_ERR(right)) { - BUG_ON(1); + if (IS_ERR(right)) return PTR_ERR(right); - } + + root_add_used(root, root->leafsize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(path); ret = split_leaf(trans, root, &key, path, ins_len, 1); - BUG_ON(ret); + if (ret) + goto err; path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); @@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0); - return ret; + root_sub_used(root, leaf->len); + + btrfs_free_tree_block(trans, root, leaf, 0, 1); + return 0; } /* * delete the item at the leaf level in path. If that empties @@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (leaf == root->node) { btrfs_set_header_level(leaf, 0); } else { + btrfs_set_path_blocking(path); + clean_tree_block(trans, root, leaf); ret = btrfs_del_leaf(trans, root, path, leaf); BUG_ON(ret); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 746a724..e9bf864 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ struct btrfs_trans_handle; struct btrfs_transaction; +struct btrfs_pending_snapshot; extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_transaction_cachep; extern struct kmem_cache *btrfs_bit_radix_cachep; @@ -663,6 +664,7 @@ struct btrfs_csum_item { #define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) #define BTRFS_BLOCK_GROUP_DUP (1 << 5) #define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) +#define BTRFS_NR_RAID_TYPES 5 struct btrfs_block_group_item { __le64 used; @@ -674,42 +676,46 @@ struct btrfs_space_info { u64 flags; u64 total_bytes; /* total bytes in the space */ - u64 bytes_used; /* total bytes used on disk */ + u64 bytes_used; /* total bytes used, + this does't take mirrors into account */ u64 bytes_pinned; /* total bytes pinned, will be freed when the transaction finishes */ u64 bytes_reserved; /* total bytes the allocator has reserved for current allocations */ u64 bytes_readonly; /* total bytes that are read only */ - u64 bytes_super; /* total bytes reserved for the super blocks */ - u64 bytes_root; /* the number of bytes needed to commit a - transaction */ + u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ - u64 bytes_delalloc; /* number of bytes currently reserved for - delayed allocation */ + u64 disk_used; /* total bytes used on disk */ int full; /* indicates that we cannot allocate any more chunks for this space */ int force_alloc; /* set if we need to force a chunk alloc for this space */ - int force_delalloc; /* make people start doing filemap_flush until - we're under a threshold */ struct list_head list; - /* for controlling how we free up space for allocations */ - wait_queue_head_t allocate_wait; - wait_queue_head_t flush_wait; - int allocating_chunk; - int flushing; - /* for block groups in our same type */ - struct list_head block_groups; + struct list_head block_groups[BTRFS_NR_RAID_TYPES]; spinlock_t lock; struct rw_semaphore groups_sem; atomic_t caching_threads; }; +struct btrfs_block_rsv { + u64 size; + u64 reserved; + u64 freed[2]; + struct btrfs_space_info *space_info; + struct list_head list; + spinlock_t lock; + atomic_t usage; + unsigned int priority:8; + unsigned int durable:1; + unsigned int refill_used:1; + unsigned int full:1; +}; + /* * free clusters are used to claim free space in relatively large chunks, * allowing us to do less seeky writes. They are used for all metadata @@ -760,6 +766,7 @@ struct btrfs_block_group_cache { spinlock_t lock; u64 pinned; u64 reserved; + u64 reserved_pinned; u64 bytes_super; u64 flags; u64 sectorsize; @@ -825,6 +832,22 @@ struct btrfs_fs_info { /* logical->physical extent mapping */ struct btrfs_mapping_tree mapping_tree; + /* block reservation for extent, checksum and root tree */ + struct btrfs_block_rsv global_block_rsv; + /* block reservation for delay allocation */ + struct btrfs_block_rsv delalloc_block_rsv; + /* block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + /* list of block reservations that cross multiple transactions */ + struct list_head durable_block_rsv_list; + + struct mutex durable_block_rsv_mutex; + u64 generation; u64 last_trans_committed; @@ -927,7 +950,6 @@ struct btrfs_fs_info { struct btrfs_workers endio_meta_write_workers; struct btrfs_workers endio_write_workers; struct btrfs_workers submit_workers; - struct btrfs_workers enospc_workers; /* * fixup workers take dirty pages that didn't properly go through * the cow mechanism and make them safe to write. It happens @@ -943,6 +965,7 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; + int enospc_unlink; u64 total_pinned; @@ -1012,6 +1035,9 @@ struct btrfs_root { struct completion kobj_unregister; struct mutex objectid_mutex; + spinlock_t accounting_lock; + struct btrfs_block_rsv *block_rsv; + struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; @@ -1043,7 +1069,6 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; - int clean_orphans; u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -1057,8 +1082,11 @@ struct btrfs_root { struct list_head root_list; - spinlock_t list_lock; + spinlock_t orphan_lock; struct list_head orphan_list; + struct btrfs_block_rsv *orphan_block_rsv; + int orphan_item_inserted; + int orphan_cleanup_state; spinlock_t inode_lock; /* red-black tree that keeps track of in-memory inodes */ @@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags); int btrfs_pin_extent(struct btrfs_root *root, u64 bytenr, u64 num, int reserved); int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, @@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group); - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items); -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes); -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries); +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode); +void btrfs_orphan_release_metadata(struct inode *inode); +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes); +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv); +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *rsv); +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries); +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor); +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes); +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, @@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 *index); +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 logical_offset, u32 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_writepages(struct address_space *mapping, @@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); void btrfs_run_delayed_iputs(struct btrfs_root *root); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_recover_relocation(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 902ce50..e807b14 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -319,107 +319,6 @@ out: } /* - * helper function to lookup reference count and flags of extent. - * - * the head node for delayed ref is used to store the sum of all the - * reference count modifications queued up in the rbtree. the head - * node may also store the extent flags to set. This way you can check - * to see what the reference count and extent flags would be if all of - * the delayed refs are not processed. - */ -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags) -{ - struct btrfs_delayed_ref_node *ref; - struct btrfs_delayed_ref_head *head; - struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct extent_buffer *leaf; - struct btrfs_key key; - u32 item_size; - u64 num_refs; - u64 extent_flags; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = bytenr; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = num_bytes; - delayed_refs = &trans->transaction->delayed_refs; -again: - ret = btrfs_search_slot(trans, root->fs_info->extent_root, - &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret == 0) { - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - if (item_size >= sizeof(*ei)) { - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - num_refs = btrfs_extent_refs(leaf, ei); - extent_flags = btrfs_extent_flags(leaf, ei); - } else { -#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - struct btrfs_extent_item_v0 *ei0; - BUG_ON(item_size != sizeof(*ei0)); - ei0 = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item_v0); - num_refs = btrfs_extent_refs_v0(leaf, ei0); - /* FIXME: this isn't correct for data */ - extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; -#else - BUG(); -#endif - } - BUG_ON(num_refs == 0); - } else { - num_refs = 0; - extent_flags = 0; - ret = 0; - } - - spin_lock(&delayed_refs->lock); - ref = find_ref_head(&delayed_refs->root, bytenr, NULL); - if (ref) { - head = btrfs_delayed_node_to_head(ref); - if (!mutex_trylock(&head->mutex)) { - atomic_inc(&ref->refs); - spin_unlock(&delayed_refs->lock); - - btrfs_release_path(root->fs_info->extent_root, path); - - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - btrfs_put_delayed_ref(ref); - goto again; - } - if (head->extent_op && head->extent_op->update_flags) - extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); - - num_refs += ref->ref_mod; - mutex_unlock(&head->mutex); - } - WARN_ON(num_refs == 0); - if (refs) - *refs = num_refs; - if (flags) - *flags = extent_flags; -out: - spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); - return ret; -} - -/* * helper function to update an extent delayed ref in the * rbtree. existing and update must both have the same * bytenr and parent diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index f6fc67d..50e3cf9 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head * btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, - u64 num_bytes, u64 *refs, u64 *flags); int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 orig_parent, u64 parent, u64 orig_ref_root, u64 ref_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index feca041..f3b287c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -74,6 +74,11 @@ struct async_submit_bio { int rw; int mirror_num; unsigned long bio_flags; + /* + * bio_offset is optional, can be used if the pages in the bio + * can't tell us where in the file the bio should go + */ + u64 bio_offset; struct btrfs_work work; }; @@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); fs_info = BTRFS_I(async->inode)->root->fs_info; async->submit_bio_start(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_done(struct btrfs_work *work) @@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); async->submit_bio_done(async->inode, async->rw, async->bio, - async->mirror_num, async->bio_flags); + async->mirror_num, async->bio_flags, + async->bio_offset); } static void run_one_async_free(struct btrfs_work *work) @@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work) int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, + u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done) { @@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->work.flags = 0; async->bio_flags = bio_flags; + async->bio_offset = bio_offset; atomic_inc(&fs_info->nr_async_submits); @@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio) static int __btree_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw, } static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { /* * when we're called for a write, we're already in the async @@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { int ret; @@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, 0, + bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); } @@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->ref_cows = 0; root->track_dirty = 0; root->in_radix = 0; - root->clean_orphans = 0; + root->orphan_item_inserted = 0; + root->orphan_cleanup_state = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->name = NULL; root->in_sysfs = 0; root->inode_tree = RB_ROOT; + root->block_rsv = NULL; + root->orphan_block_rsv = NULL; INIT_LIST_HEAD(&root->dirty_list); INIT_LIST_HEAD(&root->orphan_list); INIT_LIST_HEAD(&root->root_list); spin_lock_init(&root->node_lock); - spin_lock_init(&root->list_lock); + spin_lock_init(&root->orphan_lock); spin_lock_init(&root->inode_lock); + spin_lock_init(&root->accounting_lock); mutex_init(&root->objectid_mutex); mutex_init(&root->log_mutex); init_waitqueue_head(&root->log_writer_wait); @@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root, return 0; } -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct extent_buffer *eb; - struct btrfs_root *log_root_tree = fs_info->log_root_tree; - u64 start = 0; - u64 end = 0; - int ret; - - if (!log_root_tree) - return 0; - - while (1) { - ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); - if (ret) - break; - - clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); - } - eb = fs_info->log_root_tree->node; - - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) != 0); - - ret = btrfs_free_reserved_extent(fs_info->tree_root, - eb->start, eb->len); - BUG_ON(ret); - - free_extent_buffer(eb); - kfree(fs_info->log_root_tree); - fs_info->log_root_tree = NULL; - return 0; -} - static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { @@ -1191,19 +1172,23 @@ again: if (root) return root; - ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); - if (ret == 0) - ret = -ENOENT; - if (ret < 0) - return ERR_PTR(ret); - root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); if (IS_ERR(root)) return root; - WARN_ON(btrfs_root_refs(&root->root_item) == 0); set_anon_super(&root->anon_super, NULL); + if (btrfs_root_refs(&root->root_item) == 0) { + ret = -ENOENT; + goto fail; + } + + ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid); + if (ret < 0) + goto fail; + if (ret == 0) + root->orphan_item_inserted = 1; + ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (ret) goto fail; @@ -1212,10 +1197,9 @@ again: ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) { + if (ret == 0) root->in_radix = 1; - root->clean_orphans = 1; - } + spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); if (ret) { @@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg) struct btrfs_root *root = arg; do { - smp_mb(); - if (root->fs_info->closing) - break; - vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); if (!(root->fs_info->sb->s_flags & MS_RDONLY) && @@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg) if (freezing(current)) { refrigerator(); } else { - smp_mb(); - if (root->fs_info->closing) - break; set_current_state(TASK_INTERRUPTIBLE); - schedule(); + if (!kthread_should_stop()) + schedule(); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg) struct btrfs_root *root = arg; struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; + u64 transid; unsigned long now; unsigned long delay; int ret; do { - smp_mb(); - if (root->fs_info->closing) - break; - delay = HZ * 30; vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); mutex_lock(&root->fs_info->transaction_kthread_mutex); - mutex_lock(&root->fs_info->trans_mutex); + spin_lock(&root->fs_info->new_trans_lock); cur = root->fs_info->running_transaction; if (!cur) { - mutex_unlock(&root->fs_info->trans_mutex); + spin_unlock(&root->fs_info->new_trans_lock); goto sleep; } now = get_seconds(); - if (now < cur->start_time || now - cur->start_time < 30) { - mutex_unlock(&root->fs_info->trans_mutex); + if (!cur->blocked && + (now < cur->start_time || now - cur->start_time < 30)) { + spin_unlock(&root->fs_info->new_trans_lock); delay = HZ * 5; goto sleep; } - mutex_unlock(&root->fs_info->trans_mutex); - trans = btrfs_start_transaction(root, 1); - ret = btrfs_commit_transaction(trans, root); + transid = cur->transid; + spin_unlock(&root->fs_info->new_trans_lock); + trans = btrfs_join_transaction(root, 1); + if (transid == trans->transid) { + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } else { + btrfs_end_transaction(trans, root); + } sleep: wake_up_process(root->fs_info->cleaner_kthread); mutex_unlock(&root->fs_info->transaction_kthread_mutex); @@ -1530,10 +1512,10 @@ sleep: if (freezing(current)) { refrigerator(); } else { - if (root->fs_info->closing) - break; set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(delay); + if (!kthread_should_stop() && + !btrfs_transaction_blocked(root->fs_info)) + schedule_timeout(delay); __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); @@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); btrfs_mapping_init(&fs_info->mapping_tree); + btrfs_init_block_rsv(&fs_info->global_block_rsv); + btrfs_init_block_rsv(&fs_info->delalloc_block_rsv); + btrfs_init_block_rsv(&fs_info->trans_block_rsv); + btrfs_init_block_rsv(&fs_info->chunk_block_rsv); + btrfs_init_block_rsv(&fs_info->empty_block_rsv); + INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); + mutex_init(&fs_info->durable_block_rsv_mutex); atomic_set(&fs_info->nr_async_submits, 0); atomic_set(&fs_info->async_delalloc_pages, 0); atomic_set(&fs_info->async_submit_draining, 0); @@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size), &fs_info->generic_worker); - btrfs_init_workers(&fs_info->enospc_workers, "enospc", - fs_info->thread_pool_size, - &fs_info->generic_worker); /* a higher idle thresh on the submit workers makes it much more * likely that bios will be send down in a sane order to the @@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_start_workers(&fs_info->endio_meta_workers, 1); btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); btrfs_start_workers(&fs_info->endio_write_workers, 1); - btrfs_start_workers(&fs_info->enospc_workers, 1); fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, @@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb, csum_root->track_dirty = 1; + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + ret = btrfs_read_block_groups(extent_root); if (ret) { printk(KERN_ERR "Failed to read block groups: %d\n", ret); goto fail_block_groups; } - fs_info->generation = generation; - fs_info->last_trans_committed = generation; - fs_info->data_alloc_profile = (u64)-1; - fs_info->metadata_alloc_profile = (u64)-1; - fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) @@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb, BUG_ON(ret); if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_fs_roots(fs_info); + BUG_ON(ret); + ret = btrfs_recover_relocation(tree_root); if (ret < 0) { printk(KERN_WARNING @@ -2040,7 +2029,6 @@ fail_sb_buffer: btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); fail_iput: invalidate_inode_pages2(fs_info->btree_inode->i_mapping); iput(fs_info->btree_inode); @@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root) down_write(&root->fs_info->cleanup_work_sem); up_write(&root->fs_info->cleanup_work_sem); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); /* run commit again to drop the original snapshot */ - trans = btrfs_start_transaction(root, 1); + trans = btrfs_join_transaction(root, 1); btrfs_commit_transaction(trans, root); ret = btrfs_write_and_wait_transaction(NULL, root); BUG_ON(ret); @@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root) fs_info->closing = 1; smp_mb(); - kthread_stop(root->fs_info->transaction_kthread); - kthread_stop(root->fs_info->cleaner_kthread); - if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + fs_info->closing = 2; smp_mb(); @@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_meta_write_workers); btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); - btrfs_stop_workers(&fs_info->enospc_workers); btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index c958ecb..88e825a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags, + unsigned long bio_flags, u64 bio_offset, extent_submit_bio_hook_t *submit_bio_start, extent_submit_bio_hook_t *submit_bio_done); @@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); int btrfs_write_tree_block(struct extent_buffer *buf); int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_add_log_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c6a4f45..b9080d7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,10 +35,9 @@ static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free); -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve); + u64 bytenr, u64 num_bytes, int alloc); +static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean); static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key); static void dump_space_info(struct btrfs_space_info *info, u64 bytes, @@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache) void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { - if (atomic_dec_and_test(&cache->count)) + if (atomic_dec_and_test(&cache->count)) { + WARN_ON(cache->pinned > 0); + WARN_ON(cache->reserved > 0); + WARN_ON(cache->reserved_pinned > 0); kfree(cache); + } } /* @@ -319,7 +316,7 @@ static int caching_kthread(void *data) exclude_super_stripes(extent_root, block_group); spin_lock(&block_group->space_info->lock); - block_group->space_info->bytes_super += block_group->bytes_super; + block_group->space_info->bytes_readonly += block_group->bytes_super; spin_unlock(&block_group->space_info->lock); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, struct list_head *head = &info->space_info; struct btrfs_space_info *found; + flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA; + rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags == flags) { @@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) } /* + * helper function to lookup reference count and flags of extent. + * + * the head node for delayed ref is used to store the sum of all the + * reference count modifications queued up in the rbtree. the head + * node may also store the extent flags to set. This way you can check + * to see what the reference count and extent flags would be if all of + * the delayed refs are not processed. + */ +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u64 *refs, u64 *flags) +{ + struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_path *path; + struct btrfs_extent_item *ei; + struct extent_buffer *leaf; + struct btrfs_key key; + u32 item_size; + u64 num_refs; + u64 extent_flags; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = num_bytes; + if (!trans) { + path->skip_locking = 1; + path->search_commit_root = 1; + } +again: + ret = btrfs_search_slot(trans, root->fs_info->extent_root, + &key, path, 0, 0); + if (ret < 0) + goto out_free; + + if (ret == 0) { + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (item_size >= sizeof(*ei)) { + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item); + num_refs = btrfs_extent_refs(leaf, ei); + extent_flags = btrfs_extent_flags(leaf, ei); + } else { +#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 + struct btrfs_extent_item_v0 *ei0; + BUG_ON(item_size != sizeof(*ei0)); + ei0 = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_item_v0); + num_refs = btrfs_extent_refs_v0(leaf, ei0); + /* FIXME: this isn't correct for data */ + extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; +#else + BUG(); +#endif + } + BUG_ON(num_refs == 0); + } else { + num_refs = 0; + extent_flags = 0; + ret = 0; + } + + if (!trans) + goto out; + + delayed_refs = &trans->transaction->delayed_refs; + spin_lock(&delayed_refs->lock); + head = btrfs_find_delayed_ref_head(trans, bytenr); + if (head) { + if (!mutex_trylock(&head->mutex)) { + atomic_inc(&head->node.refs); + spin_unlock(&delayed_refs->lock); + + btrfs_release_path(root->fs_info->extent_root, path); + + mutex_lock(&head->mutex); + mutex_unlock(&head->mutex); + btrfs_put_delayed_ref(&head->node); + goto again; + } + if (head->extent_op && head->extent_op->update_flags) + extent_flags |= head->extent_op->flags_to_set; + else + BUG_ON(num_refs == 0); + + num_refs += head->node.ref_mod; + mutex_unlock(&head->mutex); + } + spin_unlock(&delayed_refs->lock); +out: + WARN_ON(num_refs == 0); + if (refs) + *refs = num_refs; + if (flags) + *flags = extent_flags; +out_free: + btrfs_free_path(path); + return ret; +} + +/* * Back reference rules. Back refs have three main goals: * * 1) differentiate between all holders of references to an extent so that @@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, return ret; } - /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, BUG_ON(extent_op); head = btrfs_delayed_node_to_head(node); if (insert_reserved) { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - - ret = pin_down_bytes(trans, root, NULL, - node->bytenr, node->num_bytes, - head->is_data, 1, &must_clean); - if (ret > 0) - mark_free = 1; - - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } + btrfs_pin_extent(root, node->bytenr, + node->num_bytes, 1); if (head->is_data) { ret = btrfs_del_csums(trans, root, node->bytenr, node->num_bytes); BUG_ON(ret); } - if (mark_free) { - ret = btrfs_free_reserved_extent(root, - node->bytenr, - node->num_bytes); - BUG_ON(ret); - } } mutex_unlock(&head->mutex); return 0; @@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, ret = 0; out: btrfs_free_path(path); + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + WARN_ON(ret > 0); return ret; } @@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; + int i; + int factor; + + if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; found = __find_space_info(info, flags); if (found) { spin_lock(&found->lock); found->total_bytes += total_bytes; found->bytes_used += bytes_used; + found->disk_used += bytes_used * factor; found->full = 0; spin_unlock(&found->lock); *space_info = found; @@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - INIT_LIST_HEAD(&found->block_groups); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + INIT_LIST_HEAD(&found->block_groups[i]); init_rwsem(&found->groups_sem); - init_waitqueue_head(&found->flush_wait); - init_waitqueue_head(&found->allocate_wait); spin_lock_init(&found->lock); - found->flags = flags; + found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | + BTRFS_BLOCK_GROUP_SYSTEM | + BTRFS_BLOCK_GROUP_METADATA); found->total_bytes = total_bytes; found->bytes_used = bytes_used; + found->disk_used = bytes_used * factor; found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; - found->bytes_delalloc = 0; + found->bytes_may_use = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } } -static void set_block_group_readonly(struct btrfs_block_group_cache *cache) -{ - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (!cache->ro) { - cache->space_info->bytes_readonly += cache->key.offset - - btrfs_block_group_used(&cache->item); - cache->ro = 1; - } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); -} - u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) { u64 num_devices = root->fs_info->fs_devices->rw_devices; @@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } -static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) -{ - struct btrfs_fs_info *info = root->fs_info; - u64 alloc_profile; - - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } - - return btrfs_reduce_alloc_profile(root, data); -} - -void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) -{ - u64 alloc_target; - - alloc_target = btrfs_get_alloc_profile(root, 1); - BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, - alloc_target); -} - -static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) -{ - u64 num_bytes; - int level; - - level = BTRFS_MAX_LEVEL - 2; - /* - * NOTE: these calculations are absolutely the worst possible case. - * This assumes that _every_ item we insert will require a new leaf, and - * that the tree has grown to its maximum level size. - */ - - /* - * for every item we insert we could insert both an extent item and a - * extent ref item. Then for ever item we insert, we will need to cow - * both the original leaf, plus the leaf to the left and right of it. - * - * Unless we are talking about the extent root, then we just want the - * number of items * 2, since we just need the extent item plus its ref. - */ - if (root == root->fs_info->extent_root) - num_bytes = num_items * 2; - else - num_bytes = (num_items + (2 * num_items)) * 3; - - /* - * num_bytes is total number of leaves we could need times the leaf - * size, and then for every leaf we could end up cow'ing 2 nodes per - * level, down to the leaf level. - */ - num_bytes = (num_bytes * root->leafsize) + - (num_bytes * (level * 2)) * root->nodesize; - - return num_bytes; -} - -/* - * Unreserve metadata space for delalloc. If we have less reserved credits than - * we have extents, this function does nothing. - */ -int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); - - spin_lock(&meta_sinfo->lock); - spin_lock(&BTRFS_I(inode)->accounting_lock); - if (BTRFS_I(inode)->reserved_extents <= - BTRFS_I(inode)->outstanding_extents) { - spin_unlock(&BTRFS_I(inode)->accounting_lock); - spin_unlock(&meta_sinfo->lock); - return 0; - } - spin_unlock(&BTRFS_I(inode)->accounting_lock); - - BTRFS_I(inode)->reserved_extents -= num_items; - BUG_ON(BTRFS_I(inode)->reserved_extents < 0); - - if (meta_sinfo->bytes_delalloc < num_bytes) { - bug = true; - meta_sinfo->bytes_delalloc = 0; - } else { - meta_sinfo->bytes_delalloc -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) +static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) { - u64 thresh; - - thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use; - - thresh = meta_sinfo->total_bytes - thresh; - thresh *= 80; - do_div(thresh, 100); - if (thresh <= meta_sinfo->bytes_delalloc) - meta_sinfo->force_delalloc = 1; - else - meta_sinfo->force_delalloc = 0; + if (flags & BTRFS_BLOCK_GROUP_DATA) + flags |= root->fs_info->avail_data_alloc_bits & + root->fs_info->data_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + flags |= root->fs_info->avail_system_alloc_bits & + root->fs_info->system_alloc_profile; + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + flags |= root->fs_info->avail_metadata_alloc_bits & + root->fs_info->metadata_alloc_profile; + return btrfs_reduce_alloc_profile(root, flags); } -struct async_flush { - struct btrfs_root *root; - struct btrfs_space_info *info; - struct btrfs_work work; -}; - -static noinline void flush_delalloc_async(struct btrfs_work *work) +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) { - struct async_flush *async; - struct btrfs_root *root; - struct btrfs_space_info *info; - - async = container_of(work, struct async_flush, work); - root = async->root; - info = async->info; - - btrfs_start_delalloc_inodes(root, 0); - wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); - - kfree(async); -} - -static void wait_on_flush(struct btrfs_space_info *info) -{ - DEFINE_WAIT(wait); - u64 used; - - while (1) { - prepare_to_wait(&info->flush_wait, &wait, - TASK_UNINTERRUPTIBLE); - spin_lock(&info->lock); - if (!info->flushing) { - spin_unlock(&info->lock); - break; - } - - used = info->bytes_used + info->bytes_reserved + - info->bytes_pinned + info->bytes_readonly + - info->bytes_super + info->bytes_root + - info->bytes_may_use + info->bytes_delalloc; - if (used < info->total_bytes) { - spin_unlock(&info->lock); - break; - } - spin_unlock(&info->lock); - schedule(); - } - finish_wait(&info->flush_wait, &wait); -} - -static void flush_delalloc(struct btrfs_root *root, - struct btrfs_space_info *info) -{ - struct async_flush *async; - bool wait = false; - - spin_lock(&info->lock); + u64 flags; - if (!info->flushing) - info->flushing = 1; + if (data) + flags = BTRFS_BLOCK_GROUP_DATA; + else if (root == root->fs_info->chunk_root) + flags = BTRFS_BLOCK_GROUP_SYSTEM; else - wait = true; - - spin_unlock(&info->lock); - - if (wait) { - wait_on_flush(info); - return; - } - - async = kzalloc(sizeof(*async), GFP_NOFS); - if (!async) - goto flush; - - async->root = root; - async->info = info; - async->work.func = flush_delalloc_async; + flags = BTRFS_BLOCK_GROUP_METADATA; - btrfs_queue_worker(&root->fs_info->enospc_workers, - &async->work); - wait_on_flush(info); - return; - -flush: - btrfs_start_delalloc_inodes(root, 0); - btrfs_wait_ordered_extents(root, 0, 0); - - spin_lock(&info->lock); - info->flushing = 0; - spin_unlock(&info->lock); - wake_up(&info->flush_wait); + return get_alloc_profile(root, flags); } -static int maybe_allocate_chunk(struct btrfs_root *root, - struct btrfs_space_info *info) -{ - struct btrfs_super_block *disk_super = &root->fs_info->super_copy; - struct btrfs_trans_handle *trans; - bool wait = false; - int ret = 0; - u64 min_metadata; - u64 free_space; - - free_space = btrfs_super_total_bytes(disk_super); - /* - * we allow the metadata to grow to a max of either 10gb or 5% of the - * space in the volume. - */ - min_metadata = min((u64)10 * 1024 * 1024 * 1024, - div64_u64(free_space * 5, 100)); - if (info->total_bytes >= min_metadata) { - spin_unlock(&info->lock); - return 0; - } - - if (info->full) { - spin_unlock(&info->lock); - return 0; - } - - if (!info->allocating_chunk) { - info->force_alloc = 1; - info->allocating_chunk = 1; - } else { - wait = true; - } - - spin_unlock(&info->lock); - - if (wait) { - wait_event(info->allocate_wait, - !info->allocating_chunk); - return 1; - } - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } - - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 4096 + 2 * 1024 * 1024, - info->flags, 0); - btrfs_end_transaction(trans, root); - if (ret) - goto out; -out: - spin_lock(&info->lock); - info->allocating_chunk = 0; - spin_unlock(&info->lock); - wake_up(&info->allocate_wait); - - if (ret) - return 0; - return 1; -} - -/* - * Reserve metadata space for delalloc. - */ -int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, - struct inode *inode, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int flushed = 0; - int force_delalloc; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root->fs_info->extent_root, - num_items); -again: - spin_lock(&meta_sinfo->lock); - - force_delalloc = meta_sinfo->force_delalloc; - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!flushed) - meta_sinfo->bytes_delalloc += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - flushed++; - - if (flushed == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - flushed++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (flushed == 2) { - filemap_flush(inode->i_mapping); - goto again; - } else if (flushed == 3) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_delalloc -= num_bytes; - spin_unlock(&meta_sinfo->lock); - printk(KERN_ERR "enospc, has %d, reserved %d\n", - BTRFS_I(inode)->outstanding_extents, - BTRFS_I(inode)->reserved_extents); - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } - - BTRFS_I(inode)->reserved_extents += num_items; - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - if (!flushed && force_delalloc) - filemap_flush(inode->i_mapping); - - return 0; -} - -/* - * unreserve num_items number of items worth of metadata space. This needs to - * be paired with btrfs_reserve_metadata_space. - * - * NOTE: if you have the option, run this _AFTER_ you do a - * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref - * oprations which will result in more used metadata, so we want to make sure we - * can do that without issue. - */ -int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 alloc_target; - bool bug = false; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root, num_items); - - spin_lock(&meta_sinfo->lock); - if (meta_sinfo->bytes_may_use < num_bytes) { - bug = true; - meta_sinfo->bytes_may_use = 0; - } else { - meta_sinfo->bytes_may_use -= num_bytes; - } - spin_unlock(&meta_sinfo->lock); - - BUG_ON(bug); - - return 0; -} - -/* - * Reserve some metadata space for use. We'll calculate the worste case number - * of bytes that would be needed to modify num_items number of items. If we - * have space, fantastic, if not, you get -ENOSPC. Please call - * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of - * items you reserved, since whatever metadata you needed should have already - * been allocated. - * - * This will commit the transaction to make more space if we don't have enough - * metadata space. THe only time we don't do this is if we're reserving space - * inside of a transaction, then we will just return -ENOSPC and it is the - * callers responsibility to handle it properly. - */ -int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) { - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_space_info *meta_sinfo; - u64 num_bytes; - u64 used; - u64 alloc_target; - int retries = 0; - - /* get the space info for where the metadata will live */ - alloc_target = btrfs_get_alloc_profile(root, 0); - meta_sinfo = __find_space_info(info, alloc_target); - - num_bytes = calculate_bytes_needed(root, num_items); -again: - spin_lock(&meta_sinfo->lock); - - if (unlikely(!meta_sinfo->bytes_root)) - meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); - - if (!retries) - meta_sinfo->bytes_may_use += num_bytes; - - used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + - meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + - meta_sinfo->bytes_super + meta_sinfo->bytes_root + - meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; - - if (used > meta_sinfo->total_bytes) { - retries++; - if (retries == 1) { - if (maybe_allocate_chunk(root, meta_sinfo)) - goto again; - retries++; - } else { - spin_unlock(&meta_sinfo->lock); - } - - if (retries == 2) { - flush_delalloc(root, meta_sinfo); - goto again; - } - spin_lock(&meta_sinfo->lock); - meta_sinfo->bytes_may_use -= num_bytes; - spin_unlock(&meta_sinfo->lock); - - dump_space_info(meta_sinfo, 0, 0); - return -ENOSPC; - } - - check_force_delalloc(meta_sinfo); - spin_unlock(&meta_sinfo->lock); - - return 0; + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + BTRFS_BLOCK_GROUP_DATA); } /* * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; + struct btrfs_root *root = BTRFS_I(inode)->root; u64 used; - int ret = 0, committed = 0, flushed = 0; + int ret = 0, committed = 0; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); @@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); - used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + - data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + - data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + - data_sinfo->bytes_super; + used = data_sinfo->bytes_used + data_sinfo->bytes_reserved + + data_sinfo->bytes_pinned + data_sinfo->bytes_readonly + + data_sinfo->bytes_may_use; if (used + bytes > data_sinfo->total_bytes) { struct btrfs_trans_handle *trans; - if (!flushed) { - spin_unlock(&data_sinfo->lock); - flush_delalloc(root, data_sinfo); - flushed = 1; - goto again; - } - /* * if we don't have enough free bytes in this space then we need * to alloc a new chunk. @@ -3274,15 +2913,15 @@ again: spin_unlock(&data_sinfo->lock); alloc: alloc_target = btrfs_get_alloc_profile(root, 1); - trans = btrfs_start_transaction(root, 1); - if (!trans) - return -ENOMEM; + trans = btrfs_join_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = do_chunk_alloc(trans, root->fs_info->extent_root, bytes + 2 * 1024 * 1024, alloc_target, 0); btrfs_end_transaction(trans, root); - if (ret) + if (ret < 0) return ret; if (!data_sinfo) { @@ -3297,25 +2936,26 @@ alloc: if (!committed && !root->fs_info->open_ioctl_trans) { committed = 1; trans = btrfs_join_transaction(root, 1); - if (!trans) - return -ENOMEM; + if (IS_ERR(trans)) + return PTR_ERR(trans); ret = btrfs_commit_transaction(trans, root); if (ret) return ret; goto again; } - printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" - ", %llu bytes_used, %llu bytes_reserved, " - "%llu bytes_pinned, %llu bytes_readonly, %llu may use " - "%llu total\n", (unsigned long long)bytes, - (unsigned long long)data_sinfo->bytes_delalloc, +#if 0 /* I hope we never need this code again, just in case */ + printk(KERN_ERR "no space left, need %llu, %llu bytes_used, " + "%llu bytes_reserved, " "%llu bytes_pinned, " + "%llu bytes_readonly, %llu may use %llu total\n", + (unsigned long long)bytes, (unsigned long long)data_sinfo->bytes_used, (unsigned long long)data_sinfo->bytes_reserved, (unsigned long long)data_sinfo->bytes_pinned, (unsigned long long)data_sinfo->bytes_readonly, (unsigned long long)data_sinfo->bytes_may_use, (unsigned long long)data_sinfo->total_bytes); +#endif return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -3326,12 +2966,13 @@ alloc: } /* - * if there was an error for whatever reason after calling - * btrfs_check_data_free_space, call this so we can cleanup the counters. + * called when we are clearing an delalloc extent from the + * inode's io_tree or there was an error for whatever reason + * after calling btrfs_check_data_free_space */ -void btrfs_free_reserved_data_space(struct btrfs_root *root, - struct inode *inode, u64 bytes) +void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_space_info *data_sinfo; /* make sure bytes are sectorsize aligned */ @@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root, spin_unlock(&data_sinfo->lock); } -/* called when we are adding a delalloc extent to the inode's io_tree */ -void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *data_sinfo; - - /* get the space info for where this inode will be storing its data */ - data_sinfo = BTRFS_I(inode)->space_info; - - /* make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - data_sinfo->bytes_delalloc += bytes; - - /* - * we are adding a delalloc extent without calling - * btrfs_check_data_free_space first. This happens on a weird - * writepage condition, but shouldn't hurt our accounting - */ - if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { - data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; - BTRFS_I(inode)->reserved_bytes = 0; - } else { - data_sinfo->bytes_may_use -= bytes; - BTRFS_I(inode)->reserved_bytes -= bytes; - } - - spin_unlock(&data_sinfo->lock); -} - -/* called when we are clearing an delalloc extent from the inode's io_tree */ -void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, - u64 bytes) -{ - struct btrfs_space_info *info; - - info = BTRFS_I(inode)->space_info; - - spin_lock(&info->lock); - info->bytes_delalloc -= bytes; - spin_unlock(&info->lock); -} - static void force_metadata_allocation(struct btrfs_fs_info *info) { struct list_head *head = &info->space_info; @@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) rcu_read_unlock(); } +static int should_alloc_chunk(struct btrfs_space_info *sinfo, + u64 alloc_bytes) +{ + u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; + + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes + 256 * 1024 * 1024 < num_bytes) + return 0; + + if (sinfo->bytes_used + sinfo->bytes_reserved + + alloc_bytes < div_factor(num_bytes, 8)) + return 0; + + return 1; +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; struct btrfs_fs_info *fs_info = extent_root->fs_info; - u64 thresh; int ret = 0; mutex_lock(&fs_info->chunk_mutex); @@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - thresh = space_info->total_bytes - space_info->bytes_readonly; - thresh = div_factor(thresh, 8); - if (!force && - (space_info->bytes_used + space_info->bytes_pinned + - space_info->bytes_reserved + alloc_bytes) < thresh) { + if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { spin_unlock(&space_info->lock); goto out; } @@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, spin_lock(&space_info->lock); if (ret) space_info->full = 1; + else + ret = 1; space_info->force_alloc = 0; spin_unlock(&space_info->lock); out: @@ -3461,13 +3073,713 @@ out: return ret; } +static int maybe_allocate_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_space_info *sinfo, u64 num_bytes) +{ + int ret; + int end_trans = 0; + + if (sinfo->full) + return 0; + + spin_lock(&sinfo->lock); + ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024); + spin_unlock(&sinfo->lock); + if (!ret) + return 0; + + if (!trans) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + end_trans = 1; + } + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, + get_alloc_profile(root, sinfo->flags), 0); + + if (end_trans) + btrfs_end_transaction(trans, root); + + return ret == 1 ? 1 : 0; +} + +/* + * shrink metadata reservation for delalloc + */ +static int shrink_delalloc(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 to_reclaim) +{ + struct btrfs_block_rsv *block_rsv; + u64 reserved; + u64 max_reclaim; + u64 reclaimed = 0; + int pause = 1; + int ret; + + block_rsv = &root->fs_info->delalloc_block_rsv; + spin_lock(&block_rsv->lock); + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0) + return 0; + + max_reclaim = min(reserved, to_reclaim); + + while (1) { + ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); + if (!ret) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(pause); + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + } else { + pause = 1; + } + + spin_lock(&block_rsv->lock); + if (reserved > block_rsv->reserved) + reclaimed = reserved - block_rsv->reserved; + reserved = block_rsv->reserved; + spin_unlock(&block_rsv->lock); + + if (reserved == 0 || reclaimed >= max_reclaim) + break; + + if (trans && trans->transaction->blocked) + return -EAGAIN; + } + return reclaimed >= to_reclaim; +} + +static int should_retry_reserve(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + int ret; + + if ((*retries) > 2) + return -ENOSPC; + + ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); + if (ret) + return 1; + + if (trans && trans->transaction->in_commit) + return -ENOSPC; + + ret = shrink_delalloc(trans, root, num_bytes); + if (ret) + return ret; + + spin_lock(&space_info->lock); + if (space_info->bytes_pinned < num_bytes) + ret = 1; + spin_unlock(&space_info->lock); + if (ret) + return -ENOSPC; + + (*retries)++; + + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + + return 1; +} + +static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + u64 unused; + int ret = -ENOSPC; + + spin_lock(&space_info->lock); + unused = space_info->bytes_used + space_info->bytes_reserved + + space_info->bytes_pinned + space_info->bytes_readonly; + + if (unused < space_info->total_bytes) + unused = space_info->total_bytes - unused; + else + unused = 0; + + if (unused >= num_bytes) { + if (block_rsv->priority >= 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } else { + if ((unused + block_rsv->reserved) * + block_rsv->priority >= + (num_bytes + block_rsv->reserved) * 10) { + space_info->bytes_reserved += num_bytes; + ret = 0; + } + } + } + spin_unlock(&space_info->lock); + + return ret; +} + +static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + if (root->ref_cows) + block_rsv = trans->block_rsv; + else + block_rsv = root->block_rsv; + + if (!block_rsv) + block_rsv = &root->fs_info->empty_block_rsv; + + return block_rsv; +} + +static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + int ret = -ENOSPC; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved >= num_bytes) { + block_rsv->reserved -= num_bytes; + if (block_rsv->reserved < block_rsv->size) + block_rsv->full = 0; + ret = 0; + } + spin_unlock(&block_rsv->lock); + return ret; +} + +static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int update_size) +{ + spin_lock(&block_rsv->lock); + block_rsv->reserved += num_bytes; + if (update_size) + block_rsv->size += num_bytes; + else if (block_rsv->reserved >= block_rsv->size) + block_rsv->full = 1; + spin_unlock(&block_rsv->lock); +} + +void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, + struct btrfs_block_rsv *dest, u64 num_bytes) +{ + struct btrfs_space_info *space_info = block_rsv->space_info; + + spin_lock(&block_rsv->lock); + if (num_bytes == (u64)-1) + num_bytes = block_rsv->size; + block_rsv->size -= num_bytes; + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } else { + num_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (num_bytes > 0) { + if (dest) { + block_rsv_add_bytes(dest, num_bytes, 0); + } else { + spin_lock(&space_info->lock); + space_info->bytes_reserved -= num_bytes; + spin_unlock(&space_info->lock); + } + } +} + +static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src, + struct btrfs_block_rsv *dst, u64 num_bytes) +{ + int ret; + + ret = block_rsv_use_bytes(src, num_bytes); + if (ret) + return ret; + + block_rsv_add_bytes(dst, num_bytes, 1); + return 0; +} + +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv) +{ + memset(rsv, 0, sizeof(*rsv)); + spin_lock_init(&rsv->lock); + atomic_set(&rsv->usage, 1); + rsv->priority = 6; + INIT_LIST_HEAD(&rsv->list); +} + +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_fs_info *fs_info = root->fs_info; + u64 alloc_target; + + block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); + if (!block_rsv) + return NULL; + + btrfs_init_block_rsv(block_rsv); + + alloc_target = btrfs_get_alloc_profile(root, 0); + block_rsv->space_info = __find_space_info(fs_info, + BTRFS_BLOCK_GROUP_METADATA); + + return block_rsv; +} + +void btrfs_free_block_rsv(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) +{ + if (rsv && atomic_dec_and_test(&rsv->usage)) { + btrfs_block_rsv_release(root, rsv, (u64)-1); + if (!rsv->durable) + kfree(rsv); + } +} + +/* + * make the block_rsv struct be able to capture freed space. + * the captured space will re-add to the the block_rsv struct + * after transaction commit + */ +void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv) +{ + block_rsv->durable = 1; + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); + mutex_unlock(&fs_info->durable_block_rsv_mutex); +} + +int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes, int *retries) +{ + int ret; + + if (num_bytes == 0) + return 0; +again: + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 1); + return 0; + } + + ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries); + if (ret > 0) + goto again; + + return ret; +} + +int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 min_reserved, int min_factor) +{ + u64 num_bytes = 0; + int commit_trans = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); + if (min_factor > 0) + num_bytes = div_factor(block_rsv->size, min_factor); + if (min_reserved > num_bytes) + num_bytes = min_reserved; + + if (block_rsv->reserved >= num_bytes) { + ret = 0; + } else { + num_bytes -= block_rsv->reserved; + if (block_rsv->durable && + block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes) + commit_trans = 1; + } + spin_unlock(&block_rsv->lock); + if (!ret) + return 0; + + if (block_rsv->refill_used) { + ret = reserve_metadata_bytes(block_rsv, num_bytes); + if (!ret) { + block_rsv_add_bytes(block_rsv, num_bytes, 0); + return 0; + } + } + + if (commit_trans) { + if (trans) + return -EAGAIN; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + ret = btrfs_commit_transaction(trans, root); + return 0; + } + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return -ENOSPC; +} + +int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, + u64 num_bytes) +{ + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_block_rsv_release(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, + u64 num_bytes) +{ + struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + if (global_rsv->full || global_rsv == block_rsv || + block_rsv->space_info != global_rsv->space_info) + global_rsv = NULL; + block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); +} + +/* + * helper to calculate size of global block reservation. + * the desired value is sum of space used by extent tree, + * checksum tree and root tree + */ +static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *sinfo; + u64 num_bytes; + u64 meta_used; + u64 data_used; + int csum_size = btrfs_super_csum_size(&fs_info->super_copy); +#if 0 + /* + * per tree used space accounting can be inaccuracy, so we + * can't rely on it. + */ + spin_lock(&fs_info->extent_root->accounting_lock); + num_bytes = btrfs_root_used(&fs_info->extent_root->root_item); + spin_unlock(&fs_info->extent_root->accounting_lock); + + spin_lock(&fs_info->csum_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->csum_root->root_item); + spin_unlock(&fs_info->csum_root->accounting_lock); + + spin_lock(&fs_info->tree_root->accounting_lock); + num_bytes += btrfs_root_used(&fs_info->tree_root->root_item); + spin_unlock(&fs_info->tree_root->accounting_lock); +#endif + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + spin_lock(&sinfo->lock); + data_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + spin_lock(&sinfo->lock); + meta_used = sinfo->bytes_used; + spin_unlock(&sinfo->lock); + + num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * + csum_size * 2; + num_bytes += div64_u64(data_used + meta_used, 50); + + if (num_bytes * 3 > meta_used) + num_bytes = div64_u64(meta_used, 3); + + return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); +} + +static void update_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; + struct btrfs_space_info *sinfo = block_rsv->space_info; + u64 num_bytes; + + num_bytes = calc_global_metadata_size(fs_info); + + spin_lock(&block_rsv->lock); + spin_lock(&sinfo->lock); + + block_rsv->size = num_bytes; + + num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + + sinfo->bytes_reserved + sinfo->bytes_readonly; + + if (sinfo->total_bytes > num_bytes) { + num_bytes = sinfo->total_bytes - num_bytes; + block_rsv->reserved += num_bytes; + sinfo->bytes_reserved += num_bytes; + } + + if (block_rsv->reserved >= block_rsv->size) { + num_bytes = block_rsv->reserved - block_rsv->size; + sinfo->bytes_reserved -= num_bytes; + block_rsv->reserved = block_rsv->size; + block_rsv->full = 1; + } +#if 0 + printk(KERN_INFO"global block rsv size %llu reserved %llu\n", + block_rsv->size, block_rsv->reserved); +#endif + spin_unlock(&sinfo->lock); + spin_unlock(&block_rsv->lock); +} + +static void init_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + struct btrfs_space_info *space_info; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); + fs_info->chunk_block_rsv.space_info = space_info; + fs_info->chunk_block_rsv.priority = 10; + + space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + fs_info->global_block_rsv.space_info = space_info; + fs_info->global_block_rsv.priority = 10; + fs_info->global_block_rsv.refill_used = 1; + fs_info->delalloc_block_rsv.space_info = space_info; + fs_info->trans_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.space_info = space_info; + fs_info->empty_block_rsv.priority = 10; + + fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; + fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; + fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; + fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; + fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; + + btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv); + + btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv); + + update_global_block_rsv(fs_info); +} + +static void release_global_block_rsv(struct btrfs_fs_info *fs_info) +{ + block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); + WARN_ON(fs_info->delalloc_block_rsv.size > 0); + WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); + WARN_ON(fs_info->trans_block_rsv.size > 0); + WARN_ON(fs_info->trans_block_rsv.reserved > 0); + WARN_ON(fs_info->chunk_block_rsv.size > 0); + WARN_ON(fs_info->chunk_block_rsv.reserved > 0); +} + +static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) +{ + return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + 3 * num_items; +} + +int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + int num_items, int *retries) +{ + u64 num_bytes; + int ret; + + if (num_items == 0 || root->fs_info->chunk_root == root) + return 0; + + num_bytes = calc_trans_metadata_size(root, num_items); + ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, + num_bytes, retries); + if (!ret) { + trans->bytes_reserved += num_bytes; + trans->block_rsv = &root->fs_info->trans_block_rsv; + } + return ret; +} + +void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans->bytes_reserved) + return; + + BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); + btrfs_block_rsv_release(root, trans->block_rsv, + trans->bytes_reserved); + trans->bytes_reserved = 0; +} + +int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; + + /* + * one for deleting orphan item, one for updating inode and + * two for calling btrfs_truncate_inode_items. + * + * btrfs_truncate_inode_items is a delete operation, it frees + * more space than it uses in most cases. So two units of + * metadata space should be enough for calling it many times. + * If all of the metadata space is used, we can commit + * transaction and use space it freed. + */ + u64 num_bytes = calc_trans_metadata_size(root, 4); + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +void btrfs_orphan_release_metadata(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 num_bytes = calc_trans_metadata_size(root, 4); + btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); +} + +int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); + struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; + /* + * two for root back/forward refs, two for directory entries + * and one for root of the snapshot. + */ + u64 num_bytes = calc_trans_metadata_size(root, 5); + dst_rsv->space_info = src_rsv->space_info; + return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); +} + +static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) +{ + return num_bytes >>= 3; +} + +int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + u64 to_reserve; + int nr_extents; + int retries = 0; + int ret; + + if (btrfs_transaction_in_commit(root->fs_info)) + schedule_timeout(1); + + num_bytes = ALIGN(num_bytes, root->sectorsize); +again: + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; + if (nr_extents > BTRFS_I(inode)->reserved_extents) { + nr_extents -= BTRFS_I(inode)->reserved_extents; + to_reserve = calc_trans_metadata_size(root, nr_extents); + } else { + nr_extents = 0; + to_reserve = 0; + } + + to_reserve += calc_csum_metadata_size(inode, num_bytes); + ret = reserve_metadata_bytes(block_rsv, to_reserve); + if (ret) { + spin_unlock(&BTRFS_I(inode)->accounting_lock); + ret = should_retry_reserve(NULL, root, block_rsv, to_reserve, + &retries); + if (ret > 0) + goto again; + return ret; + } + + BTRFS_I(inode)->reserved_extents += nr_extents; + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + block_rsv_add_bytes(block_rsv, to_reserve, 1); + + if (block_rsv->size > 512 * 1024 * 1024) + shrink_delalloc(NULL, root, to_reserve); + + return 0; +} + +void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 to_free; + int nr_extents; + + num_bytes = ALIGN(num_bytes, root->sectorsize); + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + spin_lock(&BTRFS_I(inode)->accounting_lock); + nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); + if (nr_extents < BTRFS_I(inode)->reserved_extents) { + nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; + BTRFS_I(inode)->reserved_extents -= nr_extents; + } else { + nr_extents = 0; + } + spin_unlock(&BTRFS_I(inode)->accounting_lock); + + to_free = calc_csum_metadata_size(inode, num_bytes); + if (nr_extents > 0) + to_free += calc_trans_metadata_size(root, nr_extents); + + btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, + to_free); +} + +int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) +{ + int ret; + + ret = btrfs_check_data_free_space(inode, num_bytes); + if (ret) + return ret; + + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes); + if (ret) { + btrfs_free_reserved_data_space(inode, num_bytes); + return ret; + } + + return 0; +} + +void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) +{ + btrfs_delalloc_release_metadata(inode, num_bytes); + btrfs_free_reserved_data_space(inode, num_bytes); +} + static int update_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int alloc, - int mark_free) + u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache; struct btrfs_fs_info *info = root->fs_info; + int factor; u64 total = num_bytes; u64 old_val; u64 byte_in_group; @@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans, cache = btrfs_lookup_block_group(info, bytenr); if (!cache) return -1; + if (cache->flags & (BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + factor = 2; + else + factor = 1; byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans, old_val += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); cache->reserved -= num_bytes; - cache->space_info->bytes_used += num_bytes; cache->space_info->bytes_reserved -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly -= num_bytes; + cache->space_info->bytes_used += num_bytes; + cache->space_info->disk_used += num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; - cache->space_info->bytes_used -= num_bytes; - if (cache->ro) - cache->space_info->bytes_readonly += num_bytes; btrfs_set_block_group_used(&cache->item, old_val); + cache->pinned += num_bytes; + cache->space_info->bytes_pinned += num_bytes; + cache->space_info->bytes_used -= num_bytes; + cache->space_info->disk_used -= num_bytes * factor; spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - if (mark_free) { - int ret; - ret = btrfs_discard_extent(root, bytenr, - num_bytes); - WARN_ON(ret); - - ret = btrfs_add_free_space(cache, bytenr, - num_bytes); - WARN_ON(ret); - } + set_extent_dirty(info->pinned_extents, + bytenr, bytenr + num_bytes - 1, + GFP_NOFS | __GFP_NOFAIL); } btrfs_put_block_group(cache); total -= num_bytes; @@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -/* - * this function must be called within transaction - */ -int btrfs_pin_extent(struct btrfs_root *root, - u64 bytenr, u64 num_bytes, int reserved) +static int pin_down_extent(struct btrfs_root *root, + struct btrfs_block_group_cache *cache, + u64 bytenr, u64 num_bytes, int reserved) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_block_group_cache *cache; - - cache = btrfs_lookup_block_group(fs_info, bytenr); - BUG_ON(!cache); - spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned += num_bytes; @@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - btrfs_put_block_group(cache); + set_extent_dirty(root->fs_info->pinned_extents, bytenr, + bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); + return 0; +} + +/* + * this function must be called within transaction + */ +int btrfs_pin_extent(struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int reserved) +{ + struct btrfs_block_group_cache *cache; + + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + + pin_down_extent(root, cache, bytenr, num_bytes, reserved); - set_extent_dirty(fs_info->pinned_extents, - bytenr, bytenr + num_bytes - 1, GFP_NOFS); + btrfs_put_block_group(cache); return 0; } -static int update_reserved_extents(struct btrfs_block_group_cache *cache, - u64 num_bytes, int reserve) +/* + * update size of reserved extents. this function may return -EAGAIN + * if 'reserve' is true or 'sinfo' is false. + */ +static int update_reserved_bytes(struct btrfs_block_group_cache *cache, + u64 num_bytes, int reserve, int sinfo) { - spin_lock(&cache->space_info->lock); - spin_lock(&cache->lock); - if (reserve) { - cache->reserved += num_bytes; - cache->space_info->bytes_reserved += num_bytes; + int ret = 0; + if (sinfo) { + struct btrfs_space_info *space_info = cache->space_info; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + if (cache->ro) { + ret = -EAGAIN; + } else { + cache->reserved += num_bytes; + space_info->bytes_reserved += num_bytes; + } + } else { + if (cache->ro) + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); } else { - cache->reserved -= num_bytes; - cache->space_info->bytes_reserved -= num_bytes; + spin_lock(&cache->lock); + if (cache->ro) { + ret = -EAGAIN; + } else { + if (reserve) + cache->reserved += num_bytes; + else + cache->reserved -= num_bytes; + } + spin_unlock(&cache->lock); } - spin_unlock(&cache->lock); - spin_unlock(&cache->space_info->lock); - return 0; + return ret; } int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, @@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, fs_info->pinned_extents = &fs_info->freed_extents[0]; up_write(&fs_info->extent_commit_sem); + + update_global_block_rsv(fs_info); return 0; } @@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) btrfs_add_free_space(cache, start, len); } + start += len; + spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); cache->pinned -= len; cache->space_info->bytes_pinned -= len; + if (cache->ro) { + cache->space_info->bytes_readonly += len; + } else if (cache->reserved_pinned > 0) { + len = min(len, cache->reserved_pinned); + cache->reserved_pinned -= len; + cache->space_info->bytes_reserved += len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - - start += len; } if (cache) @@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *unpin; + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_rsv *next_rsv; u64 start; u64 end; + int idx; int ret; if (fs_info->pinned_extents == &fs_info->freed_extents[0]) @@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - return ret; -} + mutex_lock(&fs_info->durable_block_rsv_mutex); + list_for_each_entry_safe(block_rsv, next_rsv, + &fs_info->durable_block_rsv_list, list) { -static int pin_down_bytes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - u64 bytenr, u64 num_bytes, - int is_data, int reserved, - struct extent_buffer **must_clean) -{ - int err = 0; - struct extent_buffer *buf; - - if (is_data) - goto pinit; - - /* - * discard is sloooow, and so triggering discards on - * individual btree blocks isn't a good plan. Just - * pin everything in discard mode. - */ - if (btrfs_test_opt(root, DISCARD)) - goto pinit; - - buf = btrfs_find_tree_block(root, bytenr, num_bytes); - if (!buf) - goto pinit; + idx = trans->transid & 0x1; + if (block_rsv->freed[idx] > 0) { + block_rsv_add_bytes(block_rsv, + block_rsv->freed[idx], 0); + block_rsv->freed[idx] = 0; + } + if (atomic_read(&block_rsv->usage) == 0) { + btrfs_block_rsv_release(root, block_rsv, (u64)-1); - /* we can reuse a block if it hasn't been written - * and it is from this transaction. We can't - * reuse anything from the tree log root because - * it has tiny sub-transactions. - */ - if (btrfs_buffer_uptodate(buf, 0) && - btrfs_try_tree_lock(buf)) { - u64 header_owner = btrfs_header_owner(buf); - u64 header_transid = btrfs_header_generation(buf); - if (header_owner != BTRFS_TREE_LOG_OBJECTID && - header_transid == trans->transid && - !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - *must_clean = buf; - return 1; + if (block_rsv->freed[0] == 0 && + block_rsv->freed[1] == 0) { + list_del_init(&block_rsv->list); + kfree(block_rsv); + } + } else { + btrfs_block_rsv_release(root, block_rsv, 0); } - btrfs_tree_unlock(buf); } - free_extent_buffer(buf); -pinit: - if (path) - btrfs_set_path_blocking(path); - /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, reserved); + mutex_unlock(&fs_info->durable_block_rsv_mutex); - BUG_ON(err < 0); return 0; } @@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, BUG_ON(ret); } } else { - int mark_free = 0; - struct extent_buffer *must_clean = NULL; - if (found_extent) { BUG_ON(is_data && refs_to_drop != extent_data_ref_count(root, path, iref)); @@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - ret = pin_down_bytes(trans, root, path, bytenr, - num_bytes, is_data, 0, &must_clean); - if (ret > 0) - mark_free = 1; - BUG_ON(ret < 0); - /* - * it is going to be very rare for someone to be waiting - * on the block we're freeing. del_items might need to - * schedule, so rather than get fancy, just force it - * to blocking here - */ - if (must_clean) - btrfs_set_lock_blocking(must_clean); - ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); BUG_ON(ret); btrfs_release_path(extent_root, path); - if (must_clean) { - clean_tree_block(NULL, root, must_clean); - btrfs_tree_unlock(must_clean); - free_extent_buffer(must_clean); - } - if (is_data) { ret = btrfs_del_csums(trans, root, bytenr, num_bytes); BUG_ON(ret); @@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); } - ret = update_block_group(trans, root, bytenr, num_bytes, 0, - mark_free); + ret = update_block_group(trans, root, bytenr, num_bytes, 0); BUG_ON(ret); } btrfs_free_path(path); @@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* - * when we free an extent, it is possible (and likely) that we free the last + * when we free an block, it is possible (and likely) that we free the last * delayed ref for that extent as well. This searches the delayed ref tree for * a given extent, and if there are no other delayed refs to be processed, it * removes it from the tree. @@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; struct rb_node *node; - int ret; + int ret = 0; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); @@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, list_del_init(&head->cluster); spin_unlock(&delayed_refs->lock); - ret = run_one_delayed_ref(trans, root->fs_info->tree_root, - &head->node, head->extent_op, - head->must_insert_reserved); - BUG_ON(ret); + BUG_ON(head->extent_op); + if (head->must_insert_reserved) + ret = 1; + + mutex_unlock(&head->mutex); btrfs_put_delayed_ref(&head->node); - return 0; + return ret; out: spin_unlock(&delayed_refs->lock); return 0; } +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + u64 parent, int last_ref) +{ + struct btrfs_block_rsv *block_rsv; + struct btrfs_block_group_cache *cache = NULL; + int ret; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, + parent, root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); + BUG_ON(ret); + } + + if (!last_ref) + return; + + block_rsv = get_block_rsv(trans, root); + cache = btrfs_lookup_block_group(root->fs_info, buf->start); + BUG_ON(block_rsv->space_info != cache->space_info); + + if (btrfs_header_generation(buf) == trans->transid) { + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, root, buf->start); + if (!ret) + goto pin; + } + + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(root, cache, buf->start, buf->len, 1); + goto pin; + } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(cache, buf->start, buf->len); + ret = update_reserved_bytes(cache, buf->len, 0, 0); + if (ret == -EAGAIN) { + /* block group became read-only */ + update_reserved_bytes(cache, buf->len, 0, 1); + goto out; + } + + ret = 1; + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + block_rsv->reserved += buf->len; + ret = 0; + } + spin_unlock(&block_rsv->lock); + + if (ret) { + spin_lock(&cache->space_info->lock); + cache->space_info->bytes_reserved -= buf->len; + spin_unlock(&cache->space_info->lock); + } + goto out; + } +pin: + if (block_rsv->durable && !cache->ro) { + ret = 0; + spin_lock(&cache->lock); + if (!cache->ro) { + cache->reserved_pinned += buf->len; + ret = 1; + } + spin_unlock(&cache->lock); + + if (ret) { + spin_lock(&block_rsv->lock); + block_rsv->freed[trans->transid & 0x1] += buf->len; + spin_unlock(&block_rsv->lock); + } + } +out: + btrfs_put_block_group(cache); +} + int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, @@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, (int)owner, BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); - ret = check_ref_cleanup(trans, root, bytenr); - BUG_ON(ret); } else { ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, parent, root_objectid, owner, @@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } -int btrfs_free_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 bytenr, u32 blocksize, - u64 parent, u64 root_objectid, int level) -{ - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) - blocksize; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - - return btrfs_free_extent(trans, root, bytenr, blocksize, - parent, root_objectid, level, 0); -} - static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) return 0; } +static int get_block_group_index(struct btrfs_block_group_cache *cache) +{ + int index; + if (cache->flags & BTRFS_BLOCK_GROUP_RAID10) + index = 0; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1) + index = 1; + else if (cache->flags & BTRFS_BLOCK_GROUP_DUP) + index = 2; + else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0) + index = 3; + else + index = 4; + return index; +} + enum btrfs_loop_type { LOOP_FIND_IDEAL = 0, LOOP_CACHING_NOWAIT = 1, @@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, u64 num_bytes, u64 empty_size, u64 search_start, u64 search_end, u64 hint_byte, struct btrfs_key *ins, - u64 exclude_start, u64 exclude_nr, int data) { int ret = 0; @@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + int index = 0; bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; @@ -4237,6 +4620,7 @@ ideal_cache: btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { + index = get_block_group_index(block_group); goto have_block_group; } } else if (block_group) { @@ -4245,7 +4629,8 @@ ideal_cache: } search: down_read(&space_info->groups_sem); - list_for_each_entry(block_group, &space_info->block_groups, list) { + list_for_each_entry(block_group, &space_info->block_groups[index], + list) { u64 offset; int cached; @@ -4436,23 +4821,22 @@ checks: goto loop; } - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + ret = update_reserved_bytes(block_group, num_bytes, 1, + (data & BTRFS_BLOCK_GROUP_DATA)); + if (ret == -EAGAIN) { btrfs_add_free_space(block_group, offset, num_bytes); - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= block_group->key.objectid && - search_start < (block_group->key.objectid + - block_group->key.offset)) - goto have_block_group; goto loop; } + /* we are all good, lets return */ ins->objectid = search_start; ins->offset = num_bytes; @@ -4460,18 +4844,18 @@ checks: btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - - update_reserved_extents(block_group, num_bytes, 1); - - /* we are all good, lets return */ break; loop: failed_cluster_refill = false; failed_alloc = false; + BUG_ON(index != get_block_group_index(block_group)); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); + if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) + goto search; + /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for * for them to make caching progress. Also * determine the best possible bg to cache @@ -4485,6 +4869,7 @@ loop: if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && (found_uncached_bg || empty_size || empty_cluster || allowed_chunk_alloc)) { + index = 0; if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { found_uncached_bg = false; loop++; @@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int dump_block_groups) { struct btrfs_block_group_cache *cache; + int index = 0; spin_lock(&info->lock); printk(KERN_INFO "space_info has %llu free, is %sfull\n", (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - - info->bytes_super), + info->bytes_readonly), (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" - "\n", + printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + "reserved=%llu, may_use=%llu, readonly=%llu\n", (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_used, (unsigned long long)info->bytes_pinned, - (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_reserved, (unsigned long long)info->bytes_may_use, - (unsigned long long)info->bytes_used, - (unsigned long long)info->bytes_root, - (unsigned long long)info->bytes_super, - (unsigned long long)info->bytes_reserved); + (unsigned long long)info->bytes_readonly); spin_unlock(&info->lock); if (!dump_block_groups) return; down_read(&info->groups_sem); - list_for_each_entry(cache, &info->block_groups, list) { +again: + list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); printk(KERN_INFO "block group %llu has %llu bytes, %llu used " "%llu pinned %llu reserved\n", @@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, btrfs_dump_free_space(cache, bytes); spin_unlock(&cache->lock); } + if (++index < BTRFS_NR_RAID_TYPES) + goto again; up_read(&info->groups_sem); } @@ -4628,9 +5014,8 @@ again: WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, - search_start, search_end, hint_byte, ins, - trans->alloc_exclude_start, - trans->alloc_exclude_nr, data); + search_start, search_end, hint_byte, + ins, data); if (ret == -ENOSPC && num_bytes > min_alloc_size) { num_bytes = num_bytes >> 1; @@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - update_reserved_extents(cache, len, 0); + update_reserved_bytes(cache, len, 0, 1); btrfs_put_block_group(cache); return ret; @@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = update_block_group(trans, root, ins->objectid, ins->offset, - 1, 0); + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { printk(KERN_ERR "btrfs update block group failed for %llu " "%llu\n", (unsigned long long)ins->objectid, @@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, put_caching_control(caching_ctl); } - update_reserved_extents(block_group, ins->offset, 1); + ret = update_reserved_bytes(block_group, ins->offset, 1, 1); + BUG_ON(ret); btrfs_put_block_group(block_group); ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 0, owner, offset, ins, 1); return ret; } -/* - * finds a free extent and does all the dirty work required for allocation - * returns the key for the extent through ins, and a tree buffer for - * the first block of the extent through buf. - * - * returns 0 if everything worked, non-zero otherwise. - */ -static int alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 num_bytes, u64 parent, u64 root_objectid, - struct btrfs_disk_key *key, int level, - u64 empty_size, u64 hint_byte, u64 search_end, - struct btrfs_key *ins) -{ - int ret; - u64 flags = 0; - - ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes, - empty_size, hint_byte, search_end, - ins, 0); - if (ret) - return ret; - - if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { - if (parent == 0) - parent = ins->objectid; - flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; - } else - BUG_ON(parent > 0); - - if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; - extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); - BUG_ON(!extent_op); - if (key) - memcpy(&extent_op->key, key, sizeof(extent_op->key)); - else - memset(&extent_op->key, 0, sizeof(extent_op->key)); - extent_op->flags_to_set = flags; - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; - - ret = btrfs_add_delayed_tree_ref(trans, ins->objectid, - ins->offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op); - BUG_ON(ret); - } - - if (root_objectid == root->root_key.objectid) { - u64 used; - spin_lock(&root->node_lock); - used = btrfs_root_used(&root->root_item) + num_bytes; - btrfs_set_root_used(&root->root_item, used); - spin_unlock(&root->node_lock); - } - return ret; -} - struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, return buf; } +static struct btrfs_block_rsv * +use_block_rsv(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u32 blocksize) +{ + struct btrfs_block_rsv *block_rsv; + int ret; + + block_rsv = get_block_rsv(trans, root); + + if (block_rsv->size == 0) { + ret = reserve_metadata_bytes(block_rsv, blocksize); + if (ret) + return ERR_PTR(ret); + return block_rsv; + } + + ret = block_rsv_use_bytes(block_rsv, blocksize); + if (!ret) + return block_rsv; + + WARN_ON(1); + printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", + block_rsv->size, block_rsv->reserved, + block_rsv->freed[0], block_rsv->freed[1]); + + return ERR_PTR(-ENOSPC); +} + +static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize) +{ + block_rsv_add_bytes(block_rsv, blocksize, 0); + block_rsv_release_bytes(block_rsv, NULL, 0); +} + /* - * helper function to allocate a block for a given tree + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * * returns the tree buffer or NULL. */ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, @@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 hint, u64 empty_size) { struct btrfs_key ins; - int ret; + struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + u64 flags = 0; + int ret; + - ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, - key, level, empty_size, hint, (u64)-1, &ins); + block_rsv = use_block_rsv(trans, root, blocksize); + if (IS_ERR(block_rsv)) + return ERR_CAST(block_rsv); + + ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, + empty_size, hint, (u64)-1, &ins, 0); if (ret) { - BUG_ON(ret > 0); + unuse_block_rsv(block_rsv, blocksize); return ERR_PTR(ret); } buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize, level); + BUG_ON(IS_ERR(buf)); + + if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent == 0) + parent = ins.objectid; + flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + } else + BUG_ON(parent > 0); + + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_delayed_extent_op *extent_op; + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + if (key) + memcpy(&extent_op->key, key, sizeof(extent_op->key)); + else + memset(&extent_op->key, 0, sizeof(extent_op->key)); + extent_op->flags_to_set = flags; + extent_op->update_key = 1; + extent_op->update_flags = 1; + extent_op->is_data = 0; + + ret = btrfs_add_delayed_tree_ref(trans, ins.objectid, + ins.offset, parent, root_objectid, + level, BTRFS_ADD_DELAYED_EXTENT, + extent_op); + BUG_ON(ret); + } return buf; } @@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { - int ret = 0; + int ret; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 parent = 0; @@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_header_owner(path->nodes[level + 1])); } - ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, - root->root_key.objectid, level, 0); - BUG_ON(ret); + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); out: wc->refs[level] = 0; wc->flags[level] = 0; - return ret; + return 0; } static noinline int walk_down_tree(struct btrfs_trans_handle *trans, @@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans, * also make sure backrefs for the shared block and all lower level * blocks are properly updated. */ -int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) +int btrfs_drop_snapshot(struct btrfs_root *root, + struct btrfs_block_rsv *block_rsv, int update_ref) { struct btrfs_path *path; struct btrfs_trans_handle *trans; @@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) wc = kzalloc(sizeof(*wc), GFP_NOFS); BUG_ON(!wc); - trans = btrfs_start_transaction(tree_root, 1); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); @@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) } BUG_ON(wc->level == 0); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) { + if (btrfs_should_end_transaction(trans, tree_root)) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); BUG_ON(ret); - btrfs_end_transaction(trans, tree_root); - trans = btrfs_start_transaction(tree_root, 1); - } else { - unsigned long update; - update = trans->delayed_ref_updates; - trans->delayed_ref_updates = 0; - if (update) - btrfs_run_delayed_refs(trans, tree_root, - update); + btrfs_end_transaction_throttle(trans, tree_root); + trans = btrfs_start_transaction(tree_root, 0); + if (block_rsv) + trans->block_rsv = block_rsv; } } btrfs_release_path(root, path); @@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) kfree(root); } out: - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction_throttle(trans, tree_root); kfree(wc); btrfs_free_path(path); return err; @@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) return flags; } -static int __alloc_chunk_for_shrink(struct btrfs_root *root, - struct btrfs_block_group_cache *shrink_block_group, - int force) +static int set_block_group_ro(struct btrfs_block_group_cache *cache) { - struct btrfs_trans_handle *trans; - u64 new_alloc_flags; - u64 calc; + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + int ret = -ENOSPC; - spin_lock(&shrink_block_group->lock); - if (btrfs_block_group_used(&shrink_block_group->item) + - shrink_block_group->reserved > 0) { - spin_unlock(&shrink_block_group->lock); + if (cache->ro) + return 0; - trans = btrfs_start_transaction(root, 1); - spin_lock(&shrink_block_group->lock); + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + + if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + + sinfo->bytes_may_use + sinfo->bytes_readonly + + cache->reserved_pinned + num_bytes < sinfo->total_bytes) { + sinfo->bytes_readonly += num_bytes; + sinfo->bytes_reserved += cache->reserved_pinned; + cache->reserved_pinned = 0; + cache->ro = 1; + ret = 0; + } + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); + return ret; +} - new_alloc_flags = update_block_group_flags(root, - shrink_block_group->flags); - if (new_alloc_flags != shrink_block_group->flags) { - calc = - btrfs_block_group_used(&shrink_block_group->item); - } else { - calc = shrink_block_group->key.offset; - } - spin_unlock(&shrink_block_group->lock); +int btrfs_set_block_group_ro(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) - do_chunk_alloc(trans, root->fs_info->extent_root, - calc + 2 * 1024 * 1024, new_alloc_flags, force); +{ + struct btrfs_trans_handle *trans; + u64 alloc_flags; + int ret; - btrfs_end_transaction(trans, root); - } else - spin_unlock(&shrink_block_group->lock); - return 0; -} + BUG_ON(cache->ro); + + trans = btrfs_join_transaction(root, 1); + BUG_ON(IS_ERR(trans)); + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) + do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); -int btrfs_prepare_block_group_relocation(struct btrfs_root *root, - struct btrfs_block_group_cache *group) + ret = set_block_group_ro(cache); + if (!ret) + goto out; + alloc_flags = get_alloc_profile(root, cache->space_info->flags); + ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); + if (ret < 0) + goto out; + ret = set_block_group_ro(cache); +out: + btrfs_end_transaction(trans, root); + return ret; +} +int btrfs_set_block_group_rw(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) { - __alloc_chunk_for_shrink(root, group, 1); - set_block_group_readonly(group); + struct btrfs_space_info *sinfo = cache->space_info; + u64 num_bytes; + + BUG_ON(!cache->ro); + + spin_lock(&sinfo->lock); + spin_lock(&cache->lock); + num_bytes = cache->key.offset - cache->reserved - cache->pinned - + cache->bytes_super - btrfs_block_group_used(&cache->item); + sinfo->bytes_readonly -= num_bytes; + cache->ro = 0; + spin_unlock(&cache->lock); + spin_unlock(&sinfo->lock); return 0; } @@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ synchronize_rcu(); + release_global_block_rsv(info); + while(!list_empty(&info->space_info)) { space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); - + if (space_info->bytes_pinned > 0 || + space_info->bytes_reserved > 0) { + WARN_ON(1); + dump_space_info(space_info, 0, 0); + } list_del(&space_info->list); kfree(space_info); } return 0; } +static void __link_block_group(struct btrfs_space_info *space_info, + struct btrfs_block_group_cache *cache) +{ + int index = get_block_group_index(cache); + + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) while (1) { ret = find_first_block_group(root, path, &key); - if (ret > 0) { - ret = 0; - goto error; - } + if (ret > 0) + break; if (ret != 0) goto error; @@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) cache = kzalloc(sizeof(*cache), GFP_NOFS); if (!cache) { ret = -ENOMEM; - break; + goto error; } atomic_set(&cache->count, 1); @@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root) BUG_ON(ret); cache->space_info = space_info; spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&space_info->groups_sem); - list_add_tail(&cache->list, &space_info->block_groups); - up_write(&space_info->groups_sem); + __link_block_group(space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); set_avail_alloc_bits(root->fs_info, cache->flags); if (btrfs_chunk_readonly(root, cache->key.objectid)) - set_block_group_readonly(cache); + set_block_group_ro(cache); } + + list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { + if (!(get_alloc_profile(root, space_info->flags) & + (BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP))) + continue; + /* + * avoid allocating from un-mirrored block group if there are + * mirrored block groups. + */ + list_for_each_entry(cache, &space_info->block_groups[3], list) + set_block_group_ro(cache); + list_for_each_entry(cache, &space_info->block_groups[4], list) + set_block_group_ro(cache); + } + + init_global_block_rsv(info); ret = 0; error: btrfs_free_path(path); @@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, BUG_ON(ret); spin_lock(&cache->space_info->lock); - cache->space_info->bytes_super += cache->bytes_super; + cache->space_info->bytes_readonly += cache->bytes_super; spin_unlock(&cache->space_info->lock); - down_write(&cache->space_info->groups_sem); - list_add_tail(&cache->list, &cache->space_info->block_groups); - up_write(&cache->space_info->groups_sem); + __link_block_group(cache->space_info, cache); ret = btrfs_add_block_group_cache(root->fs_info, cache); BUG_ON(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d2d0368..a4080c2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; } -static void free_extent_state(struct extent_state *state) +void free_extent_state(struct extent_state *state) { if (!state) return; @@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree, } static int set_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->set_bit_hook) { return tree->ops->set_bit_hook(tree->mapping->host, - state->start, state->end, - state->state, bits); + state, bits); } return 0; } static void clear_state_cb(struct extent_io_tree *tree, - struct extent_state *state, - unsigned long bits) + struct extent_state *state, int *bits) { if (tree->ops && tree->ops->clear_bit_hook) tree->ops->clear_bit_hook(tree->mapping->host, state, bits); @@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - int bits) + int *bits) { struct rb_node *node; + int bits_to_set = *bits & ~EXTENT_CTLBITS; int ret; if (end < start) { @@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree, if (ret) return ret; - if (bits & EXTENT_DIRTY) + if (bits_to_set & EXTENT_DIRTY) tree->dirty_bytes += end - start + 1; - state->state |= bits; + state->state |= bits_to_set; node = tree_insert(&tree->state, end, &state->rb_node); if (node) { struct extent_state *found; @@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * struct is freed and removed from the tree */ static int clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, int bits, int wake, - int delete) + struct extent_state *state, + int *bits, int wake) { - int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; + int bits_to_clear = *bits & ~EXTENT_CTLBITS; int ret = state->state & bits_to_clear; - if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; WARN_ON(range > tree->dirty_bytes); tree->dirty_bytes -= range; @@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree, state->state &= ~bits_to_clear; if (wake) wake_up(&state->wq); - if (delete || state->state == 0) { + if (state->state == 0) { if (state->tree) { - clear_state_cb(tree, state, state->state); rb_erase(&state->rb_node, &tree->state); state->tree = NULL; free_extent_state(state); @@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int set = 0; int clear = 0; + if (delete) + bits |= ~EXTENT_CTLBITS; + bits |= EXTENT_FIRST_DELALLOC; + if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) clear = 1; again: @@ -580,8 +581,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set |= clear_state_bit(tree, state, bits, wake, - delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -602,7 +602,7 @@ hit_next: if (wake) wake_up(&state->wq); - set |= clear_state_bit(tree, prealloc, bits, wake, delete); + set |= clear_state_bit(tree, prealloc, &bits, wake); prealloc = NULL; goto out; @@ -613,7 +613,7 @@ hit_next: else next_node = NULL; - set |= clear_state_bit(tree, state, bits, wake, delete); + set |= clear_state_bit(tree, state, &bits, wake); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -706,19 +706,19 @@ out: static int set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - int bits) + int *bits) { int ret; + int bits_to_set = *bits & ~EXTENT_CTLBITS; ret = set_state_cb(tree, state, bits); if (ret) return ret; - - if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; tree->dirty_bytes += range; } - state->state |= bits; + state->state |= bits_to_set; return 0; } @@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state, * [start, end] is inclusive This takes the tree lock. */ -static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - int bits, int exclusive_bits, u64 *failed_start, - struct extent_state **cached_state, - gfp_t mask) +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u64 last_start; u64 last_end; + bits |= EXTENT_FIRST_DELALLOC; again: if (!prealloc && (mask & __GFP_WAIT)) { prealloc = alloc_extent_state(mask); @@ -778,7 +778,7 @@ again: */ node = tree_search(tree, start); if (!node) { - err = insert_state(tree, prealloc, start, end, bits); + err = insert_state(tree, prealloc, start, end, &bits); prealloc = NULL; BUG_ON(err == -EEXIST); goto out; @@ -802,7 +802,7 @@ hit_next: goto out; } - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; @@ -852,7 +852,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - err = set_state_bits(tree, state, bits); + err = set_state_bits(tree, state, &bits); if (err) goto out; cache_state(state, cached_state); @@ -877,7 +877,7 @@ hit_next: else this_end = last_start - 1; err = insert_state(tree, prealloc, start, this_end, - bits); + &bits); BUG_ON(err == -EEXIST); if (err) { prealloc = NULL; @@ -903,7 +903,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); BUG_ON(err == -EEXIST); - err = set_state_bits(tree, prealloc, bits); + err = set_state_bits(tree, prealloc, &bits); if (err) { prealloc = NULL; goto out; @@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, { return clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, - NULL, mask); + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); } int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, @@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, if (op & EXTENT_CLEAR_DELALLOC) clear_bits |= EXTENT_DELALLOC; - if (op & EXTENT_CLEAR_ACCOUNTING) - clear_bits |= EXTENT_DO_ACCOUNTING; - clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | @@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num, if (tree->ops && tree->ops->submit_bio_hook) tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num, bio_flags); + mirror_num, bio_flags, start); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) @@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, sector_t sector; struct extent_map *em; struct block_device *bdev; + struct btrfs_ordered_extent *ordered; int ret; int nr = 0; size_t page_offset = 0; @@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree, set_page_extent_mapped(page); end = page_end; - lock_extent(tree, start, end, GFP_NOFS); + while (1) { + lock_extent(tree, start, end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, start); + if (!ordered) + break; + unlock_extent(tree, start, end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + } if (page->index == last_byte >> PAGE_CACHE_SHIFT) { char *userpage; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index bbab481..5691c7b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -16,7 +16,9 @@ #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) +#define EXTENT_FIRST_DELALLOC (1 << 12) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* flags for bio submission */ #define EXTENT_BIO_COMPRESSED 1 @@ -47,7 +49,7 @@ struct extent_state; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags); + unsigned long bio_flags, u64 bio_offset); struct extent_io_ops { int (*fill_delalloc)(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -69,10 +71,10 @@ struct extent_io_ops { struct extent_state *state); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); - int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits); + int (*set_bit_hook)(struct inode *inode, struct extent_state *state, + int *bits); int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, - unsigned long bits); + int *bits); int (*merge_extent_hook)(struct inode *inode, struct extent_state *new, struct extent_state *other); @@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, unsigned long bits); +void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits, int filled, struct extent_state *cached_state); int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int bits, gfp_t mask); +int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive_bits, u64 *failed_start, + struct extent_state **cached_state, gfp_t mask); int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54a2550..a562a25 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, } -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +static int __btrfs_lookup_bio_sums(struct btrfs_root *root, + struct inode *inode, struct bio *bio, + u64 logical_offset, u32 *dst, int dio) { u32 sum; struct bio_vec *bvec = bio->bi_io_vec; int bio_index = 0; - u64 offset; + u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, WARN_ON(bio->bi_vcnt <= 0); disk_bytenr = (u64)bio->bi_sector << 9; + if (dio) + offset = logical_offset; while (bio_index < bio->bi_vcnt) { - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!dio) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); if (ret == 0) goto found; @@ -238,6 +242,7 @@ found: else set_state_private(io_tree, offset, sum); disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; bio_index++; bvec++; } @@ -245,6 +250,18 @@ found: return 0; } +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); +} + +int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 offset, u32 *dst) +{ + return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1); +} + int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list) { @@ -657,6 +674,9 @@ again: goto found; } ret = PTR_ERR(item); + if (ret != -EFBIG && ret != -ENOENT) + goto fail_unlock; + if (ret == -EFBIG) { u32 item_size; /* we found one, but it isn't big enough yet */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 29ff749..79437c5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -46,32 +46,42 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes, struct page **prepared_pages, - const char __user *buf) + struct iov_iter *i) { - long page_fault = 0; - int i; + size_t copied; + int pg = 0; int offset = pos & (PAGE_CACHE_SIZE - 1); - for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + while (write_bytes > 0) { size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes); - struct page *page = prepared_pages[i]; - fault_in_pages_readable(buf, count); + struct page *page = prepared_pages[pg]; +again: + if (unlikely(iov_iter_fault_in_readable(i, count))) + return -EFAULT; /* Copy data from userspace to the current page */ - kmap(page); - page_fault = __copy_from_user(page_address(page) + offset, - buf, count); + copied = iov_iter_copy_from_user(page, i, offset, count); + /* Flush processor's dcache for this page */ flush_dcache_page(page); - kunmap(page); - buf += count; - write_bytes -= count; + iov_iter_advance(i, copied); + write_bytes -= copied; - if (page_fault) - break; + if (unlikely(copied == 0)) { + count = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + + if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + offset += copied; + } else { + pg++; + offset = 0; + } } - return page_fault ? -EFAULT : 0; + return 0; } /* @@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, NULL); - if (err) - return err; + BUG_ON(err); for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; @@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, * at this time. */ } - return err; + return 0; } /* @@ -823,45 +832,46 @@ again: return 0; } -static ssize_t btrfs_file_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t btrfs_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - loff_t pos; + struct file *file = iocb->ki_filp; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *pinned[2]; + struct page **pages = NULL; + struct iov_iter i; + loff_t *ppos = &iocb->ki_pos; loff_t start_pos; ssize_t num_written = 0; ssize_t err = 0; + size_t count; + size_t ocount; int ret = 0; - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct page **pages = NULL; int nrptrs; - struct page *pinned[2]; unsigned long first_index; unsigned long last_index; int will_write; + int buffered = 0; will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || (file->f_flags & O_DIRECT)); - nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, - PAGE_CACHE_SIZE / (sizeof(struct page *))); pinned[0] = NULL; pinned[1] = NULL; - pos = *ppos; start_pos = pos; vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); - /* do the reserve before the mutex lock in case we have to do some - * flushing. We wouldn't deadlock, but this is more polite. - */ - err = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (err) - goto out_nolock; - mutex_lock(&inode->i_mutex); + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (err) + goto out; + count = ocount; + current->backing_dev_info = inode->i_mapping->backing_dev_info; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, goto out; file_update_time(file); + BTRFS_I(inode)->sequence++; + + if (unlikely(file->f_flags & O_DIRECT)) { + num_written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, ppos, count, + ocount); + /* + * the generic O_DIRECT will update in-memory i_size after the + * DIOs are done. But our endio handlers that update the on + * disk i_size never update past the in memory i_size. So we + * need one more update here to catch any additions to the + * file + */ + if (inode->i_size != BTRFS_I(inode)->disk_i_size) { + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + mark_inode_dirty(inode); + } + if (num_written < 0) { + ret = num_written; + num_written = 0; + goto out; + } else if (num_written == count) { + /* pick up pos changes done by the generic code */ + pos = *ppos; + goto out; + } + /* + * We are going to do buffered for the rest of the range, so we + * need to make sure to invalidate the buffered pages when we're + * done. + */ + buffered = 1; + pos += num_written; + } + + iov_iter_init(&i, iov, nr_segs, count, num_written); + nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / + (sizeof(struct page *))); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); /* generic_write_checks can change our pos */ start_pos = pos; - BTRFS_I(inode)->sequence++; first_index = pos >> PAGE_CACHE_SHIFT; - last_index = (pos + count) >> PAGE_CACHE_SHIFT; + last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; /* * there are lots of better ways to do this, but this code @@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, unlock_page(pinned[0]); } } - if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { pinned[1] = grab_cache_page(inode->i_mapping, last_index); if (!PageUptodate(pinned[1])) { ret = btrfs_readpage(NULL, pinned[1]); @@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } } - while (count > 0) { + while (iov_iter_count(&i) > 0) { size_t offset = pos & (PAGE_CACHE_SIZE - 1); - size_t write_bytes = min(count, nrptrs * - (size_t)PAGE_CACHE_SIZE - + size_t write_bytes = min(iov_iter_count(&i), + nrptrs * (size_t)PAGE_CACHE_SIZE - offset); size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; @@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_data_free_space(root, inode, write_bytes); + ret = btrfs_delalloc_reserve_space(inode, write_bytes); if (ret) goto out; @@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, pos, first_index, last_index, write_bytes); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } ret = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, buf); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); - btrfs_drop_pages(pages, num_pages); - goto out; + write_bytes, pages, &i); + if (ret == 0) { + dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); } - ret = dirty_and_release_pages(NULL, root, file, pages, - num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); if (ret) { - btrfs_free_reserved_data_space(root, inode, - write_bytes); + btrfs_delalloc_release_space(inode, write_bytes); goto out; } @@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, btrfs_throttle(root); } - buf += write_bytes; - count -= write_bytes; pos += write_bytes; num_written += write_bytes; @@ -976,9 +1016,7 @@ out: mutex_unlock(&inode->i_mutex); if (ret) err = ret; - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); -out_nolock: kfree(pages); if (pinned[0]) page_cache_release(pinned[0]); @@ -1008,7 +1046,7 @@ out_nolock: num_written = err; if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); if (ret == 0) { @@ -1023,7 +1061,7 @@ out_nolock: btrfs_end_transaction(trans, root); } } - if (file->f_flags & O_DIRECT) { + if (file->f_flags & O_DIRECT && buffered) { invalidate_mapping_pages(inode->i_mapping, start_pos >> PAGE_CACHE_SHIFT, (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); @@ -1104,9 +1142,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) if (file && file->private_data) btrfs_ioctl_trans_end(file); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); goto out; } @@ -1161,7 +1199,7 @@ const struct file_operations btrfs_file_operations = { .read = do_sync_read, .aio_read = generic_file_aio_read, .splice_read = generic_file_splice_read, - .write = btrfs_file_write, + .aio_write = btrfs_file_aio_write, .mmap = btrfs_file_mmap, .open = generic_file_open, .release = btrfs_release_file, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 72ce3c1..64f1150 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, return 0; } +struct btrfs_inode_ref * +btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, int mod) +{ + struct btrfs_key key; + struct btrfs_inode_ref *ref; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + + key.objectid = inode_objectid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return NULL; + if (!find_name_in_backref(path, name, name_len, &ref)) + return NULL; + return ref; +} + int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *name, int name_len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d601629..fa6ccc1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, inline_len, compressed_size, compressed_pages); BUG_ON(ret); + btrfs_delalloc_release_metadata(inode, end + 1 - start); btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); return 0; } @@ -414,6 +415,7 @@ again: trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; /* lets try to make an inline extent */ if (ret || total_in < (actual_end - start)) { @@ -439,7 +441,6 @@ again: start, end, NULL, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); btrfs_end_transaction(trans, root); @@ -697,6 +698,38 @@ retry: return 0; } +static u64 get_extent_allocation_hint(struct inode *inode, u64 start, + u64 num_bytes) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + u64 alloc_hint = 0; + + read_lock(&em_tree->lock); + em = search_extent_mapping(em_tree, start, num_bytes); + if (em) { + /* + * if block start isn't an actual block number then find the + * first block in this inode and use that as a hint. If that + * block is also bogus then just don't worry about it. + */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + em = search_extent_mapping(em_tree, 0, 0); + if (em && em->block_start < EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + if (em) + free_extent_map(em); + } else { + alloc_hint = em->block_start; + free_extent_map(em); + } + } + read_unlock(&em_tree->lock); + + return alloc_hint; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to @@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode, trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; actual_end = min_t(u64, isize, end + 1); @@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC | - EXTENT_CLEAR_ACCOUNTING | EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); @@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode, BUG_ON(disk_num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); - - read_lock(&BTRFS_I(inode)->extent_tree.lock); - em = search_extent_mapping(&BTRFS_I(inode)->extent_tree, - start, num_bytes); - if (em) { - /* - * if block start isn't an actual block number then find the - * first block in this inode and use that as a hint. If that - * block is also bogus then just don't worry about it. - */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); - if (em && em->block_start < EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; - if (em) - free_extent_map(em); - } else { - alloc_hint = em->block_start; - free_extent_map(em); - } - } - read_unlock(&BTRFS_I(inode)->extent_tree.lock); + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { @@ -1174,6 +1185,13 @@ out_check: num_bytes, num_bytes, type); BUG_ON(ret); + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, cur_offset, + num_bytes); + BUG_ON(ret); + } + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, cur_offset, cur_offset + num_bytes - 1, locked_page, EXTENT_CLEAR_UNLOCK_PAGE | @@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, } static int btrfs_split_extent_hook(struct inode *inode, - struct extent_state *orig, u64 split) + struct extent_state *orig, u64 split) { + /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_inc(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode, if (!(other->state & EXTENT_DELALLOC)) return 0; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - + atomic_dec(&BTRFS_I(inode)->outstanding_extents); return 0; } @@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, * bytes in this file, and to maintain the list of inodes that * have pending delalloc work to be done. */ -static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, - unsigned long old, unsigned long bits) +static int btrfs_set_bit_hook(struct inode *inode, + struct extent_state *state, int *bits) { /* @@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - spin_lock(&BTRFS_I(inode)->accounting_lock); - BTRFS_I(inode)->outstanding_extents++; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_delalloc_reserve_space(root, inode, end - start + 1); + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else + atomic_inc(&BTRFS_I(inode)->outstanding_extents); spin_lock(&root->fs_info->delalloc_lock); - BTRFS_I(inode)->delalloc_bytes += end - start + 1; - root->fs_info->delalloc_bytes += end - start + 1; + BTRFS_I(inode)->delalloc_bytes += len; + root->fs_info->delalloc_bytes += len; if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_add_tail(&BTRFS_I(inode)->delalloc_inodes, &root->fs_info->delalloc_inodes); @@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, * extent_io.c clear_bit_hook, see set_bit_hook for why */ static int btrfs_clear_bit_hook(struct inode *inode, - struct extent_state *state, unsigned long bits) + struct extent_state *state, int *bits) { /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testeing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + u64 len = state->end + 1 - state->start; - if (bits & EXTENT_DO_ACCOUNTING) { - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); - } + if (*bits & EXTENT_FIRST_DELALLOC) + *bits &= ~EXTENT_FIRST_DELALLOC; + else if (!(*bits & EXTENT_DO_ACCOUNTING)) + atomic_dec(&BTRFS_I(inode)->outstanding_extents); + + if (*bits & EXTENT_DO_ACCOUNTING) + btrfs_delalloc_release_metadata(inode, len); + + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) + btrfs_free_reserved_data_space(inode, len); spin_lock(&root->fs_info->delalloc_lock); - if (state->end - state->start + 1 > - root->fs_info->delalloc_bytes) { - printk(KERN_INFO "btrfs warning: delalloc account " - "%llu %llu\n", - (unsigned long long) - state->end - state->start + 1, - (unsigned long long) - root->fs_info->delalloc_bytes); - btrfs_delalloc_free_space(root, inode, (u64)-1); - root->fs_info->delalloc_bytes = 0; - BTRFS_I(inode)->delalloc_bytes = 0; - } else { - btrfs_delalloc_free_space(root, inode, - state->end - - state->start + 1); - root->fs_info->delalloc_bytes -= state->end - - state->start + 1; - BTRFS_I(inode)->delalloc_bytes -= state->end - - state->start + 1; - } + root->fs_info->delalloc_bytes -= len; + BTRFS_I(inode)->delalloc_bytes -= len; + if (BTRFS_I(inode)->delalloc_bytes == 0 && !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { list_del_init(&BTRFS_I(inode)->delalloc_inodes); @@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, */ static int __btrfs_submit_bio_start(struct inode *inode, int rw, struct bio *bio, int mirror_num, - unsigned long bio_flags) + unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw, * are inserted into the btree */ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; return btrfs_map_bio(root, rw, bio, mirror_num, 1); @@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, * on write, or reading the csums from the tree before a read */ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num, unsigned long bio_flags) + int mirror_num, unsigned long bio_flags, + u64 bio_offset) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, /* we're doing a write, do the async checksumming */ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - bio_flags, __btrfs_submit_bio_start, + bio_flags, bio_offset, + __btrfs_submit_bio_start, __btrfs_submit_bio_done); } @@ -1520,6 +1525,7 @@ again: goto again; } + BUG(); btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); ClearPageChecked(page); out: @@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) { struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; @@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (!ret) { trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); } goto out; } @@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 0, &cached_state, GFP_NOFS); trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = &root->fs_info->delalloc_block_rsv; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; @@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - /* this also removes the ordered extent from the tree */ btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); - btrfs_end_transaction(trans, root); out: + btrfs_delalloc_release_metadata(inode, ordered_extent->len); + if (trans) + btrfs_end_transaction(trans, root); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio, BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, failrec->last_mirror, - failrec->bio_flags); + failrec->bio_flags, 0); return 0; } @@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) } /* + * calculate extra metadata reservation when snapshotting a subvolume + * contains orphan files. + */ +void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + + root = pending->root; + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + block_rsv = root->orphan_block_rsv; + + /* orphan block reservation for the snapshot */ + num_bytes = block_rsv->size; + + /* + * after the snapshot is created, COWing tree blocks may use more + * space than it frees. So we should make sure there is enough + * reserved space. + */ + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes += block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + } + + *bytes_to_reserve += num_bytes; +} + +void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *snap = pending->snap; + struct btrfs_block_rsv *block_rsv; + u64 num_bytes; + int index; + int ret; + + if (!root->orphan_block_rsv || list_empty(&root->orphan_list)) + return; + + /* refill source subvolume's orphan block reservation */ + block_rsv = root->orphan_block_rsv; + index = trans->transid & 0x1; + if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) { + num_bytes = block_rsv->size - + (block_rsv->reserved + block_rsv->freed[index]); + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + root->orphan_block_rsv, + num_bytes); + BUG_ON(ret); + } + + /* setup orphan block reservation for the snapshot */ + block_rsv = btrfs_alloc_block_rsv(snap); + BUG_ON(!block_rsv); + + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); + snap->orphan_block_rsv = block_rsv; + + num_bytes = root->orphan_block_rsv->size; + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + block_rsv, num_bytes); + BUG_ON(ret); + +#if 0 + /* insert orphan item for the snapshot */ + WARN_ON(!root->orphan_item_inserted); + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + snap->root_key.objectid); + BUG_ON(ret); + snap->orphan_item_inserted = 1; +#endif +} + +enum btrfs_orphan_cleanup_state { + ORPHAN_CLEANUP_STARTED = 1, + ORPHAN_CLEANUP_DONE = 2, +}; + +/* + * This is called in transaction commmit time. If there are no orphan + * files in the subvolume, it removes orphan item and frees block_rsv + * structure. + */ +void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + + if (!list_empty(&root->orphan_list) || + root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) + return; + + if (root->orphan_item_inserted && + btrfs_root_refs(&root->root_item) > 0) { + ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + root->orphan_item_inserted = 0; + } + + if (root->orphan_block_rsv) { + WARN_ON(root->orphan_block_rsv->size > 0); + btrfs_free_block_rsv(root, root->orphan_block_rsv); + root->orphan_block_rsv = NULL; + } +} + +/* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. + * + * NOTE: caller of this function should reserve 5 units of metadata for + * this function. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; - int ret = 0; + struct btrfs_block_rsv *block_rsv = NULL; + int reserve = 0; + int insert = 0; + int ret; - spin_lock(&root->list_lock); + if (!root->orphan_block_rsv) { + block_rsv = btrfs_alloc_block_rsv(root); + BUG_ON(!block_rsv); + } - /* already on the orphan list, we're good */ - if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!root->orphan_block_rsv) { + root->orphan_block_rsv = block_rsv; + } else if (block_rsv) { + btrfs_free_block_rsv(root, block_rsv); + block_rsv = NULL; + } + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); +#if 0 + /* + * For proper ENOSPC handling, we should do orphan + * cleanup when mounting. But this introduces backward + * compatibility issue. + */ + if (!xchg(&root->orphan_item_inserted, 1)) + insert = 2; + else + insert = 1; +#endif + insert = 1; + } else { + WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved); } - list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + if (!BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 1; + reserve = 1; + } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (block_rsv) + btrfs_add_durable_block_rsv(root->fs_info, block_rsv); - /* - * insert an orphan item to track this unlinked/truncated file - */ - ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + /* grab metadata reservation from transaction handle */ + if (reserve) { + ret = btrfs_orphan_reserve_metadata(trans, inode); + BUG_ON(ret); + } - return ret; + /* insert an orphan item to track this unlinked/truncated file */ + if (insert >= 1) { + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } + + /* insert an orphan item to track subvolume contains orphan files */ + if (insert >= 2) { + ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root, + root->root_key.objectid); + BUG_ON(ret); + } + return 0; } /* @@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + int delete_item = 0; + int release_rsv = 0; int ret = 0; - spin_lock(&root->list_lock); - - if (list_empty(&BTRFS_I(inode)->i_orphan)) { - spin_unlock(&root->list_lock); - return 0; + spin_lock(&root->orphan_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + list_del_init(&BTRFS_I(inode)->i_orphan); + delete_item = 1; } - list_del_init(&BTRFS_I(inode)->i_orphan); - if (!trans) { - spin_unlock(&root->list_lock); - return 0; + if (BTRFS_I(inode)->orphan_meta_reserved) { + BTRFS_I(inode)->orphan_meta_reserved = 0; + release_rsv = 1; } + spin_unlock(&root->orphan_lock); - spin_unlock(&root->list_lock); + if (trans && delete_item) { + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + BUG_ON(ret); + } - ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + if (release_rsv) + btrfs_orphan_release_metadata(inode); - return ret; + return 0; } /* @@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - if (!xchg(&root->clean_orphans, 0)) + if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) return; path = btrfs_alloc_path(); @@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); - if (IS_ERR(inode)) - break; + BUG_ON(IS_ERR(inode)); /* * add this inode to the orphan list so btrfs_orphan_del does * the proper thing when we hit it */ - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); /* * if this is a bad inode, means we actually succeeded in @@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) * do a destroy_inode */ if (is_bad_inode(inode)) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); btrfs_orphan_del(trans, inode); btrfs_end_transaction(trans, root); iput(inode); @@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) /* this will do delete_inode and everything for us */ iput(inode); } + btrfs_free_path(path); + + root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; + + if (root->orphan_block_rsv) + btrfs_block_rsv_release(root, root->orphan_block_rsv, + (u64)-1); + + if (root->orphan_block_rsv || root->orphan_item_inserted) { + trans = btrfs_join_transaction(root, 1); + btrfs_end_transaction(trans, root); + } if (nr_unlink) printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); if (nr_truncate) printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); - - btrfs_free_path(path); } /* @@ -2478,29 +2666,201 @@ out: return ret; } -static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +/* helper to check if there is any shared block in the path */ +static int check_path_shared(struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *eb; + int level; + int ret; + u64 refs; + + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + if (!path->nodes[level]) + break; + eb = path->nodes[level]; + if (!btrfs_block_can_be_shared(root, eb)) + continue; + ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len, + &refs, NULL); + if (refs > 1) + return 1; + } + return 0; +} + +/* + * helper to start transaction for unlink and rmdir. + * + * unlink and rmdir are special in btrfs, they do not always free space. + * so in enospc case, we should make sure they will free space before + * allowing them to use the global metadata reservation. + */ +static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir, + struct dentry *dentry) { - struct btrfs_root *root; struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; struct inode *inode = dentry->d_inode; + u64 index; + int check_link = 1; + int err = -ENOSPC; int ret; - unsigned long nr = 0; - root = BTRFS_I(dir)->root; + trans = btrfs_start_transaction(root, 10); + if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) + return trans; - /* - * 5 items for unlink inode - * 1 for orphan - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; + if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return ERR_PTR(-ENOSPC); + + /* check if there is someone else holds reference */ + if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1) + return ERR_PTR(-ENOSPC); + + if (atomic_read(&inode->i_count) > 2) + return ERR_PTR(-ENOSPC); + + if (xchg(&root->fs_info->enospc_unlink, 1)) + return ERR_PTR(-ENOSPC); - trans = btrfs_start_transaction(root, 1); + path = btrfs_alloc_path(); + if (!path) { + root->fs_info->enospc_unlink = 0; + return ERR_PTR(-ENOMEM); + } + + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 6); - return PTR_ERR(trans); + btrfs_free_path(path); + root->fs_info->enospc_unlink = 0; + return trans; + } + + path->skip_locking = 1; + path->search_commit_root = 1; + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(dir)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 0); + if (ret < 0) { + err = ret; + goto out; + } + if (ret == 0) { + if (check_path_shared(root, path)) + goto out; + } else { + check_link = 0; + } + btrfs_release_path(root, path); + + if (ret == 0 && S_ISREG(inode->i_mode)) { + ret = btrfs_lookup_file_extent(trans, root, path, + inode->i_ino, (u64)-1, 0); + if (ret < 0) { + err = ret; + goto out; + } + BUG_ON(ret == 0); + if (check_path_shared(root, path)) + goto out; + btrfs_release_path(root, path); + } + + if (!check_link) { + err = 0; + goto out; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + if (di) { + if (check_path_shared(root, path)) + goto out; + } else { + err = 0; + goto out; } + btrfs_release_path(root, path); + + ref = btrfs_lookup_inode_ref(trans, root, path, + dentry->d_name.name, dentry->d_name.len, + inode->i_ino, dir->i_ino, 0); + if (IS_ERR(ref)) { + err = PTR_ERR(ref); + goto out; + } + BUG_ON(!ref); + if (check_path_shared(root, path)) + goto out; + index = btrfs_inode_ref_index(path->nodes[0], ref); + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, + dentry->d_name.name, dentry->d_name.len, 0); + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto out; + } + BUG_ON(ret == -ENOENT); + if (check_path_shared(root, path)) + goto out; + + err = 0; +out: + btrfs_free_path(path); + if (err) { + btrfs_end_transaction(trans, root); + root->fs_info->enospc_unlink = 0; + return ERR_PTR(err); + } + + trans->block_rsv = &root->fs_info->global_block_rsv; + return trans; +} + +static void __unlink_end_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (trans->block_rsv == &root->fs_info->global_block_rsv) { + BUG_ON(!root->fs_info->enospc_unlink); + root->fs_info->enospc_unlink = 0; + } + btrfs_end_transaction_throttle(trans, root); +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); @@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, dentry->d_name.name, dentry->d_name.len); + BUG_ON(ret); - if (inode->i_nlink == 0) + if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + } nr = trans->blocks_used; - - btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 6); + __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = dentry->d_inode; int err = 0; - int ret; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; unsigned long nr = 0; @@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -ENOTEMPTY; - ret = btrfs_reserve_metadata_space(root, 5); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - btrfs_unreserve_metadata_space(root, 5); + trans = __unlink_start_trans(dir, dentry); + if (IS_ERR(trans)) return PTR_ERR(trans); - } btrfs_set_trans_block_group(trans, dir); @@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) btrfs_i_size_write(inode, 0); out: nr = trans->blocks_used; - ret = btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 5); + __unlink_end_trans(trans, root); btrfs_btree_balance_dirty(root, nr); - if (ret && !err) - err = ret; return err; } @@ -3029,6 +3380,7 @@ out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); + BUG_ON(ret); } btrfs_free_path(path); return err; @@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) if ((offset & (blocksize - 1)) == 0) goto out; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) - goto out; - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from) again: page = grab_cache_page(mapping, index); if (!page) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto out; } @@ -3132,8 +3479,7 @@ again: out_unlock: if (ret) - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); out: @@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; + struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 mask = root->sectorsize - 1; u64 hole_start = (inode->i_size + mask) & ~mask; @@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_reserve_metadata_space(root, 2); - if (err) + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); break; - - trans = btrfs_start_transaction(root, 1); + } btrfs_set_trans_block_group(trans, inode); err = btrfs_drop_extents(trans, inode, cur_offset, @@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) last_byte - 1, 0); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); + em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } + free_extent_map(em); unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, GFP_NOFS); return err; @@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) } } - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); ret = btrfs_orphan_add(trans, inode); @@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 1); btrfs_btree_balance_dirty(root, nr); if (attr->ia_size > inode->i_size) { @@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) i_size_write(inode, attr->ia_size); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; + BUG_ON(!trans->block_rsv); ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); @@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode) btrfs_i_size_write(inode, 0); while (1) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); + trans->block_rsv = root->orphan_block_rsv; + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); if (ret != -EAGAIN) break; @@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode) btrfs_end_transaction(trans, root); trans = NULL; btrfs_btree_balance_dirty(root, nr); + } if (ret == 0) { @@ -3596,40 +3956,10 @@ again: return 0; } -static noinline void init_btrfs_i(struct inode *inode) -{ - struct btrfs_inode *bi = BTRFS_I(inode); - - bi->generation = 0; - bi->sequence = 0; - bi->last_trans = 0; - bi->last_sub_trans = 0; - bi->logged_trans = 0; - bi->delalloc_bytes = 0; - bi->reserved_bytes = 0; - bi->disk_i_size = 0; - bi->flags = 0; - bi->index_cnt = (u64)-1; - bi->last_unlink_trans = 0; - bi->ordered_data_close = 0; - bi->force_compress = 0; - extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_tree, - inode->i_mapping, GFP_NOFS); - extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, - inode->i_mapping, GFP_NOFS); - INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); - INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); - btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->log_mutex); -} - static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; inode->i_ino = args->ino; - init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; btrfs_set_inode_space_info(args->root, inode); return 0; @@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s, if (!inode) return ERR_PTR(-ENOMEM); - init_btrfs_i(inode); - BTRFS_I(inode)->root = root; memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); BTRFS_I(inode)->dummy_inode = 1; @@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct btrfs_trans_handle *trans; int ret = 0; - if (root->fs_info->btree_inode == inode) + if (BTRFS_I(inode)->dummy_inode) return 0; if (wbc->sync_mode == WB_SYNC_ALL) { @@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + int ret; + + if (BTRFS_I(inode)->dummy_inode) + return; trans = btrfs_join_transaction(root, 1); btrfs_set_trans_block_group(trans, inode); - btrfs_update_inode(trans, root, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret && ret == -ENOSPC) { + /* whoops, lets try again with the full transaction */ + btrfs_end_transaction(trans, root); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + if (printk_ratelimit()) { + printk(KERN_ERR "btrfs: fail to " + "dirty inode %lu error %ld\n", + inode->i_ino, PTR_ERR(trans)); + } + return; + } + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + if (printk_ratelimit()) { + printk(KERN_ERR "btrfs: fail to " + "dirty inode %lu error %d\n", + inode->i_ino, ret); + } + } + } btrfs_end_transaction(trans, root); } @@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * btrfs_get_inode_index_count has an explanation for the magic * number */ - init_btrfs_i(inode); BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; @@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); + btrfs_btree_balance_dirty(root, nr); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - btrfs_btree_balance_dirty(root, nr); return err; } @@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; - int err; int drop_inode = 0; + int err; unsigned long nr = 0; u64 objectid; u64 index = 0; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 for inode item and ref * 2 for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, @@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (root->objectid != BTRFS_I(inode)->root->objectid) return -EPERM; - /* - * 1 item for inode ref - * 2 items for dir items - */ - err = btrfs_reserve_metadata_space(root, 3); - if (err) - return err; - btrfs_inc_nlink(inode); err = btrfs_set_inode_index(dir, &index); if (err) goto fail; - trans = btrfs_start_transaction(root, 1); + /* + * 1 item for inode ref + * 2 items for dir items + */ + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto fail; + } btrfs_set_trans_block_group(trans, dir); atomic_inc(&inode->i_count); @@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); fail: - btrfs_unreserve_metadata_space(root, 3); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; + /* * 2 items for inode and ref * 2 items for dir items * 1 for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; - - trans = btrfs_start_transaction(root, 1); - if (!trans) { - err = -ENOMEM; - goto out_unlock; - } + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_fail; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) out_fail: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); - -out_unlock: - btrfs_unreserve_metadata_space(root, 5); if (drop_on_err) iput(inode); btrfs_btree_balance_dirty(root, nr); @@ -4770,6 +5098,7 @@ again: } flush_dcache_page(page); } else if (create && PageUptodate(page)) { + WARN_ON(1); if (!trans) { kunmap(page); free_extent_map(em); @@ -4866,11 +5195,651 @@ out: return em; } +static struct extent_map *btrfs_new_extent_direct(struct inode *inode, + u64 start, u64 len) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct btrfs_key ins; + u64 alloc_hint; + int ret; + + btrfs_drop_extent_cache(inode, start, start + len - 1, 0); + + trans = btrfs_join_transaction(root, 0); + if (!trans) + return ERR_PTR(-ENOMEM); + + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + alloc_hint = get_extent_allocation_hint(inode, start, len); + ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0, + alloc_hint, (u64)-1, &ins, 1); + if (ret) { + em = ERR_PTR(ret); + goto out; + } + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + + em->start = start; + em->orig_start = em->start; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + write_unlock(&em_tree->lock); + if (ret != -EEXIST) + break; + btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0); + } + + ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, + ins.offset, ins.offset, 0); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, ins.offset); + em = ERR_PTR(ret); + } +out: + btrfs_end_transaction(trans, root); + return em; +} + +/* + * returns 1 when the nocow is safe, < 1 on error, 0 if the + * block must be cow'd + */ +static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans, + struct inode *inode, u64 offset, u64 len) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *leaf; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 disk_bytenr; + u64 backref_offset; + u64 extent_end; + u64 num_bytes; + int slot; + int found_type; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + offset, 0); + if (ret < 0) + goto out; + + slot = path->slots[0]; + if (ret == 1) { + if (slot == 0) { + /* can't find the item, must cow */ + ret = 0; + goto out; + } + slot--; + } + ret = 0; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY) { + /* not our file or wrong item type, must cow */ + goto out; + } + + if (key.offset > offset) { + /* Wrong offset, must cow */ + goto out; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (found_type != BTRFS_FILE_EXTENT_REG && + found_type != BTRFS_FILE_EXTENT_PREALLOC) { + /* not a regular extent, must cow */ + goto out; + } + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + backref_offset = btrfs_file_extent_offset(leaf, fi); + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end < offset + len) { + /* extent doesn't include our full range, must cow */ + goto out; + } + + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out; + + /* + * look for other files referencing this extent, if we + * find any we must cow + */ + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + key.offset - backref_offset, disk_bytenr)) + goto out; + + /* + * adjust disk_bytenr and num_bytes to cover just the bytes + * in this extent we are about to write. If there + * are any csums in that range we have to cow in order + * to keep the csums correct + */ + disk_bytenr += backref_offset; + disk_bytenr += offset - key.offset; + num_bytes = min(offset + len, extent_end) - offset; + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out; + /* + * all of the above have passed, it is safe to overwrite this extent + * without cow + */ + ret = 1; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start = iblock << inode->i_blkbits; + u64 len = bh_result->b_size; + struct btrfs_trans_handle *trans; + + em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + /* + * Ok for INLINE and COMPRESSED extents we need to fallback on buffered + * io. INLINE is special, and we could probably kludge it in here, but + * it's still buffered so for safety lets just fall back to the generic + * buffered path. + * + * For COMPRESSED we _have_ to read the entire extent in so we can + * decompress it, so there will be buffering required no matter what we + * do, so go ahead and fallback to buffered. + * + * We return -ENOTBLK because thats what makes DIO go ahead and go back + * to buffered IO. Don't blame me, this is the price we pay for using + * the generic code. + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + em->block_start == EXTENT_MAP_INLINE) { + free_extent_map(em); + return -ENOTBLK; + } + + /* Just a good old fashioned hole, return */ + if (!create && (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + free_extent_map(em); + /* DIO will do one hole at a time, so just unlock a sector */ + unlock_extent(&BTRFS_I(inode)->io_tree, start, + start + root->sectorsize - 1, GFP_NOFS); + return 0; + } + + /* + * We don't allocate a new extent in the following cases + * + * 1) The inode is marked as NODATACOW. In this case we'll just use the + * existing extent. + * 2) The extent is marked as PREALLOC. We're good to go here and can + * just use the extent. + * + */ + if (!create) { + len = em->len - (start - em->start); + goto map; + } + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + em->block_start != EXTENT_MAP_HOLE)) { + int type; + int ret; + u64 block_start; + + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + type = BTRFS_ORDERED_PREALLOC; + else + type = BTRFS_ORDERED_NOCOW; + len = min(len, em->len - (start - em->start)); + block_start = em->block_start + (start - em->start); + + /* + * we're not going to log anything, but we do need + * to make sure the current transaction stays open + * while we look for nocow cross refs + */ + trans = btrfs_join_transaction(root, 0); + if (!trans) + goto must_cow; + + if (can_nocow_odirect(trans, inode, start, len) == 1) { + ret = btrfs_add_ordered_extent_dio(inode, start, + block_start, len, len, type); + btrfs_end_transaction(trans, root); + if (ret) { + free_extent_map(em); + return ret; + } + goto unlock; + } + btrfs_end_transaction(trans, root); + } +must_cow: + /* + * this will cow the extent, reset the len in case we changed + * it above + */ + len = bh_result->b_size; + free_extent_map(em); + em = btrfs_new_extent_direct(inode, start, len); + if (IS_ERR(em)) + return PTR_ERR(em); + len = min(len, em->len - (start - em->start)); +unlock: + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1, + EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1, + 0, NULL, GFP_NOFS); +map: + bh_result->b_blocknr = (em->block_start + (start - em->start)) >> + inode->i_blkbits; + bh_result->b_size = len; + bh_result->b_bdev = em->bdev; + set_buffer_mapped(bh_result); + if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + set_buffer_new(bh_result); + + free_extent_map(em); + + return 0; +} + +struct btrfs_dio_private { + struct inode *inode; + u64 logical_offset; + u64 disk_bytenr; + u64 bytes; + u32 *csums; + void *private; +}; + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; + struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + u64 start; + u32 *private = dip->csums; + + start = dip->logical_offset; + do { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + struct page *page = bvec->bv_page; + char *kaddr; + u32 csum = ~(u32)0; + unsigned long flags; + + local_irq_save(flags); + kaddr = kmap_atomic(page, KM_IRQ0); + csum = btrfs_csum_data(root, kaddr + bvec->bv_offset, + csum, bvec->bv_len); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_IRQ0); + local_irq_restore(flags); + + flush_dcache_page(bvec->bv_page); + if (csum != *private) { + printk(KERN_ERR "btrfs csum failed ino %lu off" + " %llu csum %u private %u\n", + inode->i_ino, (unsigned long long)start, + csum, *private); + err = -EIO; + } + } + + start += bvec->bv_len; + private++; + bvec++; + } while (bvec <= bvec_end); + + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, + dip->logical_offset + dip->bytes - 1, GFP_NOFS); + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static void btrfs_endio_direct_write(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered = NULL; + struct extent_state *cached_state = NULL; + int ret; + + if (err) + goto out_done; + + ret = btrfs_dec_test_ordered_pending(inode, &ordered, + dip->logical_offset, dip->bytes); + if (!ret) + goto out_done; + + BUG_ON(!ordered); + + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out; + } + trans->block_rsv = &root->fs_info->delalloc_block_rsv; + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { + ret = btrfs_ordered_update_i_size(inode, 0, ordered); + if (!ret) + ret = btrfs_update_inode(trans, root, inode); + err = ret; + goto out; + } + + lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, 0, + &cached_state, GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + ret = btrfs_mark_extent_written(trans, inode, + ordered->file_offset, + ordered->file_offset + + ordered->len); + if (ret) { + err = ret; + goto out_unlock; + } + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered->file_offset, + ordered->start, + ordered->disk_len, + ordered->len, + ordered->len, + 0, 0, 0, + BTRFS_FILE_EXTENT_REG); + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered->file_offset, ordered->len); + if (ret) { + err = ret; + WARN_ON(1); + goto out_unlock; + } + } + + add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); + btrfs_ordered_update_i_size(inode, 0, ordered); + btrfs_update_inode(trans, root, inode); +out_unlock: + unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, + ordered->file_offset + ordered->len - 1, + &cached_state, GFP_NOFS); +out: + btrfs_delalloc_release_metadata(inode, ordered->len); + btrfs_end_transaction(trans, root); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); +out_done: + bio->bi_private = dip->private; + + kfree(dip->csums); + kfree(dip); + dio_end_io(bio, err); +} + +static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags, u64 offset) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + BUG_ON(ret); + return 0; +} + +static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_dio_private *dip; + struct bio_vec *bvec = bio->bi_io_vec; + u64 start; + int skip_sum; + int write = rw & (1 << BIO_RW); + int ret = 0; + + skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + dip = kmalloc(sizeof(*dip), GFP_NOFS); + if (!dip) { + ret = -ENOMEM; + goto free_ordered; + } + dip->csums = NULL; + + if (!skip_sum) { + dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); + if (!dip->csums) { + ret = -ENOMEM; + goto free_ordered; + } + } + + dip->private = bio->bi_private; + dip->inode = inode; + dip->logical_offset = file_offset; + + start = dip->logical_offset; + dip->bytes = 0; + do { + dip->bytes += bvec->bv_len; + bvec++; + } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1)); + + dip->disk_bytenr = (u64)bio->bi_sector << 9; + bio->bi_private = dip; + + if (write) + bio->bi_end_io = btrfs_endio_direct_write; + else + bio->bi_end_io = btrfs_endio_direct_read; + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + if (ret) + goto out_err; + + if (write && !skip_sum) { + ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, 0, 0, + dip->logical_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); + if (ret) + goto out_err; + return; + } else if (!skip_sum) + btrfs_lookup_bio_sums_dio(root, inode, bio, + dip->logical_offset, dip->csums); + + ret = btrfs_map_bio(root, rw, bio, 0, 1); + if (ret) + goto out_err; + return; +out_err: + kfree(dip->csums); + kfree(dip); +free_ordered: + /* + * If this is a write, we need to clean up the reserved space and kill + * the ordered extent. + */ + if (write) { + struct btrfs_ordered_extent *ordered; + ordered = btrfs_lookup_ordered_extent(inode, + dip->logical_offset); + if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) + btrfs_free_reserved_extent(root, ordered->start, + ordered->disk_len); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + bio_endio(bio, ret); +} + +static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + int seg; + size_t size; + unsigned long addr; + unsigned blocksize_mask = root->sectorsize - 1; + ssize_t retval = -EINVAL; + loff_t end = offset; + + if (offset & blocksize_mask) + goto out; + + /* Check the memory alignment. Blocks cannot straddle pages */ + for (seg = 0; seg < nr_segs; seg++) { + addr = (unsigned long)iov[seg].iov_base; + size = iov[seg].iov_len; + end += size; + if ((addr & blocksize_mask) || (size & blocksize_mask)) + goto out; + } + retval = 0; +out: + return retval; +} static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { - return -EINVAL; + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct btrfs_ordered_extent *ordered; + struct extent_state *cached_state = NULL; + u64 lockstart, lockend; + ssize_t ret; + int writing = rw & WRITE; + int write_bits = 0; + size_t count = iov_length(iov, nr_segs); + + if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, + offset, nr_segs)) { + return 0; + } + + lockstart = offset; + lockend = offset + count - 1; + + if (writing) { + ret = btrfs_delalloc_reserve_space(inode, count); + if (ret) + goto out; + } + + while (1) { + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, + 0, &cached_state, GFP_NOFS); + /* + * We're concerned with the entire range that we're going to be + * doing DIO to, so we need to make sure theres no ordered + * extents in this range. + */ + ordered = btrfs_lookup_ordered_range(inode, lockstart, + lockend - lockstart + 1); + if (!ordered) + break; + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + cond_resched(); + } + + /* + * we don't use btrfs_set_extent_delalloc because we don't want + * the dirty or uptodate bits + */ + if (writing) { + write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING; + ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_DELALLOC, 0, NULL, &cached_state, + GFP_NOFS); + if (ret) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, + lockend, EXTENT_LOCKED | write_bits, + 1, 0, &cached_state, GFP_NOFS); + goto out; + } + } + + free_extent_state(cached_state); + cached_state = NULL; + + ret = __blockdev_direct_IO(rw, iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, 0); + + if (ret < 0 && ret != -EIOCBQUEUED) { + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) { + /* + * We're falling back to buffered, unlock the section we didn't + * do IO on. + */ + clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret, + offset + iov_length(iov, nr_segs) - 1, + EXTENT_LOCKED | write_bits, 1, 0, + &cached_state, GFP_NOFS); + } +out: + free_extent_state(cached_state); + return ret; } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) u64 page_start; u64 page_end; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); if (ret) { if (ret == -ENOMEM) ret = VM_FAULT_OOM; @@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) goto out; } - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - ret = VM_FAULT_SIGBUS; - goto out; - } - ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: lock_page(page); @@ -5059,7 +6021,6 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -5100,7 +6061,6 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); ret = VM_FAULT_SIGBUS; - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); goto out_unlock; } ret = 0; @@ -5127,10 +6087,10 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); out_unlock: - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); if (!ret) return VM_FAULT_LOCKED; unlock_page(page); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); out: return ret; } @@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode) btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); btrfs_ordered_update_i_size(inode, inode->i_size, NULL); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; /* * setattr is responsible for setting the ordered_data_close flag, @@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode) btrfs_add_ordered_operation(trans, root, inode); while (1) { + if (!trans) { + trans = btrfs_start_transaction(root, 0); + BUG_ON(IS_ERR(trans)); + btrfs_set_trans_block_group(trans, inode); + trans->block_rsv = root->orphan_block_rsv; + } + + ret = btrfs_block_rsv_check(trans, root, + root->orphan_block_rsv, 0, 5); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + trans = NULL; + continue; + } + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); @@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode) nr = trans->blocks_used; btrfs_end_transaction(trans, root); + trans = NULL; btrfs_btree_balance_dirty(root, nr); - - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); } if (ret == 0 && inode->i_nlink > 0) { @@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping, struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_inode *ei; + struct inode *inode; ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; + + ei->root = NULL; + ei->space_info = NULL; + ei->generation = 0; + ei->sequence = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; - ei->outstanding_extents = 0; - ei->reserved_extents = 0; - ei->root = NULL; + ei->delalloc_bytes = 0; + ei->reserved_bytes = 0; + ei->disk_i_size = 0; + ei->flags = 0; + ei->index_cnt = (u64)-1; + ei->last_unlink_trans = 0; + spin_lock_init(&ei->accounting_lock); + atomic_set(&ei->outstanding_extents, 0); + ei->reserved_extents = 0; + + ei->ordered_data_close = 0; + ei->orphan_meta_reserved = 0; + ei->dummy_inode = 0; + ei->force_compress = 0; + + inode = &ei->vfs_inode; + extent_map_tree_init(&ei->extent_tree, GFP_NOFS); + extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); + extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); + mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->i_orphan); + INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->ordered_operations); - return &ei->vfs_inode; + RB_CLEAR_NODE(&ei->rb_node); + + return inode; } void btrfs_destroy_inode(struct inode *inode) @@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(!list_empty(&inode->i_dentry)); WARN_ON(inode->i_data.nrpages); + WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); + WARN_ON(BTRFS_I(inode)->reserved_extents); /* * This can happen where we create an inode, but somebody else also @@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode) spin_unlock(&root->fs_info->ordered_extent_lock); } - spin_lock(&root->list_lock); + spin_lock(&root->orphan_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", inode->i_ino); list_del_init(&BTRFS_I(inode)->i_orphan); } - spin_unlock(&root->list_lock); + spin_unlock(&root->orphan_lock); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); @@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; - - /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - */ - ret = btrfs_reserve_metadata_space(root, 11); - if (ret) - return ret; - /* * we're using rename to replace one file with another. * and the replacement file is large. Start IO on it now so @@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* close the racy window with snapshot create/destroy ioctl */ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&root->fs_info->subvol_sem); + /* + * We want to reserve the absolute worst case amount of items. So if + * both inodes are subvols and we need to unlink them then that would + * require 4 item modifications, but if they are both normal inodes it + * would require 5 item modifications, so we'll assume their normal + * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items + * should cover the worst case number of items we'll modify. + */ + trans = btrfs_start_transaction(root, 20); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, new_dir); if (dest != root) @@ -5550,7 +6552,6 @@ out_fail: if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&root->fs_info->subvol_sem); - btrfs_unreserve_metadata_space(root, 11); return ret; } @@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) return 0; } +int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput) +{ + struct btrfs_inode *binode; + struct inode *inode = NULL; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(&root->fs_info->delalloc_inodes)) { + binode = list_entry(root->fs_info->delalloc_inodes.next, + struct btrfs_inode, delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (inode) { + list_move_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); + break; + } + + list_del_init(&binode->delalloc_inodes); + cond_resched_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + if (inode) { + write_inode_now(inode, 0); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); + return 1; + } + return 0; +} + static int btrfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; + err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid); + if (err) + return err; /* * 2 items for inode item and ref * 2 items for dir items * 1 item for xattr if selinux is on */ - err = btrfs_reserve_metadata_space(root, 5); - if (err) - return err; + trans = btrfs_start_transaction(root, 5); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) - goto out_fail; btrfs_set_trans_block_group(trans, dir); - err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); - if (err) { - err = -ENOSPC; - goto out_unlock; - } - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, dentry->d_name.len, dentry->d_parent->d_inode->i_ino, objectid, @@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, out_unlock: nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -out_fail: - btrfs_unreserve_metadata_space(root, 5); if (drop_inode) { inode_dec_link_count(inode); iput(inode); @@ -5726,33 +6751,28 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode, loff_t actual_len) +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; - u64 num_bytes = end - start; int ret = 0; - u64 i_size; while (num_bytes > 0) { - trans = btrfs_start_transaction(root, 1); - - ret = btrfs_reserve_extent(trans, root, num_bytes, - root->sectorsize, 0, alloc_hint, - (u64)-1, &ins, 1); - if (ret) { - WARN_ON(1); - goto stop_trans; + trans = btrfs_start_transaction(root, 3); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; } - ret = btrfs_reserve_metadata_space(root, 3); + ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, + 0, *alloc_hint, (u64)-1, &ins, 1); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, - ins.offset); - goto stop_trans; + btrfs_end_transaction(trans, root); + break; } ret = insert_reserved_file_extent(trans, inode, @@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, num_bytes -= ins.offset; cur_offset += ins.offset; - alloc_hint = ins.objectid + ins.offset; + *alloc_hint = ins.objectid + ins.offset; inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (actual_len > inode->i_size) && - (cur_offset > inode->i_size)) { - + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { if (cur_offset > actual_len) - i_size = actual_len; + i_size_write(inode, actual_len); else - i_size = cur_offset; - i_size_write(inode, i_size); - btrfs_ordered_update_i_size(inode, i_size, NULL); + i_size_write(inode, cur_offset); + i_size_write(inode, cur_offset); + btrfs_ordered_update_i_size(inode, cur_offset, NULL); } ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); btrfs_end_transaction(trans, root); - btrfs_unreserve_metadata_space(root, 3); } return ret; - -stop_trans: - btrfs_end_transaction(trans, root); - return ret; - } static long btrfs_fallocate(struct inode *inode, int mode, @@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); if (ret) goto out; @@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = prealloc_file_range(inode, - cur_offset, last_byte, - alloc_hint, mode, offset+len); + ret = btrfs_prealloc_file_range(inode, 0, cur_offset, + last_byte - cur_offset, + 1 << inode->i_blkbits, + offset + len, + &alloc_hint); if (ret < 0) { free_extent_map(em); break; } } - if (em->block_start <= EXTENT_MAP_LAST_BYTE) - alloc_hint = em->block_start; free_extent_map(em); cur_offset = last_byte; @@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state, GFP_NOFS); - btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, - alloc_end - alloc_start); + btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 97a9783..4cdb98c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root, u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; + ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, + 0, &objectid); + if (ret) + return ret; /* * 1 - inode item * 2 - refs * 1 - root item * 2 - dir items */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, - 0, &objectid); - if (ret) - goto fail; + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return PTR_ERR(trans); leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, objectid, NULL, 0, 0, 0); @@ -345,13 +341,10 @@ fail: err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; - - btrfs_unreserve_metadata_space(root, 6); return ret; } -static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, - char *name, int namelen) +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) { struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; @@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - /* - * 1 - inode item - * 2 - refs - * 1 - root item - * 2 - dir items - */ - ret = btrfs_reserve_metadata_space(root, 6); - if (ret) - goto fail; - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); - if (!pending_snapshot->name) { - ret = -ENOMEM; - kfree(pending_snapshot); - btrfs_unreserve_metadata_space(root, 6); - goto fail; - } - memcpy(pending_snapshot->name, name, namelen); - pending_snapshot->name[namelen] = '\0'; + if (!pending_snapshot) + return -ENOMEM; + + btrfs_init_block_rsv(&pending_snapshot->block_rsv); pending_snapshot->dentry = dentry; - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); pending_snapshot->root = root; + + trans = btrfs_start_transaction(root->fs_info->extent_root, 5); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fail; + } + + ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); + BUG_ON(ret); + list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); BUG_ON(ret); - btrfs_unreserve_metadata_space(root, 6); + + ret = pending_snapshot->error; + if (ret) + goto fail; + + btrfs_orphan_cleanup(pending_snapshot->snap); inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { @@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, d_instantiate(dentry, inode); ret = 0; fail: + kfree(pending_snapshot); return ret; } @@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_up_read; if (snap_src) { - error = create_snapshot(snap_src, dentry, - name, namelen); + error = create_snapshot(snap_src, dentry); } else { error = create_subvol(BTRFS_I(dir)->root, dentry, name, namelen); @@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file, if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = 1; - ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); - if (ret) { - ret = -ENOSPC; - break; - } - - ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1); - if (ret) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); - ret = -ENOSPC; - break; - } + ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); + if (ret) + goto err_unlock; again: if (inode->i_size == 0 || i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { @@ -622,8 +598,10 @@ again: } page = grab_cache_page(inode->i_mapping, i); - if (!page) + if (!page) { + ret = -ENOMEM; goto err_reservations; + } if (!PageUptodate(page)) { btrfs_readpage(NULL, page); @@ -631,6 +609,7 @@ again: if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + ret = -EIO; goto err_reservations; } } @@ -644,8 +623,7 @@ again: wait_on_page_writeback(page); if (PageDirty(page)) { - btrfs_free_reserved_data_space(root, inode, - PAGE_CACHE_SIZE); + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); goto loop_unlock; } @@ -683,7 +661,6 @@ loop_unlock: page_cache_release(page); mutex_unlock(&inode->i_mutex); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); i++; } @@ -713,9 +690,9 @@ loop_unlock: return 0; err_reservations: + btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); +err_unlock: mutex_unlock(&inode->i_mutex); - btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); - btrfs_unreserve_metadata_for_delalloc(root, inode, 1); return ret; } @@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root, device->name, (unsigned long long)new_size); if (new_size > old_size) { - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_grow_device(trans, device, new_size); btrfs_commit_transaction(trans, root); } else { @@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out_up_write; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out; + } + trans->block_rsv = &root->fs_info->global_block_rsv; + ret = btrfs_unlink_subvol(trans, root, dir, dest->root_key.objectid, dentry->d_name.name, @@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dest->root_item.drop_level = 0; btrfs_set_root_refs(&dest->root_item, 0); - ret = btrfs_insert_orphan_item(trans, - root->fs_info->tree_root, - dest->root_key.objectid); - BUG_ON(ret); + if (!xchg(&dest->orphan_item_inserted, 1)) { + ret = btrfs_insert_orphan_item(trans, + root->fs_info->tree_root, + dest->root_key.objectid); + BUG_ON(ret); + } ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); @@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EPERM; goto out; } - btrfs_defrag_root(root, 0); - btrfs_defrag_root(root->fs_info->extent_root, 0); + ret = btrfs_defrag_root(root, 0); + if (ret) + goto out; + ret = btrfs_defrag_root(root->fs_info->extent_root, 0); break; case S_IFREG: if (!(file->f_mode & FMODE_WRITE)) { @@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range->len = (u64)-1; } - btrfs_defrag_file(file, range); + ret = btrfs_defrag_file(file, range); kfree(range); break; + default: + ret = -EINVAL; } out: mnt_drop_write(file->f_path.mnt); @@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_wait_ordered_range(src, off, off+len); } - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); - - /* punch hole in destination first */ - btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); - /* clone data */ key.objectid = src->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; @@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * note the key will change type as we walk through the * tree. */ - ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, new_key.objectid = inode->i_ino; new_key.offset = key.offset + destoff - off; + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + + if (key.offset + datal > off + len) + datal = off + len - key.offset; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) - goto out; + BUG_ON(ret); leaf = path->nodes[0]; slot = path->slots[0]; @@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - if (off > key.offset) { - datao += off - key.offset; - datal -= off - key.offset; - } - - if (key.offset + datal > off + len) - datal = off + len - key.offset; - /* disko == 0 means it's a hole */ if (!disko) datao = 0; @@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (comp && (skip || trim)) { ret = -EINVAL; + btrfs_end_transaction(trans, root); goto out; } size -= skip + trim; datal -= skip + trim; + + ret = btrfs_drop_extents(trans, inode, + new_key.offset, + new_key.offset + datal, + &hint_byte, 1); + BUG_ON(ret); + ret = btrfs_insert_empty_item(trans, root, path, &new_key, size); - if (ret) - goto out; + BUG_ON(ret); if (skip) { u32 start = @@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } btrfs_mark_buffer_dirty(leaf); - } + btrfs_release_path(root, path); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (new_key.offset + datal > inode->i_size) + btrfs_i_size_write(inode, + new_key.offset + datal); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); + } next: btrfs_release_path(root, path); key.offset++; @@ -1717,17 +1727,7 @@ next: ret = 0; out: btrfs_release_path(root, path); - if (ret == 0) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (destoff + olen > inode->i_size) - btrfs_i_size_write(inode, destoff + olen); - BTRFS_I(inode)->flags = BTRFS_I(src)->flags; - ret = btrfs_update_inode(trans, root, inode); - } - btrfs_end_transaction(trans, root); unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); - if (ret) - vmtruncate(inode, 0); out_unlock: mutex_unlock(&src->i_mutex); mutex_unlock(&inode->i_mutex); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a127c0e..e56c72b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) return 1; } +static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, + u64 len) +{ + if (file_offset + len <= entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + /* * look find the first ordered struct that has this offset, otherwise * the first one less than this offset @@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * The tree is given a single reference on the ordered extent that was * inserted. */ -int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) +static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, + int type, int dio) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) set_bit(type, &entry->flags); + if (dio) + set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); + /* one ref for the tree */ atomic_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return 0; } +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 0); +} + +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + return __btrfs_add_ordered_extent(inode, file_offset, start, len, + disk_len, type, 1); +} + /* * Add a struct btrfs_ordered_sum into the list of checksums to be inserted * when an ordered extent is finished. If the list covers more than one @@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode, tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); - spin_lock(&BTRFS_I(inode)->accounting_lock); - WARN_ON(!BTRFS_I(inode)->outstanding_extents); - BTRFS_I(inode)->outstanding_extents--; - spin_unlock(&BTRFS_I(inode)->accounting_lock); - btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root, - inode, 1); - spin_lock(&root->fs_info->ordered_extent_lock); list_del_init(&entry->root_extent_list); @@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode, * start IO on any dirty ones so the wait doesn't stall waiting * for pdflush to find them */ - filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) + filemap_fdatawrite_range(inode->i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -588,6 +609,47 @@ out: return entry; } +/* Since the DIO code tries to lock a wide area we need to look for any ordered + * extents that exist in the range, rather than just the start of the range. + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + spin_lock(&tree->lock); + node = tree_search(tree, file_offset); + if (!node) { + node = tree_search(tree, file_offset + len); + if (!node) + goto out; + } + + while (1) { + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (range_overlaps(entry, file_offset, len)) + break; + + if (entry->file_offset >= file_offset + len) { + entry = NULL; + break; + } + entry = NULL; + node = rb_next(node); + if (!node) + break; + } +out: + if (entry) + atomic_inc(&entry->refs); + spin_unlock(&tree->lock); + return entry; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index c82f76a..8ac3654 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -72,6 +72,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ +#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int tyep); + u64 start, u64 len, u64 disk_len, int type); +int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); @@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e558dd9..05d41e5 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -44,8 +44,12 @@ struct tree_entry { struct backref_node { struct rb_node rb_node; u64 bytenr; - /* objectid tree block owner */ + + u64 new_bytenr; + /* objectid of tree block owner, can be not uptodate */ u64 owner; + /* link to pending, changed or detached list */ + struct list_head list; /* list of upper level blocks reference this block */ struct list_head upper; /* list of child blocks in the cache */ @@ -56,9 +60,9 @@ struct backref_node { struct extent_buffer *eb; /* level of tree block */ unsigned int level:8; - /* 1 if the block is root of old snapshot */ - unsigned int old_root:1; - /* 1 if no child blocks in the cache */ + /* is the block in non-reference counted tree */ + unsigned int cowonly:1; + /* 1 if no child node in the cache */ unsigned int lowest:1; /* is the extent buffer locked */ unsigned int locked:1; @@ -66,6 +70,16 @@ struct backref_node { unsigned int processed:1; /* have backrefs of this block been checked */ unsigned int checked:1; + /* + * 1 if corresponding block has been cowed but some upper + * level block pointers may not point to the new location + */ + unsigned int pending:1; + /* + * 1 if the backref node isn't connected to any other + * backref node. + */ + unsigned int detached:1; }; /* @@ -74,7 +88,6 @@ struct backref_node { struct backref_edge { struct list_head list[2]; struct backref_node *node[2]; - u64 blockptr; }; #define LOWER 0 @@ -83,9 +96,25 @@ struct backref_edge { struct backref_cache { /* red black tree of all backref nodes in the cache */ struct rb_root rb_root; - /* list of backref nodes with no child block in the cache */ + /* for passing backref nodes to btrfs_reloc_cow_block */ + struct backref_node *path[BTRFS_MAX_LEVEL]; + /* + * list of blocks that have been cowed but some block + * pointers in upper level blocks may not reflect the + * new location + */ struct list_head pending[BTRFS_MAX_LEVEL]; - spinlock_t lock; + /* list of backref nodes with no child node */ + struct list_head leaves; + /* list of blocks that have been cowed in current transaction */ + struct list_head changed; + /* list of detached backref node. */ + struct list_head detached; + + u64 last_trans; + + int nr_nodes; + int nr_edges; }; /* @@ -113,15 +142,6 @@ struct tree_block { unsigned int key_ready:1; }; -/* inode vector */ -#define INODEVEC_SIZE 16 - -struct inodevec { - struct list_head list; - struct inode *inode[INODEVEC_SIZE]; - int nr; -}; - #define MAX_EXTENTS 128 struct file_extent_cluster { @@ -138,36 +158,43 @@ struct reloc_control { struct btrfs_root *extent_root; /* inode for moving data */ struct inode *data_inode; - struct btrfs_workers workers; + + struct btrfs_block_rsv *block_rsv; + + struct backref_cache backref_cache; + + struct file_extent_cluster cluster; /* tree blocks have been processed */ struct extent_io_tree processed_blocks; /* map start of tree root to corresponding reloc tree */ struct mapping_tree reloc_root_tree; /* list of reloc trees */ struct list_head reloc_roots; + /* size of metadata reservation for merging reloc trees */ + u64 merging_rsv_size; + /* size of relocated tree nodes */ + u64 nodes_relocated; + u64 search_start; u64 extents_found; - u64 extents_skipped; - int stage; - int create_reloc_root; + + int block_rsv_retries; + + unsigned int stage:8; + unsigned int create_reloc_tree:1; + unsigned int merge_reloc_tree:1; unsigned int found_file_extent:1; - unsigned int found_old_snapshot:1; + unsigned int commit_transaction:1; }; /* stages of data relocation */ #define MOVE_DATA_EXTENTS 0 #define UPDATE_DATA_PTRS 1 -/* - * merge reloc tree to corresponding fs tree in worker threads - */ -struct async_merge { - struct btrfs_work work; - struct reloc_control *rc; - struct btrfs_root *root; - struct completion *done; - atomic_t *num_pending; -}; +static void remove_backref_node(struct backref_cache *cache, + struct backref_node *node); +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node); static void mapping_tree_init(struct mapping_tree *tree) { @@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache) cache->rb_root = RB_ROOT; for (i = 0; i < BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&cache->pending[i]); - spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->changed); + INIT_LIST_HEAD(&cache->detached); + INIT_LIST_HEAD(&cache->leaves); +} + +static void backref_cache_cleanup(struct backref_cache *cache) +{ + struct backref_node *node; + int i; + + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->leaves)) { + node = list_entry(cache->leaves.next, + struct backref_node, lower); + remove_backref_node(cache, node); + } + + cache->last_trans = 0; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) + BUG_ON(!list_empty(&cache->pending[i])); + BUG_ON(!list_empty(&cache->changed)); + BUG_ON(!list_empty(&cache->detached)); + BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); + BUG_ON(cache->nr_nodes); + BUG_ON(cache->nr_edges); +} + +static struct backref_node *alloc_backref_node(struct backref_cache *cache) +{ + struct backref_node *node; + + node = kzalloc(sizeof(*node), GFP_NOFS); + if (node) { + INIT_LIST_HEAD(&node->list); + INIT_LIST_HEAD(&node->upper); + INIT_LIST_HEAD(&node->lower); + RB_CLEAR_NODE(&node->rb_node); + cache->nr_nodes++; + } + return node; +} + +static void free_backref_node(struct backref_cache *cache, + struct backref_node *node) +{ + if (node) { + cache->nr_nodes--; + kfree(node); + } +} + +static struct backref_edge *alloc_backref_edge(struct backref_cache *cache) +{ + struct backref_edge *edge; + + edge = kzalloc(sizeof(*edge), GFP_NOFS); + if (edge) + cache->nr_edges++; + return edge; } -static void backref_node_init(struct backref_node *node) +static void free_backref_edge(struct backref_cache *cache, + struct backref_edge *edge) { - memset(node, 0, sizeof(*node)); - INIT_LIST_HEAD(&node->upper); - INIT_LIST_HEAD(&node->lower); - RB_CLEAR_NODE(&node->rb_node); + if (edge) { + cache->nr_edges--; + kfree(edge); + } } static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, @@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node, edges[idx++] = edge; node = edge->node[UPPER]; } + BUG_ON(node->detached); *index = idx; return node; } @@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[], return NULL; } +static void unlock_node_buffer(struct backref_node *node) +{ + if (node->locked) { + btrfs_tree_unlock(node->eb); + node->locked = 0; + } +} + static void drop_node_buffer(struct backref_node *node) { if (node->eb) { - if (node->locked) { - btrfs_tree_unlock(node->eb); - node->locked = 0; - } + unlock_node_buffer(node); free_extent_buffer(node->eb); node->eb = NULL; } @@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node) static void drop_backref_node(struct backref_cache *tree, struct backref_node *node) { - BUG_ON(!node->lowest); BUG_ON(!list_empty(&node->upper)); drop_node_buffer(node); + list_del(&node->list); list_del(&node->lower); - - rb_erase(&node->rb_node, &tree->rb_root); - kfree(node); + if (!RB_EMPTY_NODE(&node->rb_node)) + rb_erase(&node->rb_node, &tree->rb_root); + free_backref_node(tree, node); } /* @@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache, if (!node) return; - BUG_ON(!node->lowest); + BUG_ON(!node->lowest && !node->detached); while (!list_empty(&node->upper)) { edge = list_entry(node->upper.next, struct backref_edge, list[LOWER]); upper = edge->node[UPPER]; list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); - kfree(edge); + free_backref_edge(cache, edge); + + if (RB_EMPTY_NODE(&upper->rb_node)) { + BUG_ON(!list_empty(&node->upper)); + drop_backref_node(cache, node); + node = upper; + node->lowest = 1; + continue; + } /* - * add the node to pending list if no other + * add the node to leaf node list if no other * child block cached. */ if (list_empty(&upper->lower)) { - list_add_tail(&upper->lower, - &cache->pending[upper->level]); + list_add_tail(&upper->lower, &cache->leaves); upper->lowest = 1; } } + drop_backref_node(cache, node); } +static void update_backref_node(struct backref_cache *cache, + struct backref_node *node, u64 bytenr) +{ + struct rb_node *rb_node; + rb_erase(&node->rb_node, &cache->rb_root); + node->bytenr = bytenr; + rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); + BUG_ON(rb_node); +} + +/* + * update backref cache after a transaction commit + */ +static int update_backref_cache(struct btrfs_trans_handle *trans, + struct backref_cache *cache) +{ + struct backref_node *node; + int level = 0; + + if (cache->last_trans == 0) { + cache->last_trans = trans->transid; + return 0; + } + + if (cache->last_trans == trans->transid) + return 0; + + /* + * detached nodes are used to avoid unnecessary backref + * lookup. transaction commit changes the extent tree. + * so the detached nodes are no longer useful. + */ + while (!list_empty(&cache->detached)) { + node = list_entry(cache->detached.next, + struct backref_node, list); + remove_backref_node(cache, node); + } + + while (!list_empty(&cache->changed)) { + node = list_entry(cache->changed.next, + struct backref_node, list); + list_del_init(&node->list); + BUG_ON(node->pending); + update_backref_node(cache, node, node->new_bytenr); + } + + /* + * some nodes can be left in the pending list if there were + * errors during processing the pending nodes. + */ + for (level = 0; level < BTRFS_MAX_LEVEL; level++) { + list_for_each_entry(node, &cache->pending[level], list) { + BUG_ON(!node->pending); + if (node->bytenr == node->new_bytenr) + continue; + update_backref_node(cache, node, node->new_bytenr); + } + } + + cache->last_trans = 0; + return 1; +} + +static int should_ignore_root(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + + if (!root->ref_cows) + return 0; + + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; + + if (btrfs_root_last_snapshot(&reloc_root->root_item) == + root->fs_info->running_transaction->transid - 1) + return 0; + /* + * if there is reloc tree and it was created in previous + * transaction backref lookup can find the reloc tree, + * so backref node for the fs tree root is useless for + * relocation. + */ + return 1; +} + /* * find reloc tree by address of tree root */ @@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot, * for all upper level blocks that directly/indirectly reference the * block are also cached. */ -static struct backref_node *build_backref_tree(struct reloc_control *rc, - struct backref_cache *cache, - struct btrfs_key *node_key, - int level, u64 bytenr) +static noinline_for_stack +struct backref_node *build_backref_tree(struct reloc_control *rc, + struct btrfs_key *node_key, + int level, u64 bytenr) { + struct backref_cache *cache = &rc->backref_cache; struct btrfs_path *path1; struct btrfs_path *path2; struct extent_buffer *eb; @@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, unsigned long end; unsigned long ptr; LIST_HEAD(list); + LIST_HEAD(useless); + int cowonly; int ret; int err = 0; @@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc, goto out; } - node = kmalloc(sizeof(*node), GFP_NOFS); + node = alloc_backref_node(cache); if (!node) { err = -ENOMEM; goto out; } - backref_node_init(node); node->bytenr = bytenr; - node->owner = 0; node->level = level; node->lowest = 1; cur = node; @@ -587,17 +780,20 @@ again: #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || key.type == BTRFS_EXTENT_REF_V0_KEY) { - if (key.objectid == key.offset && - key.type == BTRFS_EXTENT_REF_V0_KEY) { + if (key.type == BTRFS_EXTENT_REF_V0_KEY) { struct btrfs_extent_ref_v0 *ref0; ref0 = btrfs_item_ptr(eb, path1->slots[0], struct btrfs_extent_ref_v0); root = find_tree_root(rc, eb, ref0); - if (root) - cur->root = root; - else - cur->old_root = 1; - break; + if (!root->ref_cows) + cur->cowonly = 1; + if (key.objectid == key.offset) { + if (root && !should_ignore_root(root)) + cur->root = root; + else + list_add(&cur->list, &useless); + break; + } } #else BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); @@ -614,22 +810,20 @@ again: break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; } rb_node = tree_search(&cache->rb_root, key.offset); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = key.offset; - upper->owner = 0; upper->level = cur->level + 1; /* * backrefs for the upper level block isn't @@ -639,11 +833,12 @@ again: } else { upper = rb_entry(rb_node, struct backref_node, rb_node); + BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } - list_add(&edge->list[LOWER], &cur->upper); - edge->node[UPPER] = upper; + list_add_tail(&edge->list[LOWER], &cur->upper); edge->node[LOWER] = cur; + edge->node[UPPER] = upper; goto next; } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { @@ -657,11 +852,17 @@ again: goto out; } + if (!root->ref_cows) + cur->cowonly = 1; + if (btrfs_root_level(&root->root_item) == cur->level) { /* tree root */ BUG_ON(btrfs_root_bytenr(&root->root_item) != cur->bytenr); - cur->root = root; + if (should_ignore_root(root)) + list_add(&cur->list, &useless); + else + cur->root = root; break; } @@ -692,11 +893,14 @@ again: if (!path2->nodes[level]) { BUG_ON(btrfs_root_bytenr(&root->root_item) != lower->bytenr); - lower->root = root; + if (should_ignore_root(root)) + list_add(&lower->list, &useless); + else + lower->root = root; break; } - edge = kzalloc(sizeof(*edge), GFP_NOFS); + edge = alloc_backref_edge(cache); if (!edge) { err = -ENOMEM; goto out; @@ -705,16 +909,17 @@ again: eb = path2->nodes[level]; rb_node = tree_search(&cache->rb_root, eb->start); if (!rb_node) { - upper = kmalloc(sizeof(*upper), GFP_NOFS); + upper = alloc_backref_node(cache); if (!upper) { - kfree(edge); + free_backref_edge(cache, edge); err = -ENOMEM; goto out; } - backref_node_init(upper); upper->bytenr = eb->start; upper->owner = btrfs_header_owner(eb); upper->level = lower->level + 1; + if (!root->ref_cows) + upper->cowonly = 1; /* * if we know the block isn't shared @@ -744,10 +949,12 @@ again: rb_node); BUG_ON(!upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); + if (!upper->owner) + upper->owner = btrfs_header_owner(eb); } list_add_tail(&edge->list[LOWER], &lower->upper); - edge->node[UPPER] = upper; edge->node[LOWER] = lower; + edge->node[UPPER] = upper; if (rb_node) break; @@ -785,8 +992,13 @@ next: * into the cache. */ BUG_ON(!node->checked); - rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); - BUG_ON(rb_node); + cowonly = node->cowonly; + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, node->bytenr, + &node->rb_node); + BUG_ON(rb_node); + list_add_tail(&node->lower, &cache->leaves); + } list_for_each_entry(edge, &node->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); @@ -795,6 +1007,14 @@ next: edge = list_entry(list.next, struct backref_edge, list[UPPER]); list_del_init(&edge->list[UPPER]); upper = edge->node[UPPER]; + if (upper->detached) { + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + continue; + } if (!RB_EMPTY_NODE(&upper->rb_node)) { if (upper->lowest) { @@ -807,25 +1027,69 @@ next: } BUG_ON(!upper->checked); - rb_node = tree_insert(&cache->rb_root, upper->bytenr, - &upper->rb_node); - BUG_ON(rb_node); + BUG_ON(cowonly != upper->cowonly); + if (!cowonly) { + rb_node = tree_insert(&cache->rb_root, upper->bytenr, + &upper->rb_node); + BUG_ON(rb_node); + } list_add_tail(&edge->list[UPPER], &upper->lower); list_for_each_entry(edge, &upper->upper, list[LOWER]) list_add_tail(&edge->list[UPPER], &list); } + /* + * process useless backref nodes. backref nodes for tree leaves + * are deleted from the cache. backref nodes for upper level + * tree blocks are left in the cache to avoid unnecessary backref + * lookup. + */ + while (!list_empty(&useless)) { + upper = list_entry(useless.next, struct backref_node, list); + list_del_init(&upper->list); + BUG_ON(!list_empty(&upper->upper)); + if (upper == node) + node = NULL; + if (upper->lowest) { + list_del_init(&upper->lower); + upper->lowest = 0; + } + while (!list_empty(&upper->lower)) { + edge = list_entry(upper->lower.next, + struct backref_edge, list[UPPER]); + list_del(&edge->list[UPPER]); + list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; + free_backref_edge(cache, edge); + + if (list_empty(&lower->upper)) + list_add(&lower->list, &useless); + } + __mark_block_processed(rc, upper); + if (upper->level > 0) { + list_add(&upper->list, &cache->detached); + upper->detached = 1; + } else { + rb_erase(&upper->rb_node, &cache->rb_root); + free_backref_node(cache, upper); + } + } out: btrfs_free_path(path1); btrfs_free_path(path2); if (err) { - INIT_LIST_HEAD(&list); + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, upper); + list_del_init(&lower->upper); + } upper = node; + INIT_LIST_HEAD(&list); while (upper) { if (RB_EMPTY_NODE(&upper->rb_node)) { list_splice_tail(&upper->upper, &list); - kfree(upper); + free_backref_node(cache, upper); } if (list_empty(&list)) @@ -833,15 +1097,104 @@ out: edge = list_entry(list.next, struct backref_edge, list[LOWER]); + list_del(&edge->list[LOWER]); upper = edge->node[UPPER]; - kfree(edge); + free_backref_edge(cache, edge); } return ERR_PTR(err); } + BUG_ON(node && node->detached); return node; } /* + * helper to add backref node for the newly created snapshot. + * the backref node is created by cloning backref node that + * corresponds to root of source tree + */ +static int clone_backref_node(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *src, + struct btrfs_root *dest) +{ + struct btrfs_root *reloc_root = src->reloc_root; + struct backref_cache *cache = &rc->backref_cache; + struct backref_node *node = NULL; + struct backref_node *new_node; + struct backref_edge *edge; + struct backref_edge *new_edge; + struct rb_node *rb_node; + + if (cache->last_trans > 0) + update_backref_cache(trans, cache); + + rb_node = tree_search(&cache->rb_root, src->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, rb_node); + if (node->detached) + node = NULL; + else + BUG_ON(node->new_bytenr != reloc_root->node->start); + } + + if (!node) { + rb_node = tree_search(&cache->rb_root, + reloc_root->commit_root->start); + if (rb_node) { + node = rb_entry(rb_node, struct backref_node, + rb_node); + BUG_ON(node->detached); + } + } + + if (!node) + return 0; + + new_node = alloc_backref_node(cache); + if (!new_node) + return -ENOMEM; + + new_node->bytenr = dest->node->start; + new_node->level = node->level; + new_node->lowest = node->lowest; + new_node->root = dest; + + if (!node->lowest) { + list_for_each_entry(edge, &node->lower, list[UPPER]) { + new_edge = alloc_backref_edge(cache); + if (!new_edge) + goto fail; + + new_edge->node[UPPER] = new_node; + new_edge->node[LOWER] = edge->node[LOWER]; + list_add_tail(&new_edge->list[UPPER], + &new_node->lower); + } + } + + rb_node = tree_insert(&cache->rb_root, new_node->bytenr, + &new_node->rb_node); + BUG_ON(rb_node); + + if (!new_node->lowest) { + list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) { + list_add_tail(&new_edge->list[LOWER], + &new_edge->node[LOWER]->upper); + } + } + return 0; +fail: + while (!list_empty(&new_node->lower)) { + new_edge = list_entry(new_node->lower.next, + struct backref_edge, list[UPPER]); + list_del(&new_edge->list[UPPER]); + free_backref_edge(cache, new_edge); + } + free_backref_node(cache, new_node); + return -ENOMEM; +} + +/* * helper to add 'address of tree root -> reloc tree' mapping */ static int __add_reloc_root(struct btrfs_root *root) @@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del) return 0; } -/* - * create reloc tree for a given fs tree. reloc tree is just a - * snapshot of the fs tree with special root objectid. - */ -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) { struct btrfs_root *reloc_root; struct extent_buffer *eb; @@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_key root_key; int ret; - if (root->reloc_root) { - reloc_root = root->reloc_root; - reloc_root->last_trans = trans->transid; - return 0; - } - - if (!root->fs_info->reloc_ctl || - !root->fs_info->reloc_ctl->create_reloc_root || - root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) - return 0; - root_item = kmalloc(sizeof(*root_item), GFP_NOFS); BUG_ON(!root_item); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; - root_key.offset = root->root_key.objectid; + root_key.offset = objectid; - ret = btrfs_copy_root(trans, root, root->commit_root, &eb, - BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (root->root_key.objectid == objectid) { + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + btrfs_set_root_last_snapshot(&root->root_item, + trans->transid - 1); + } else { + /* + * called by btrfs_reloc_post_snapshot_hook. + * the source tree is a reloc tree, all tree blocks + * modified after it was created have RELOC flag + * set in their headers. so it's OK to not update + * the 'last_snapshot'. + */ + ret = btrfs_copy_root(trans, root, root->node, &eb, + BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + } - btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1); memcpy(root_item, &root->root_item, sizeof(*root_item)); - btrfs_set_root_refs(root_item, 1); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); btrfs_set_root_generation(root_item, trans->transid); - memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); - root_item->drop_level = 0; + + if (root->root_key.objectid == objectid) { + btrfs_set_root_refs(root_item, 0); + memset(&root_item->drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root_item->drop_level = 0; + } btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; + return reloc_root; +} + +/* + * create reloc tree for a given fs tree. reloc tree is just a + * snapshot of the fs tree with special root objectid. + */ +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + int clear_rsv = 0; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + reloc_root->last_trans = trans->transid; + return 0; + } + + if (!rc || !rc->create_reloc_tree || + root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!trans->block_rsv) { + trans->block_rsv = rc->block_rsv; + clear_rsv = 1; + } + reloc_root = create_reloc_root(trans, root, root->root_key.objectid); + if (clear_rsv) + trans->block_rsv = NULL; __add_reloc_root(reloc_root); root->reloc_root = reloc_root; @@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; - if (btrfs_root_refs(root_item) == 0) { + if (root->fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; del = 1; } @@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr, goto out; } - if (new_bytenr) - *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ret = 0; out: btrfs_free_path(path); @@ -1114,19 +1503,18 @@ out: * update file extent items in the tree leaf to point to * the new locations. */ -static int replace_file_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root, - struct extent_buffer *leaf, - struct list_head *inode_list) +static noinline_for_stack +int replace_file_extents(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root, + struct extent_buffer *leaf) { struct btrfs_key key; struct btrfs_file_extent_item *fi; struct inode *inode = NULL; - struct inodevec *ivec = NULL; u64 parent; u64 bytenr; - u64 new_bytenr; + u64 new_bytenr = 0; u64 num_bytes; u64 end; u32 nritems; @@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { - if (!ivec || ivec->nr == INODEVEC_SIZE) { - ivec = kmalloc(sizeof(*ivec), GFP_NOFS); - BUG_ON(!ivec); - ivec->nr = 0; - list_add_tail(&ivec->list, inode_list); - } if (first) { inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; first = 0; } else if (inode && inode->i_ino < key.objectid) { + btrfs_add_delayed_iput(inode); inode = find_next_inode(root, key.objectid); - if (inode) - ivec->inode[ivec->nr++] = inode; } if (inode && inode->i_ino == key.objectid) { end = key.offset + @@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, ret = get_new_location(rc->data_inode, &new_bytenr, bytenr, num_bytes); - if (ret > 0) + if (ret > 0) { + WARN_ON(1); continue; + } BUG_ON(ret < 0); btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); @@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans, } if (dirty) btrfs_mark_buffer_dirty(leaf); + if (inode) + btrfs_add_delayed_iput(inode); return 0; } @@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot, * if no block got replaced, 0 is returned. if there are other * errors, a negative error number is returned. */ -static int replace_path(struct btrfs_trans_handle *trans, - struct btrfs_root *dest, struct btrfs_root *src, - struct btrfs_path *path, struct btrfs_key *next_key, - struct extent_buffer **leaf, - int lowest_level, int max_level) +static noinline_for_stack +int replace_path(struct btrfs_trans_handle *trans, + struct btrfs_root *dest, struct btrfs_root *src, + struct btrfs_path *path, struct btrfs_key *next_key, + int lowest_level, int max_level) { struct extent_buffer *eb; struct extent_buffer *parent; @@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans, u64 new_ptr_gen; u64 last_snapshot; u32 blocksize; + int cow = 0; int level; int ret; int slot; BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(lowest_level > 1 && leaf); last_snapshot = btrfs_root_last_snapshot(&src->root_item); - +again: slot = path->slots[lowest_level]; btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); @@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans, return 0; } - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); - BUG_ON(ret); + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + BUG_ON(ret); + } btrfs_set_lock_blocking(eb); if (next_key) { @@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans, if (new_bytenr == 0 || old_ptr_gen > last_snapshot || memcmp_node_keys(parent, slot, path, level)) { - if (level <= lowest_level && !leaf) { + if (level <= lowest_level) { ret = 0; break; } @@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans, eb = read_tree_block(dest, old_bytenr, blocksize, old_ptr_gen); btrfs_tree_lock(eb); - ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); - BUG_ON(ret); - btrfs_set_lock_blocking(eb); - - if (level <= lowest_level) { - *leaf = eb; - ret = 0; - break; + if (cow) { + ret = btrfs_cow_block(trans, dest, eb, parent, + slot, &eb); + BUG_ON(ret); } + btrfs_set_lock_blocking(eb); btrfs_tree_unlock(parent); free_extent_buffer(parent); @@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans, continue; } + if (!cow) { + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + cow = 1; + goto again; + } + btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); btrfs_release_path(src, path); @@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root, return 0; } -static void put_inodes(struct list_head *list) -{ - struct inodevec *ivec; - while (!list_empty(list)) { - ivec = list_entry(list->next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } -} - static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; struct btrfs_path *path; - struct extent_buffer *leaf = NULL; + struct extent_buffer *leaf; unsigned long nr; int level; int max_level; int replaced = 0; int ret; int err = 0; + u32 min_reserved; path = btrfs_alloc_path(); if (!path) @@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { - trans = btrfs_start_transaction(root, 1); + min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + memset(&next_key, 0, sizeof(next_key)); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, 0); - btrfs_release_path(reloc_root, path); + while (1) { + trans = btrfs_start_transaction(root, 0); + trans->block_rsv = rc->block_rsv; - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto out; + ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, + min_reserved, 0); + if (ret) { + BUG_ON(ret != -EAGAIN); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + continue; } - leaf = path->nodes[0]; - btrfs_unlock_up_safe(path, 1); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - if (ret < 0) - err = ret; - goto out; - } - - memset(&next_key, 0, sizeof(next_key)); - - while (1) { - leaf = NULL; replaced = 0; - trans = btrfs_start_transaction(root, 1); max_level = level; ret = walk_down_reloc_tree(reloc_root, path, &level); @@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, if (!find_next_key(path, level, &key) && btrfs_comp_cpu_keys(&next_key, &key) >= 0) { ret = 0; - } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_path(trans, root, reloc_root, - path, &next_key, &leaf, - level, max_level); } else { - ret = replace_path(trans, root, reloc_root, - path, &next_key, NULL, - level, max_level); + ret = replace_path(trans, root, reloc_root, path, + &next_key, level, max_level); } if (ret < 0) { err = ret; @@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_node_key_to_cpu(path->nodes[level], &key, path->slots[level]); replaced = 1; - } else if (leaf) { - /* - * no block got replaced, try replacing file extents - */ - btrfs_item_key_to_cpu(leaf, &key, 0); - ret = replace_file_extents(trans, rc, root, leaf, - &inode_list); - btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - BUG_ON(ret < 0); } ret = walk_up_reloc_tree(reloc_root, path, &level); @@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, root_item->drop_level = level; nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); - /* - * put inodes outside transaction, otherwise we may deadlock. - */ - put_inodes(&inode_list); - if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } @@ -1765,87 +2109,125 @@ out: sizeof(root_item->drop_progress)); root_item->drop_level = 0; btrfs_set_root_refs(root_item, 0); + btrfs_update_reloc_root(trans, root); } nr = trans->blocks_used; - btrfs_end_transaction(trans, root); + btrfs_end_transaction_throttle(trans, root); btrfs_btree_balance_dirty(root, nr); - put_inodes(&inode_list); - if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); return err; } -/* - * callback for the work threads. - * this function merges reloc tree with corresponding fs tree, - * and then drops the reloc tree. - */ -static void merge_func(struct btrfs_work *work) +static noinline_for_stack +int prepare_to_merge(struct reloc_control *rc, int err) { - struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = rc->extent_root; struct btrfs_root *reloc_root; - struct async_merge *async; + struct btrfs_trans_handle *trans; + LIST_HEAD(reloc_roots); + u64 num_bytes = 0; + int ret; + int retries = 0; + + mutex_lock(&root->fs_info->trans_mutex); + rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + rc->merging_rsv_size += rc->nodes_relocated * 2; + mutex_unlock(&root->fs_info->trans_mutex); +again: + if (!err) { + num_bytes = rc->merging_rsv_size; + ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, + num_bytes, &retries); + if (ret) + err = ret; + } + + trans = btrfs_join_transaction(rc->extent_root, 1); + + if (!err) { + if (num_bytes != rc->merging_rsv_size) { + btrfs_end_transaction(trans, rc->extent_root); + btrfs_block_rsv_release(rc->extent_root, + rc->block_rsv, num_bytes); + retries = 0; + goto again; + } + } - async = container_of(work, struct async_merge, work); - reloc_root = async->root; + rc->merge_reloc_tree = 1; + + while (!list_empty(&rc->reloc_roots)) { + reloc_root = list_entry(rc->reloc_roots.next, + struct btrfs_root, root_list); + list_del_init(&reloc_root->root_list); - if (btrfs_root_refs(&reloc_root->root_item) > 0) { root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); - merge_reloc_root(async->rc, root); - - trans = btrfs_start_transaction(root, 1); + /* + * set reference count to 1, so btrfs_recover_relocation + * knows it should resumes merging + */ + if (!err) + btrfs_set_root_refs(&reloc_root->root_item, 1); btrfs_update_reloc_root(trans, root); - btrfs_end_transaction(trans, root); - } - btrfs_drop_snapshot(reloc_root, 0); + list_add(&reloc_root->root_list, &reloc_roots); + } - if (atomic_dec_and_test(async->num_pending)) - complete(async->done); + list_splice(&reloc_roots, &rc->reloc_roots); - kfree(async); + if (!err) + btrfs_commit_transaction(trans, rc->extent_root); + else + btrfs_end_transaction(trans, rc->extent_root); + return err; } -static int merge_reloc_roots(struct reloc_control *rc) +static noinline_for_stack +int merge_reloc_roots(struct reloc_control *rc) { - struct async_merge *async; struct btrfs_root *root; - struct completion done; - atomic_t num_pending; + struct btrfs_root *reloc_root; + LIST_HEAD(reloc_roots); + int found = 0; + int ret; +again: + root = rc->extent_root; + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&rc->reloc_roots, &reloc_roots); + mutex_unlock(&root->fs_info->trans_mutex); - init_completion(&done); - atomic_set(&num_pending, 1); + while (!list_empty(&reloc_roots)) { + found = 1; + reloc_root = list_entry(reloc_roots.next, + struct btrfs_root, root_list); - while (!list_empty(&rc->reloc_roots)) { - root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); - list_del_init(&root->root_list); + if (btrfs_root_refs(&reloc_root->root_item) > 0) { + root = read_fs_root(reloc_root->fs_info, + reloc_root->root_key.offset); + BUG_ON(IS_ERR(root)); + BUG_ON(root->reloc_root != reloc_root); - async = kmalloc(sizeof(*async), GFP_NOFS); - BUG_ON(!async); - async->work.func = merge_func; - async->work.flags = 0; - async->rc = rc; - async->root = root; - async->done = &done; - async->num_pending = &num_pending; - atomic_inc(&num_pending); - btrfs_queue_worker(&rc->workers, &async->work); + ret = merge_reloc_root(rc, root); + BUG_ON(ret); + } else { + list_del_init(&reloc_root->root_list); + } + btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); } - if (!atomic_dec_and_test(&num_pending)) - wait_for_completion(&done); - + if (found) { + found = 0; + goto again; + } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); return 0; } @@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return btrfs_record_root_in_trans(trans, root); } -/* - * select one tree from trees that references the block. - * for blocks in refernce counted trees, we preper reloc tree. - * if no reloc tree found and reloc_only is true, NULL is returned. - */ -static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], - int *nr, int reloc_only) +static noinline_for_stack +struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node, + struct backref_edge *edges[], int *nr) { struct backref_node *next; struct btrfs_root *root; - int index; - int loop = 0; -again: - index = 0; + int index = 0; + next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - if (!root) { - BUG_ON(!node->old_root); - goto skip; - } - - /* no other choice for non-refernce counted tree */ - if (!root->ref_cows) { - BUG_ON(reloc_only); - break; - } + BUG_ON(!root); + BUG_ON(!root->ref_cows); if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { record_reloc_root_in_trans(trans, root); break; } - if (loop) { - btrfs_record_root_in_trans(trans, root); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + + if (next->new_bytenr != root->node->start) { + BUG_ON(next->new_bytenr); + BUG_ON(!list_empty(&next->list)); + next->new_bytenr = root->node->start; + next->root = root; + list_add_tail(&next->list, + &rc->backref_cache.changed); + __mark_block_processed(rc, next); break; } - if (reloc_only || next != node) { - if (!root->reloc_root) - btrfs_record_root_in_trans(trans, root); - root = root->reloc_root; - /* - * if the reloc tree was created in current - * transation, there is no node in backref tree - * corresponds to the root of the reloc tree. - */ - if (btrfs_root_last_snapshot(&root->root_item) == - trans->transid - 1) - break; - } -skip: + WARN_ON(1); root = NULL; next = walk_down_backref(edges, &index); if (!next || next->level <= node->level) break; } + if (!root) + return NULL; - if (!root && !loop && !reloc_only) { - loop = 1; - goto again; + *nr = index; + next = node; + /* setup backref node path for btrfs_reloc_cow_block */ + while (1) { + rc->backref_cache.path[next->level] = next; + if (--index < 0) + break; + next = edges[index]->node[UPPER]; } - - if (root) - *nr = index; - else - *nr = 0; - return root; } +/* + * select a tree root for relocation. return NULL if the block + * is reference counted. we should use do_relocation() in this + * case. return a tree root pointer if the block isn't reference + * counted. return -ENOENT if the block is root of reloc tree. + */ static noinline_for_stack struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, struct backref_node *node) { + struct backref_node *next; + struct btrfs_root *root; + struct btrfs_root *fs_root = NULL; struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; - int nr; - return __select_one_root(trans, node, edges, &nr, 0); + int index = 0; + + next = node; + while (1) { + cond_resched(); + next = walk_up_backref(next, edges, &index); + root = next->root; + BUG_ON(!root); + + /* no other choice for non-refernce counted tree */ + if (!root->ref_cows) + return root; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) + fs_root = root; + + if (next != node) + return NULL; + + next = walk_down_backref(edges, &index); + if (!next || next->level <= node->level) + break; + } + + if (!fs_root) + return ERR_PTR(-ENOENT); + return fs_root; } static noinline_for_stack -struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, - struct backref_node *node, - struct backref_edge *edges[], int *nr) +u64 calcu_metadata_size(struct reloc_control *rc, + struct backref_node *node, int reserve) { - return __select_one_root(trans, node, edges, nr, 1); + struct backref_node *next = node; + struct backref_edge *edge; + struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; + u64 num_bytes = 0; + int index = 0; + + BUG_ON(reserve && node->processed); + + while (next) { + cond_resched(); + while (1) { + if (next->processed && (reserve || next != node)) + break; + + num_bytes += btrfs_level_size(rc->extent_root, + next->level); + + if (list_empty(&next->upper)) + break; + + edge = list_entry(next->upper.next, + struct backref_edge, list[LOWER]); + edges[index++] = edge; + next = edge->node[UPPER]; + } + next = walk_down_backref(edges, &index); + } + return num_bytes; } -static void grab_path_buffers(struct btrfs_path *path, - struct backref_node *node, - struct backref_edge *edges[], int nr) +static int reserve_metadata_space(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct backref_node *node) { - int i = 0; - while (1) { - drop_node_buffer(node); - node->eb = path->nodes[node->level]; - BUG_ON(!node->eb); - if (path->locks[node->level]) - node->locked = 1; - path->nodes[node->level] = NULL; - path->locks[node->level] = 0; - - if (i >= nr) - break; + struct btrfs_root *root = rc->extent_root; + u64 num_bytes; + int ret; + + num_bytes = calcu_metadata_size(rc, node, 1) * 2; - edges[i]->blockptr = node->eb->start; - node = edges[i]->node[UPPER]; - i++; + trans->block_rsv = rc->block_rsv; + ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, + &rc->block_rsv_retries); + if (ret) { + if (ret == -EAGAIN) + rc->commit_transaction = 1; + return ret; } + + rc->block_rsv_retries = 0; + return 0; +} + +static void release_metadata_space(struct reloc_control *rc, + struct backref_node *node) +{ + u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2; + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes); } /* @@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path, * in that case this function just updates pointers. */ static int do_relocation(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_key *key, struct btrfs_path *path, int lowest) @@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans, BUG_ON(lowest && node->eb); path->lowest_level = node->level + 1; + rc->backref_cache.path[node->level] = node; list_for_each_entry(edge, &node->upper, list[LOWER]) { cond_resched(); - if (node->eb && node->eb->start == edge->blockptr) - continue; upper = edge->node[UPPER]; - root = select_reloc_root(trans, upper, edges, &nr); - if (!root) - continue; - - if (upper->eb && !upper->locked) + root = select_reloc_root(trans, rc, upper, edges, &nr); + BUG_ON(!root); + + if (upper->eb && !upper->locked) { + if (!lowest) { + ret = btrfs_bin_search(upper->eb, key, + upper->level, &slot); + BUG_ON(ret); + bytenr = btrfs_node_blockptr(upper->eb, slot); + if (node->eb->start == bytenr) + goto next; + } drop_node_buffer(upper); + } if (!upper->eb) { ret = btrfs_search_slot(trans, root, key, path, 0, 1); @@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans, } BUG_ON(ret > 0); - slot = path->slots[upper->level]; + if (!upper->eb) { + upper->eb = path->nodes[upper->level]; + path->nodes[upper->level] = NULL; + } else { + BUG_ON(upper->eb != path->nodes[upper->level]); + } - btrfs_unlock_up_safe(path, upper->level + 1); - grab_path_buffers(path, upper, edges, nr); + upper->locked = 1; + path->locks[upper->level] = 0; + slot = path->slots[upper->level]; btrfs_release_path(NULL, path); } else { ret = btrfs_bin_search(upper->eb, key, upper->level, @@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(upper->eb, slot); - if (!lowest) { - if (node->eb->start == bytenr) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - continue; - } + if (lowest) { + BUG_ON(bytenr != node->bytenr); } else { - BUG_ON(node->bytenr != bytenr); + if (node->eb->start == bytenr) + goto next; } blocksize = btrfs_level_size(root, node->level); @@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, slot, &eb); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); if (ret < 0) { err = ret; - break; + goto next; } - btrfs_set_lock_blocking(eb); - node->eb = eb; - node->locked = 1; + BUG_ON(node->eb != eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans, ret = btrfs_drop_subtree(trans, root, eb, upper->eb); BUG_ON(ret); } - if (!lowest) { - btrfs_tree_unlock(upper->eb); - upper->locked = 0; - } +next: + if (!upper->pending) + drop_node_buffer(upper); + else + unlock_node_buffer(upper); + if (err) + break; } + + if (!err && node->pending) { + drop_node_buffer(node); + list_move_tail(&node->list, &rc->backref_cache.changed); + node->pending = 0; + } + path->lowest_level = 0; + BUG_ON(err == -ENOSPC); return err; } static int link_to_upper(struct btrfs_trans_handle *trans, + struct reloc_control *rc, struct backref_node *node, struct btrfs_path *path) { struct btrfs_key key; - if (!node->eb || list_empty(&node->upper)) - return 0; btrfs_node_key_to_cpu(node->eb, &key, 0); - return do_relocation(trans, node, &key, path, 0); + return do_relocation(trans, rc, node, &key, path, 0); } static int finish_pending_nodes(struct btrfs_trans_handle *trans, - struct backref_cache *cache, - struct btrfs_path *path) + struct reloc_control *rc, + struct btrfs_path *path, int err) { + LIST_HEAD(list); + struct backref_cache *cache = &rc->backref_cache; struct backref_node *node; int level; int ret; - int err = 0; for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { node = list_entry(cache->pending[level].next, - struct backref_node, lower); - BUG_ON(node->level != level); + struct backref_node, list); + list_move_tail(&node->list, &list); + BUG_ON(!node->pending); - ret = link_to_upper(trans, node, path); - if (ret < 0) - err = ret; - /* - * this remove the node from the pending list and - * may add some other nodes to the level + 1 - * pending list - */ - remove_backref_node(cache, node); + if (!err) { + ret = link_to_upper(trans, rc, node, path); + if (ret < 0) + err = ret; + } } + list_splice_init(&list, &cache->pending[level]); } - BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root)); return err; } static void mark_block_processed(struct reloc_control *rc, - struct backref_node *node) + u64 bytenr, u32 blocksize) +{ + set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, + EXTENT_DIRTY, GFP_NOFS); +} + +static void __mark_block_processed(struct reloc_control *rc, + struct backref_node *node) { u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { blocksize = btrfs_level_size(rc->extent_root, node->level); - set_extent_bits(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, - GFP_NOFS); + mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; } @@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc, if (next->processed) break; - mark_block_processed(rc, next); + __mark_block_processed(rc, next); if (list_empty(&next->upper)) break; @@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize, return 0; } -/* - * check if there are any file extent pointers in the leaf point to - * data require processing - */ -static int check_file_extents(struct reloc_control *rc, - u64 bytenr, u32 blocksize, u64 ptr_gen) -{ - struct btrfs_key found_key; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; - u32 nritems; - int i; - int ret = 0; - - leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen); - - nritems = btrfs_header_nritems(leaf); - for (i = 0; i < nritems; i++) { - cond_resched(); - btrfs_item_key_to_cpu(leaf, &found_key, i); - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - continue; - fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, fi) == - BTRFS_FILE_EXTENT_INLINE) - continue; - bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (bytenr == 0) - continue; - if (in_block_group(bytenr, rc->block_group)) { - ret = 1; - break; - } - } - free_extent_buffer(leaf); - return ret; -} - -/* - * scan child blocks of a given block to find blocks require processing - */ -static int add_child_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_node *node, - struct rb_root *blocks) -{ - struct tree_block *block; - struct rb_node *rb_node; - u64 bytenr; - u64 ptr_gen; - u32 blocksize; - u32 nritems; - int i; - int err = 0; - - nritems = btrfs_header_nritems(node->eb); - blocksize = btrfs_level_size(rc->extent_root, node->level - 1); - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - - readahead_tree_block(rc->extent_root, - bytenr, blocksize, ptr_gen); - } - - for (i = 0; i < nritems; i++) { - cond_resched(); - bytenr = btrfs_node_blockptr(node->eb, i); - ptr_gen = btrfs_node_ptr_generation(node->eb, i); - if (ptr_gen == trans->transid) - continue; - if (!in_block_group(bytenr, rc->block_group) && - (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS)) - continue; - if (tree_block_processed(bytenr, blocksize, rc)) - continue; - if (!in_block_group(bytenr, rc->block_group) && - !check_file_extents(rc, bytenr, blocksize, ptr_gen)) - continue; - - block = kmalloc(sizeof(*block), GFP_NOFS); - if (!block) { - err = -ENOMEM; - break; - } - block->bytenr = bytenr; - btrfs_node_key_to_cpu(node->eb, &block->key, i); - block->level = node->level - 1; - block->key_ready = 1; - rb_node = tree_insert(blocks, block->bytenr, &block->rb_node); - BUG_ON(rb_node); - } - if (err) - free_block_list(blocks); - return err; -} - -/* - * find adjacent blocks require processing - */ -static noinline_for_stack -int add_adjacent_blocks(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct backref_cache *cache, - struct rb_root *blocks, int level, - struct backref_node **upper) -{ - struct backref_node *node; - int ret = 0; - - WARN_ON(!list_empty(&cache->pending[level])); - - if (list_empty(&cache->pending[level + 1])) - return 1; - - node = list_entry(cache->pending[level + 1].next, - struct backref_node, lower); - if (node->eb) - ret = add_child_blocks(trans, rc, node, blocks); - - *upper = node; - return ret; -} - static int get_tree_block_key(struct reloc_control *rc, struct tree_block *block) { @@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, struct btrfs_path *path) { struct btrfs_root *root; - int ret; + int release = 0; + int ret = 0; + if (!node) + return 0; + + BUG_ON(node->processed); root = select_one_root(trans, node); - if (unlikely(!root)) { - rc->found_old_snapshot = 1; + if (root == ERR_PTR(-ENOENT)) { update_processed_blocks(rc, node); - return 0; + goto out; } - if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - ret = do_relocation(trans, node, key, path, 1); - if (ret < 0) - goto out; - if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) { - ret = replace_file_extents(trans, rc, root, - node->eb, NULL); - if (ret < 0) - goto out; - } - drop_node_buffer(node); - } else if (!root->ref_cows) { - path->lowest_level = node->level; - ret = btrfs_search_slot(trans, root, key, path, 0, 1); - btrfs_release_path(root, path); - if (ret < 0) + if (!root || root->ref_cows) { + ret = reserve_metadata_space(trans, rc, node); + if (ret) goto out; - } else if (root != node->root) { - WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS); + release = 1; } - update_processed_blocks(rc, node); - ret = 0; + if (root) { + if (root->ref_cows) { + BUG_ON(node->new_bytenr); + BUG_ON(!list_empty(&node->list)); + btrfs_record_root_in_trans(trans, root); + root = root->reloc_root; + node->new_bytenr = root->node->start; + node->root = root; + list_add_tail(&node->list, &rc->backref_cache.changed); + } else { + path->lowest_level = node->level; + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + btrfs_release_path(root, path); + if (ret > 0) + ret = 0; + } + if (!ret) + update_processed_blocks(rc, node); + } else { + ret = do_relocation(trans, rc, node, key, path, 1); + } out: - drop_node_buffer(node); + if (ret || node->level == 0 || node->cowonly) { + if (release) + release_metadata_space(rc, node); + remove_backref_node(&rc->backref_cache, node); + } return ret; } @@ -2415,12 +2752,10 @@ static noinline_for_stack int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { - struct backref_cache *cache; struct backref_node *node; struct btrfs_path *path; struct tree_block *block; struct rb_node *rb_node; - int level = -1; int ret; int err = 0; @@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - cache = kmalloc(sizeof(*cache), GFP_NOFS); - if (!cache) { - btrfs_free_path(path); - return -ENOMEM; - } - - backref_cache_init(cache); - rb_node = rb_first(blocks); while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - if (level == -1) - level = block->level; - else - BUG_ON(level != block->level); if (!block->key_ready) reada_tree_block(rc, block); rb_node = rb_next(rb_node); @@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); - node = build_backref_tree(rc, cache, &block->key, + node = build_backref_tree(rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, ret = relocate_tree_block(trans, rc, node, &block->key, path); if (ret < 0) { - err = ret; + if (ret != -EAGAIN || rb_node == rb_first(blocks)) + err = ret; goto out; } - remove_backref_node(cache, node); rb_node = rb_next(rb_node); } - - if (level > 0) - goto out; - +out: free_block_list(blocks); + err = finish_pending_nodes(trans, rc, path, err); - /* - * now backrefs of some upper level tree blocks have been cached, - * try relocating blocks referenced by these upper level blocks. - */ - while (1) { - struct backref_node *upper = NULL; - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - break; + btrfs_free_path(path); + return err; +} - ret = add_adjacent_blocks(trans, rc, cache, blocks, level, - &upper); - if (ret < 0) - err = ret; - if (ret != 0) - break; +static noinline_for_stack +int prealloc_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + u64 alloc_hint = 0; + u64 start; + u64 end; + u64 offset = BTRFS_I(inode)->index_cnt; + u64 num_bytes; + int nr = 0; + int ret = 0; - rb_node = rb_first(blocks); - while (rb_node) { - block = rb_entry(rb_node, struct tree_block, rb_node); - if (trans->transaction->in_commit || - trans->transaction->delayed_refs.flushing) - goto out; - BUG_ON(!block->key_ready); - node = build_backref_tree(rc, cache, &block->key, - level, block->bytenr); - if (IS_ERR(node)) { - err = PTR_ERR(node); - goto out; - } + BUG_ON(cluster->start != cluster->boundary[0]); + mutex_lock(&inode->i_mutex); - ret = relocate_tree_block(trans, rc, node, - &block->key, path); - if (ret < 0) { - err = ret; - goto out; - } - remove_backref_node(cache, node); - rb_node = rb_next(rb_node); - } - free_block_list(blocks); + ret = btrfs_check_data_free_space(inode, cluster->end + + 1 - cluster->start); + if (ret) + goto out; - if (upper) { - ret = link_to_upper(trans, upper, path); - if (ret < 0) { - err = ret; - break; - } - remove_backref_node(cache, upper); - } + while (nr < cluster->nr) { + start = cluster->boundary[nr] - offset; + if (nr + 1 < cluster->nr) + end = cluster->boundary[nr + 1] - 1 - offset; + else + end = cluster->end - offset; + + lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + num_bytes = end + 1 - start; + ret = btrfs_prealloc_file_range(inode, 0, start, + num_bytes, num_bytes, + end + 1, &alloc_hint); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS); + if (ret) + break; + nr++; } + btrfs_free_reserved_data_space(inode, cluster->end + + 1 - cluster->start); out: - free_block_list(blocks); - - ret = finish_pending_nodes(trans, cache, path); - if (ret < 0) - err = ret; - - kfree(cache); - btrfs_free_path(path); - return err; + mutex_unlock(&inode->i_mutex); + return ret; } static noinline_for_stack @@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode, u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; - unsigned int dirty_page = 0; struct page *page; struct file_ra_state *ra; int nr = 0; @@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!ra) return -ENOMEM; - index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; - last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; + ret = prealloc_file_extent_cluster(inode, cluster); + if (ret) + goto out; - mutex_lock(&inode->i_mutex); + file_ra_state_init(ra, inode->i_mapping); - i_size_write(inode, cluster->end + 1 - offset); ret = setup_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) - goto out_unlock; - - file_ra_state_init(ra, inode->i_mapping); + goto out; - WARN_ON(cluster->start != cluster->boundary[0]); + index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; + last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; while (index <= last_index) { + ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); + if (ret) + goto out; + page = find_lock_page(inode->i_mapping, index); if (!page) { page_cache_sync_readahead(inode->i_mapping, @@ -2623,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode, last_index + 1 - index); page = grab_cache_page(inode->i_mapping, index); if (!page) { + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -ENOMEM; - goto out_unlock; + goto out; } } @@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!PageUptodate(page)) { unlock_page(page); page_cache_release(page); + btrfs_delalloc_release_metadata(inode, + PAGE_CACHE_SIZE); ret = -EIO; - goto out_unlock; + goto out; } } @@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode, EXTENT_BOUNDARY, GFP_NOFS); nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); + btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); set_page_dirty(page); - dirty_page++; unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); @@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode, page_cache_release(page); index++; - if (nr < cluster->nr && - page_end + 1 + offset == cluster->boundary[nr]) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); - dirty_page = 0; - } - } - if (dirty_page) { - balance_dirty_pages_ratelimited_nr(inode->i_mapping, - dirty_page); + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(BTRFS_I(inode)->root); } WARN_ON(nr != cluster->nr); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: kfree(ra); return ret; } @@ -2870,9 +3172,6 @@ out: static int block_use_full_backref(struct reloc_control *rc, struct extent_buffer *eb) { - struct btrfs_path *path; - struct btrfs_extent_item *ei; - struct btrfs_key key; u64 flags; int ret; @@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc, btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) return 1; - path = btrfs_alloc_path(); - BUG_ON(!path); - - key.objectid = eb->start; - key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = eb->len; - - path->search_commit_root = 1; - path->skip_locking = 1; - ret = btrfs_search_slot(NULL, rc->extent_root, - &key, path, 0, 0); + ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + eb->start, eb->len, NULL, &flags); BUG_ON(ret); - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_extent_item); - flags = btrfs_extent_flags(path->nodes[0], ei); - BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)); if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) ret = 1; else ret = 0; - btrfs_free_path(path); return ret; } @@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize; + u32 blocksize = btrfs_level_size(rc->extent_root, 0); int ret; int err = 0; - ret = get_new_location(rc->data_inode, NULL, extent_key->objectid, - extent_key->offset); - BUG_ON(ret < 0); - if (ret > 0) { - /* the relocated data is fragmented */ - rc->extents_skipped++; - btrfs_release_path(rc->extent_root, path); - return 0; - } - - blocksize = btrfs_level_size(rc->extent_root, 0); - eb = path->nodes[0]; ptr = btrfs_item_ptr_offset(eb, path->slots[0]); end = ptr + btrfs_item_size_nr(eb, path->slots[0]); @@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc, */ static noinline_for_stack int find_next_extent(struct btrfs_trans_handle *trans, - struct reloc_control *rc, struct btrfs_path *path) + struct reloc_control *rc, struct btrfs_path *path, + struct btrfs_key *extent_key) { struct btrfs_key key; struct extent_buffer *leaf; @@ -3225,6 +3499,7 @@ next: rc->search_start = end + 1; } else { rc->search_start = key.objectid + key.offset; + memcpy(extent_key, &key, sizeof(key)); return 0; } } @@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags) return 0; } +static noinline_for_stack +int prepare_to_relocate(struct reloc_control *rc) +{ + struct btrfs_trans_handle *trans; + int ret; + + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root); + if (!rc->block_rsv) + return -ENOMEM; + + /* + * reserve some space for creating reloc trees. + * btrfs_init_reloc_root will use them when there + * is no reservation in transaction handle. + */ + ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, + rc->extent_root->nodesize * 256, + &rc->block_rsv_retries); + if (ret) + return ret; + + rc->block_rsv->refill_used = 1; + btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv); + + memset(&rc->cluster, 0, sizeof(rc->cluster)); + rc->search_start = rc->block_group->key.objectid; + rc->extents_found = 0; + rc->nodes_relocated = 0; + rc->merging_rsv_size = 0; + rc->block_rsv_retries = 0; + + rc->create_reloc_tree = 1; + set_reloc_control(rc); + + trans = btrfs_join_transaction(rc->extent_root, 1); + btrfs_commit_transaction(trans, rc->extent_root); + return 0; +} static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { struct rb_root blocks = RB_ROOT; struct btrfs_key key; - struct file_extent_cluster *cluster; struct btrfs_trans_handle *trans = NULL; struct btrfs_path *path; struct btrfs_extent_item *ei; @@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) int ret; int err = 0; - cluster = kzalloc(sizeof(*cluster), GFP_NOFS); - if (!cluster) - return -ENOMEM; - path = btrfs_alloc_path(); - if (!path) { - kfree(cluster); + if (!path) return -ENOMEM; - } - - rc->extents_found = 0; - rc->extents_skipped = 0; - - rc->search_start = rc->block_group->key.objectid; - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, - GFP_NOFS); - - rc->create_reloc_root = 1; - set_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); + ret = prepare_to_relocate(rc); + if (ret) { + err = ret; + goto out_free; + } while (1) { - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_start_transaction(rc->extent_root, 0); + + if (update_backref_cache(trans, &rc->backref_cache)) { + btrfs_end_transaction(trans, rc->extent_root); + continue; + } - ret = find_next_extent(trans, rc, path); + ret = find_next_extent(trans, rc, path, &key); if (ret < 0) err = ret; if (ret != 0) @@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - item_size = btrfs_item_size_nr(path->nodes[0], - path->slots[0]); + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (item_size >= sizeof(*ei)) { flags = btrfs_extent_flags(path->nodes[0], ei); ret = check_extent_flags(flags); @@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && - (flags & BTRFS_EXTENT_FLAG_DATA)) { + (flags & BTRFS_EXTENT_FLAG_DATA)) { ret = add_data_references(rc, &key, path, &blocks); } else { btrfs_release_path(rc->extent_root, path); ret = 0; } if (ret < 0) { - err = 0; + err = ret; break; } if (!RB_EMPTY_ROOT(&blocks)) { ret = relocate_tree_blocks(trans, rc, &blocks); if (ret < 0) { + if (ret != -EAGAIN) { + err = ret; + break; + } + rc->extents_found--; + rc->search_start = key.objectid; + } + } + + ret = btrfs_block_rsv_check(trans, rc->extent_root, + rc->block_rsv, 0, 5); + if (ret < 0) { + if (ret != -EAGAIN) { err = ret; + WARN_ON(1); break; } + rc->commit_transaction = 1; } - nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + if (rc->commit_transaction) { + rc->commit_transaction = 0; + ret = btrfs_commit_transaction(trans, rc->extent_root); + BUG_ON(ret); + } else { + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, rc->extent_root); + btrfs_btree_balance_dirty(rc->extent_root, nr); + } trans = NULL; - btrfs_btree_balance_dirty(rc->extent_root, nr); if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { rc->found_file_extent = 1; ret = relocate_data_extent(rc->data_inode, - &key, cluster); + &key, &rc->cluster); if (ret < 0) { err = ret; break; } } } - btrfs_free_path(path); + + btrfs_release_path(rc->extent_root, path); + clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, + GFP_NOFS); if (trans) { nr = trans->blocks_used; - btrfs_end_transaction(trans, rc->extent_root); + btrfs_end_transaction_throttle(trans, rc->extent_root); btrfs_btree_balance_dirty(rc->extent_root, nr); } if (!err) { - ret = relocate_file_extent_cluster(rc->data_inode, cluster); + ret = relocate_file_extent_cluster(rc->data_inode, + &rc->cluster); if (ret < 0) err = ret; } - kfree(cluster); + rc->create_reloc_tree = 0; + set_reloc_control(rc); - rc->create_reloc_root = 0; - smp_mb(); + backref_cache_cleanup(&rc->backref_cache); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); - if (rc->extents_found > 0) { - trans = btrfs_start_transaction(rc->extent_root, 1); - btrfs_commit_transaction(trans, rc->extent_root); - } + err = prepare_to_merge(rc, err); merge_reloc_roots(rc); + rc->merge_reloc_tree = 0; unset_reloc_control(rc); + btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); /* get rid of pinned extents */ - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); - +out_free: + btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); + btrfs_free_path(path); return err; } @@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(root, path); out: @@ -3460,8 +3790,9 @@ out: * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ -static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group_cache *group) +static noinline_for_stack +struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, if (IS_ERR(root)) return ERR_CAST(root); - trans = btrfs_start_transaction(root, 1); - BUG_ON(!trans); + trans = btrfs_start_transaction(root, 6); + if (IS_ERR(trans)) + return ERR_CAST(trans); err = btrfs_find_free_objectid(trans, root, objectid, &objectid); if (err) @@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, out: nr = trans->blocks_used; btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); if (err) { if (inode) @@ -3506,6 +3837,21 @@ out: return inode; } +static struct reloc_control *alloc_reloc_control(void) +{ + struct reloc_control *rc; + + rc = kzalloc(sizeof(*rc), GFP_NOFS); + if (!rc) + return NULL; + + INIT_LIST_HEAD(&rc->reloc_roots); + backref_cache_init(&rc->backref_cache); + mapping_tree_init(&rc->reloc_root_tree); + extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); + return rc; +} + /* * function to relocate all extents in a block group. */ @@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) struct btrfs_fs_info *fs_info = extent_root->fs_info; struct reloc_control *rc; int ret; + int rw = 0; int err = 0; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(); if (!rc) return -ENOMEM; - mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); - INIT_LIST_HEAD(&rc->reloc_roots); + rc->extent_root = extent_root; rc->block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!rc->block_group); - btrfs_init_workers(&rc->workers, "relocate", - fs_info->thread_pool_size, NULL); - - rc->extent_root = extent_root; - btrfs_prepare_block_group_relocation(extent_root, rc->block_group); + if (!rc->block_group->ro) { + ret = btrfs_set_block_group_ro(extent_root, rc->block_group); + if (ret) { + err = ret; + goto out; + } + rw = 1; + } rc->data_inode = create_reloc_inode(fs_info, rc->block_group); if (IS_ERR(rc->data_inode)) { @@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); while (1) { - rc->extents_found = 0; - rc->extents_skipped = 0; - mutex_lock(&fs_info->cleaner_mutex); btrfs_clean_old_snapshots(fs_info->tree_root); @@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { err = ret; - break; + goto out; } if (rc->extents_found == 0) @@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); rc->stage = UPDATE_DATA_PTRS; - } else if (rc->stage == UPDATE_DATA_PTRS && - rc->extents_skipped >= rc->extents_found) { - iput(rc->data_inode); - rc->data_inode = create_reloc_inode(fs_info, - rc->block_group); - if (IS_ERR(rc->data_inode)) { - err = PTR_ERR(rc->data_inode); - rc->data_inode = NULL; - break; - } - rc->stage = MOVE_DATA_EXTENTS; - rc->found_file_extent = 0; } } @@ -3597,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(rc->block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: + if (err && rw) + btrfs_set_block_group_rw(extent_root, rc->block_group); iput(rc->data_inode); - btrfs_stop_workers(&rc->workers); btrfs_put_block_group(rc->block_group); kfree(rc); return err; @@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - trans = btrfs_start_transaction(root->fs_info->tree_root, 1); + trans = btrfs_start_transaction(root->fs_info->tree_root, 0); memset(&root->root_item.drop_progress, 0, sizeof(root->root_item.drop_progress)); @@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = kzalloc(sizeof(*rc), GFP_NOFS); + rc = alloc_reloc_control(); if (!rc) { err = -ENOMEM; goto out; } - mapping_tree_init(&rc->reloc_root_tree); - INIT_LIST_HEAD(&rc->reloc_roots); - btrfs_init_workers(&rc->workers, "relocate", - root->fs_info->thread_pool_size, NULL); rc->extent_root = root->fs_info->extent_root; set_reloc_control(rc); + trans = btrfs_join_transaction(rc->extent_root, 1); + + rc->merge_reloc_tree = 1; + while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root->reloc_root = reloc_root; } - trans = btrfs_start_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); merge_reloc_roots(rc); unset_reloc_control(rc); - trans = btrfs_start_transaction(rc->extent_root, 1); + trans = btrfs_join_transaction(rc->extent_root, 1); btrfs_commit_transaction(trans, rc->extent_root); out: - if (rc) { - btrfs_stop_workers(&rc->workers); - kfree(rc); - } + kfree(rc); while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, struct btrfs_root, root_list); @@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) btrfs_put_ordered_extent(ordered); return 0; } + +void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow) +{ + struct reloc_control *rc; + struct backref_node *node; + int first_cow = 0; + int level; + int ret; + + rc = root->fs_info->reloc_ctl; + if (!rc) + return; + + BUG_ON(rc->stage == UPDATE_DATA_PTRS && + root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + + level = btrfs_header_level(buf); + if (btrfs_header_generation(buf) <= + btrfs_root_last_snapshot(&root->root_item)) + first_cow = 1; + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID && + rc->create_reloc_tree) { + WARN_ON(!first_cow && level == 0); + + node = rc->backref_cache.path[level]; + BUG_ON(node->bytenr != buf->start && + node->new_bytenr != buf->start); + + drop_node_buffer(node); + extent_buffer_get(cow); + node->eb = cow; + node->new_bytenr = cow->start; + + if (!node->pending) { + list_move_tail(&node->list, + &rc->backref_cache.pending[level]); + node->pending = 1; + } + + if (first_cow) + __mark_block_processed(rc, node); + + if (first_cow && level > 0) + rc->nodes_relocated += buf->len; + } + + if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) { + ret = replace_file_extents(trans, rc, root, cow); + BUG_ON(ret); + } +} + +/* + * called before creating snapshot. it calculates metadata reservation + * requried for relocating tree blocks in the snapshot + */ +void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve) +{ + struct btrfs_root *root; + struct reloc_control *rc; + + root = pending->root; + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + if (!rc->merge_reloc_tree) + return; + + root = root->reloc_root; + BUG_ON(btrfs_root_refs(&root->root_item) == 0); + /* + * relocation is in the stage of merging trees. the space + * used by merging a reloc tree is twice the size of + * relocated tree nodes in the worst case. half for cowing + * the reloc tree, half for cowing the fs tree. the space + * used by cowing the reloc tree will be freed after the + * tree is dropped. if we create snapshot, cowing the fs + * tree may use more space than it frees. so we need + * reserve extra space. + */ + *bytes_to_reserve += rc->nodes_relocated; +} + +/* + * called after snapshot is created. migrate block reservation + * and create reloc root for the newly created snapshot + */ +void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_root *root = pending->root; + struct btrfs_root *reloc_root; + struct btrfs_root *new_root; + struct reloc_control *rc; + int ret; + + if (!root->reloc_root) + return; + + rc = root->fs_info->reloc_ctl; + rc->merging_rsv_size += rc->nodes_relocated; + + if (rc->merge_reloc_tree) { + ret = btrfs_block_rsv_migrate(&pending->block_rsv, + rc->block_rsv, + rc->nodes_relocated); + BUG_ON(ret); + } + + new_root = pending->snap; + reloc_root = create_reloc_root(trans, root->reloc_root, + new_root->root_key.objectid); + + __add_reloc_root(reloc_root); + new_root->reloc_root = reloc_root; + + if (rc->create_reloc_tree) { + ret = clone_backref_node(trans, rc, root, reloc_root); + BUG_ON(ret); + } +} diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 67fa2d2..b91ccd9 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_key key; + struct btrfs_key root_key; + struct btrfs_root *root; int err = 0; int ret; @@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = 0; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + while (1) { ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); if (ret < 0) { @@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) key.type != BTRFS_ORPHAN_ITEM_KEY) break; - ret = btrfs_find_dead_roots(tree_root, key.offset); - if (ret) { + root_key.objectid = key.offset; + key.offset++; + + root = btrfs_read_fs_root_no_name(tree_root->fs_info, + &root_key); + if (!IS_ERR(root)) + continue; + + ret = PTR_ERR(root); + if (ret != -ENOENT) { err = ret; break; } - key.offset++; + ret = btrfs_find_dead_roots(tree_root, root_key.objectid); + if (ret) { + err = ret; + break; + } } btrfs_free_path(path); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2909a03..d34b2df 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root, 0); btrfs_wait_ordered_extents(root, 0, 0); - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); ret = btrfs_commit_transaction(trans, root); return ret; } @@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) return -EINVAL; - /* recover relocation */ - ret = btrfs_recover_relocation(root); + ret = btrfs_cleanup_fs_roots(root->fs_info); WARN_ON(ret); - ret = btrfs_cleanup_fs_roots(root->fs_info); + /* recover relocation */ + ret = btrfs_recover_relocation(root); WARN_ON(ret); sb->s_flags &= ~MS_RDONLY; @@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct list_head *head = &root->fs_info->space_info; struct btrfs_space_info *found; u64 total_used = 0; - u64 data_used = 0; int bits = dentry->d_sb->s_blocksize_bits; __be32 *fsid = (__be32 *)root->fs_info->fsid; rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & (BTRFS_BLOCK_GROUP_DUP| - BTRFS_BLOCK_GROUP_RAID10| - BTRFS_BLOCK_GROUP_RAID1)) { - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } - - total_used += found->bytes_used; - if (found->flags & BTRFS_BLOCK_GROUP_DATA) - data_used += found->bytes_used; - else - data_used += found->total_bytes; - } + list_for_each_entry_rcu(found, head, list) + total_used += found->disk_used; rcu_read_unlock(); buf->f_namelen = BTRFS_NAME_LEN; buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; buf->f_bfree = buf->f_blocks - (total_used >> bits); - buf->f_bavail = buf->f_blocks - (data_used >> bits); + buf->f_bavail = buf->f_bfree; buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2cb1160..66e4c66 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -165,54 +165,89 @@ enum btrfs_trans_type { TRANS_USERSPACE, }; +static int may_wait_transaction(struct btrfs_root *root, int type) +{ + if (!root->fs_info->log_root_recovering && + ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || + type == TRANS_USERSPACE)) + return 1; + return 0; +} + static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, - int num_blocks, int type) + u64 num_items, int type) { - struct btrfs_trans_handle *h = - kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + struct btrfs_trans_handle *h; + struct btrfs_transaction *cur_trans; + int retries = 0; int ret; +again: + h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + if (!h) + return ERR_PTR(-ENOMEM); mutex_lock(&root->fs_info->trans_mutex); - if (!root->fs_info->log_root_recovering && - ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || - type == TRANS_USERSPACE)) + if (may_wait_transaction(root, type)) wait_current_trans(root); + ret = join_transaction(root); BUG_ON(ret); - h->transid = root->fs_info->running_transaction->transid; - h->transaction = root->fs_info->running_transaction; - h->blocks_reserved = num_blocks; + cur_trans = root->fs_info->running_transaction; + cur_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + h->transid = cur_trans->transid; + h->transaction = cur_trans; h->blocks_used = 0; h->block_group = 0; - h->alloc_exclude_nr = 0; - h->alloc_exclude_start = 0; + h->bytes_reserved = 0; h->delayed_ref_updates = 0; + h->block_rsv = NULL; - if (!current->journal_info && type != TRANS_USERSPACE) - current->journal_info = h; + smp_mb(); + if (cur_trans->blocked && may_wait_transaction(root, type)) { + btrfs_commit_transaction(h, root); + goto again; + } + + if (num_items > 0) { + ret = btrfs_trans_reserve_metadata(h, root, num_items, + &retries); + if (ret == -EAGAIN) { + btrfs_commit_transaction(h, root); + goto again; + } + if (ret < 0) { + btrfs_end_transaction(h, root); + return ERR_PTR(ret); + } + } - root->fs_info->running_transaction->use_count++; + mutex_lock(&root->fs_info->trans_mutex); record_root_in_trans(h, root); mutex_unlock(&root->fs_info->trans_mutex); + + if (!current->journal_info && type != TRANS_USERSPACE) + current->journal_info = h; return h; } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks) + int num_items) { - return start_transaction(root, num_blocks, TRANS_START); + return start_transaction(root, num_items, TRANS_START); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, int num_blocks) { - return start_transaction(root, num_blocks, TRANS_JOIN); + return start_transaction(root, 0, TRANS_JOIN); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, int num_blocks) { - return start_transaction(r, num_blocks, TRANS_USERSPACE); + return start_transaction(r, 0, TRANS_USERSPACE); } /* wait for a transaction commit to be fully complete */ @@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root) mutex_unlock(&root->fs_info->trans_mutex); } +static int should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + ret = btrfs_block_rsv_check(trans, root, + &root->fs_info->global_block_rsv, 0, 5); + return ret ? 1 : 0; +} + +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans = trans->transaction; + int updates; + + if (cur_trans->blocked || cur_trans->delayed_refs.flushing) + return 1; + + updates = trans->delayed_ref_updates; + trans->delayed_ref_updates = 0; + if (updates) + btrfs_run_delayed_refs(trans, root, updates); + + return should_end_transaction(trans, root); +} + static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int throttle) { - struct btrfs_transaction *cur_trans; + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; @@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, count++; } + btrfs_trans_release_metadata(trans, root); + + if (!root->fs_info->open_ioctl_trans && + should_end_transaction(trans, root)) + trans->transaction->blocked = 1; + + if (cur_trans->blocked && !cur_trans->in_commit) { + if (throttle) + return btrfs_commit_transaction(trans, root); + else + wake_up_process(info->transaction_kthread); + } + mutex_lock(&info->trans_mutex); - cur_trans = info->running_transaction; - WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans != info->running_transaction); WARN_ON(cur_trans->num_writers < 1); cur_trans->num_writers--; @@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_free_log(trans, root); btrfs_update_reloc_root(trans, root); + btrfs_orphan_commit_root(trans, root); if (root->commit_root != root->node) { switch_commit_root(root); @@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) { struct btrfs_fs_info *info = root->fs_info; - int ret; struct btrfs_trans_handle *trans; + int ret; unsigned long nr; - smp_mb(); - if (root->defrag_running) + if (xchg(&root->defrag_running, 1)) return 0; - trans = btrfs_start_transaction(root, 1); + while (1) { - root->defrag_running = 1; + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(info->tree_root, nr); cond_resched(); - trans = btrfs_start_transaction(root, 1); if (root->fs_info->closing || ret != -EAGAIN) break; } root->defrag_running = 0; - smp_mb(); - btrfs_end_transaction(trans, root); - return 0; + return ret; } #if 0 @@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct inode *parent_inode; + struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; int ret; - u64 objectid; - int namelen; + int retries = 0; + u64 to_reserve = 0; u64 index = 0; - - parent_inode = pending->dentry->d_parent->d_inode; - parent_root = BTRFS_I(parent_inode)->root; + u64 objectid; new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); if (!new_root_item) { - ret = -ENOMEM; + pending->error = -ENOMEM; goto fail; } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); - if (ret) + if (ret) { + pending->error = ret; goto fail; + } + + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); + btrfs_orphan_pre_snapshot(trans, pending, &to_reserve); + + if (to_reserve > 0) { + ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, + to_reserve, &retries); + if (ret) { + pending->error = ret; + goto fail; + } + } key.objectid = objectid; - /* record when the snapshot was created in key.offset */ - key.offset = trans->transid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + key.type = BTRFS_ROOT_ITEM_KEY; - memcpy(&pending->root_key, &key, sizeof(key)); - pending->root_key.offset = (u64)-1; + trans->block_rsv = &pending->block_rsv; + dentry = pending->dentry; + parent_inode = dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; record_root_in_trans(trans, parent_root); + /* * insert the directory item */ - namelen = strlen(pending->name); ret = btrfs_set_inode_index(parent_inode, &index); BUG_ON(ret); ret = btrfs_insert_dir_item(trans, parent_root, - pending->name, namelen, - parent_inode->i_ino, - &pending->root_key, BTRFS_FT_DIR, index); + dentry->d_name.name, dentry->d_name.len, + parent_inode->i_ino, &key, + BTRFS_FT_DIR, index); BUG_ON(ret); - btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + btrfs_i_size_write(parent_inode, parent_inode->i_size + + dentry->d_name.len * 2); ret = btrfs_update_inode(trans, parent_root, parent_inode); BUG_ON(ret); @@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, free_extent_buffer(old); btrfs_set_root_node(new_root_item, tmp); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, - new_root_item); - BUG_ON(ret); + /* record when the snapshot was created in key.offset */ + key.offset = trans->transid; + ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); + BUG_ON(ret); - ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, - pending->root_key.objectid, + /* + * insert root back/forward references + */ + ret = btrfs_add_root_ref(trans, tree_root, objectid, parent_root->root_key.objectid, - parent_inode->i_ino, index, pending->name, - namelen); + parent_inode->i_ino, index, + dentry->d_name.name, dentry->d_name.len); BUG_ON(ret); + key.offset = (u64)-1; + pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(IS_ERR(pending->snap)); + + btrfs_reloc_post_snapshot(trans, pending); + btrfs_orphan_post_snapshot(trans, pending); fail: kfree(new_root_item); - return ret; + btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); + return 0; } /* @@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info) return ret; } +int btrfs_transaction_blocked(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->blocked; + spin_unlock(&info->new_trans_lock); + return ret; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = btrfs_run_delayed_refs(trans, root, 0); BUG_ON(ret); + btrfs_trans_release_metadata(trans, root); + cur_trans = trans->transaction; /* * set the flushing flag so procs in this transaction have to @@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, snap_pending = 1; WARN_ON(cur_trans != trans->transaction); - prepare_to_wait(&cur_trans->writer_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (cur_trans->num_writers > 1) timeout = MAX_SCHEDULE_TIMEOUT; else if (should_grow) @@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_run_ordered_operations(root, 1); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + smp_mb(); if (cur_trans->num_writers > 1 || should_grow) schedule_timeout(timeout); @@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root) if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) - btrfs_drop_snapshot(root, 0); + btrfs_drop_snapshot(root, NULL, 0); else - btrfs_drop_snapshot(root, 1); + btrfs_drop_snapshot(root, NULL, 1); } return 0; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 93c7ccb..e104986 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -45,20 +45,23 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; + u64 block_group; + u64 bytes_reserved; unsigned long blocks_reserved; unsigned long blocks_used; - struct btrfs_transaction *transaction; - u64 block_group; - u64 alloc_exclude_start; - u64 alloc_exclude_nr; unsigned long delayed_ref_updates; + struct btrfs_transaction *transaction; + struct btrfs_block_rsv *block_rsv; }; struct btrfs_pending_snapshot { struct dentry *dentry; struct btrfs_root *root; - char *name; - struct btrfs_key root_key; + struct btrfs_root *snap; + /* block reservation for the operation */ + struct btrfs_block_rsv block_rsv; + /* extra metadata reseration for relocation */ + int error; struct list_head list; }; @@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, - int num_blocks); + int num_items); struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, - int num_blocks); + int num_blocks); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, - int num_blocks); + int num_blocks); int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, @@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); @@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages, int mark); +int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b10eacd..f7ac8e0 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, path->nodes[1], 0, cache_only, &last_ret, &root->defrag_progress); - WARN_ON(ret && ret != -EAGAIN); + if (ret) { + WARN_ON(ret == -EAGAIN); + goto out; + } if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; } - - btrfs_release_path(root, path); out: if (path) btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index af57dd2..fb102a9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; + int err = 0; mutex_lock(&root->log_mutex); if (root->log_root) { @@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(&root->fs_info->tree_log_mutex); if (!root->fs_info->log_root_tree) { ret = btrfs_init_log_root_tree(trans, root->fs_info); - BUG_ON(ret); + if (ret) + err = ret; } - if (!root->log_root) { + if (err == 0 && !root->log_root) { ret = btrfs_add_log_tree(trans, root); - BUG_ON(ret); + if (ret) + err = ret; } mutex_unlock(&root->fs_info->tree_log_mutex); root->log_batch++; atomic_inc(&root->log_writers); mutex_unlock(&root->log_mutex); - return 0; + return err; } /* @@ -376,7 +379,7 @@ insert: BUG_ON(ret); } } else if (ret) { - BUG(); + return ret; } dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); @@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, next = btrfs_find_create_tree_block(root, bytenr, blocksize); - wc->process_func(root, next, wc, ptr_gen); - if (*level == 1) { + wc->process_func(root, next, wc, ptr_gen); + path->slots[*level]++; if (wc->free) { btrfs_read_buffer(next, ptr_gen); @@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(*level < 0); WARN_ON(*level >= BTRFS_MAX_LEVEL); - if (path->nodes[*level] == root->node) - parent = path->nodes[*level]; - else - parent = path->nodes[*level + 1]; - - bytenr = path->nodes[*level]->start; - - blocksize = btrfs_level_size(root, *level); - root_owner = btrfs_header_owner(parent); - root_gen = btrfs_header_generation(parent); - - wc->process_func(root, path->nodes[*level], wc, - btrfs_header_generation(path->nodes[*level])); - - if (wc->free) { - next = path->nodes[*level]; - btrfs_tree_lock(next); - clean_tree_block(trans, root, next); - btrfs_set_lock_blocking(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - - WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_reserved_extent(root, bytenr, blocksize); - BUG_ON(ret); - } - free_extent_buffer(path->nodes[*level]); - path->nodes[*level] = NULL; - *level += 1; + path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); return 0; @@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { slot = path->slots[i]; - if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { struct extent_buffer *node; node = path->nodes[i]; path->slots[i]++; @@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); ret = update_log_root(trans, log); - BUG_ON(ret); mutex_lock(&log_root_tree->log_mutex); if (atomic_dec_and_test(&log_root_tree->log_writers)) { @@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, wake_up(&log_root_tree->log_writer_wait); } + if (ret) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + mutex_unlock(&log_root_tree->log_mutex); + ret = -EAGAIN; + goto out; + } + index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); @@ -2129,15 +2112,10 @@ out: return 0; } -/* - * free all the extents used by the tree log. This should be called - * at commit time of the full transaction - */ -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +static void free_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log) { int ret; - struct btrfs_root *log; - struct key; u64 start; u64 end; struct walk_control wc = { @@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) .process_func = process_one_buffer }; - if (!root->log_root || root->fs_info->log_root_recovering) - return 0; - - log = root->log_root; ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); @@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } - if (log->log_transid > 0) { - ret = btrfs_del_root(trans, root->fs_info->log_root_tree, - &log->root_key); - BUG_ON(ret); - } - root->log_root = NULL; free_extent_buffer(log->node); kfree(log); +} + +/* + * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; + } + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + if (fs_info->log_root_tree) { + free_log_tree(trans, fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + } return 0; } @@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di; struct btrfs_path *path; int ret; + int err = 0; int bytes_del = 0; if (BTRFS_I(dir)->logged_trans < trans->transid) @@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; BUG_ON(ret); @@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, index, name, name_len, -1); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + err = PTR_ERR(di); + goto fail; + } + if (di) { ret = btrfs_delete_one_dir_name(trans, log, path, di); bytes_del += name_len; BUG_ON(ret); @@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret < 0) { + err = ret; + goto fail; + } if (ret == 0) { struct btrfs_inode_item *item; u64 i_size; @@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, ret = 0; btrfs_release_path(log, path); } - +fail: btrfs_free_path(path); mutex_unlock(&BTRFS_I(dir)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } btrfs_end_log_trans(root); return 0; @@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); + if (ret == -ENOSPC) { + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 0; + } btrfs_end_log_trans(root); return ret; @@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, else key.type = BTRFS_DIR_LOG_INDEX_KEY; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); - BUG_ON(ret); + if (ret) + return ret; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_dir_log_item); @@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_key max_key; struct btrfs_root *log = root->log_root; struct extent_buffer *src; + int err = 0; int ret; int i; int nritems; @@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); + if (ret) { + err = ret; + goto done; + } } } btrfs_release_path(root, path); @@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, goto done; ret = overwrite_item(trans, log, dst_path, src, i, &min_key); - BUG_ON(ret); + if (ret) { + err = ret; + goto done; + } } path->slots[0] = nritems; @@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], &tmp); - - BUG_ON(ret); - last_offset = tmp.offset; + if (ret) + err = ret; + else + last_offset = tmp.offset; goto done; } } done: - *last_offset_ret = last_offset; btrfs_release_path(root, path); btrfs_release_path(log, dst_path); - /* insert the log range keys to indicate where the log is valid */ - ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, - first_offset, last_offset); - BUG_ON(ret); - return 0; + if (err == 0) { + *last_offset_ret = last_offset; + /* + * insert the log range keys to indicate where the log + * is valid + */ + ret = insert_dir_log_key(trans, log, path, key_type, + inode->i_ino, first_offset, + last_offset); + if (ret) + err = ret; + } + return err; } /* @@ -2501,7 +2529,8 @@ again: ret = log_dir_items(trans, root, inode, path, dst_path, key_type, min_key, &max_key); - BUG_ON(ret); + if (ret) + return ret; if (max_key == (u64)-1) break; min_key = max_key + 1; @@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, log, &key, path, -1, 1); - - if (ret != 1) + BUG_ON(ret == 0); + if (ret < 0) break; if (path->slots[0] == 0) @@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_release_path(log, path); } btrfs_release_path(log, path); - return 0; + return ret; } static noinline int copy_items(struct btrfs_trans_handle *trans, @@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, } ret = btrfs_insert_empty_items(trans, log, dst_path, ins_keys, ins_sizes, nr); - BUG_ON(ret); + if (ret) { + kfree(ins_data); + return ret; + } for (i = 0; i < nr; i++, dst_path->slots[0]++) { dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], @@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * we have to do this after the loop above to avoid changing the * log tree while trying to change the log tree. */ + ret = 0; while (!list_empty(&ordered_sums)) { struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, struct btrfs_ordered_sum, list); - ret = btrfs_csum_file_blocks(trans, log, sums); - BUG_ON(ret); + if (!ret) + ret = btrfs_csum_file_blocks(trans, log, sums); list_del(&sums->list); kfree(sums); } - return 0; + return ret; } /* log a single inode in the tree log. @@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *log = root->log_root; struct extent_buffer *src = NULL; u32 size; + int err = 0; int ret; int nritems; int ins_start_slot = 0; @@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); } - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } path->keep_locks = 1; while (1) { @@ -2768,7 +2805,10 @@ again: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -2784,7 +2824,10 @@ next_slot: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 0; } btrfs_release_path(root, path); @@ -2802,7 +2845,10 @@ next_slot: ret = copy_items(trans, log, dst_path, src, ins_start_slot, ins_nr, inode_only); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } ins_nr = 0; } WARN_ON(ins_nr); @@ -2810,14 +2856,18 @@ next_slot: btrfs_release_path(root, path); btrfs_release_path(log, dst_path); ret = log_directory_changes(trans, root, inode, path, dst_path); - BUG_ON(ret); + if (ret) { + err = ret; + goto out_unlock; + } } BTRFS_I(inode)->logged_trans = trans->transid; +out_unlock: mutex_unlock(&BTRFS_I(inode)->log_mutex); btrfs_free_path(path); btrfs_free_path(dst_path); - return 0; + return err; } /* @@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_no_trans; } - start_log_trans(trans, root); + ret = start_log_trans(trans, root); + if (ret) + goto end_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; /* * for regular files, if its inode is already on disk, we don't @@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ if (S_ISREG(inode->i_mode) && BTRFS_I(inode)->generation <= last_committed && - BTRFS_I(inode)->last_unlink_trans <= last_committed) - goto no_parent; + BTRFS_I(inode)->last_unlink_trans <= last_committed) { + ret = 0; + goto end_trans; + } inode_only = LOG_INODE_EXISTS; while (1) { @@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only); - BUG_ON(ret); + if (ret) + goto end_trans; } if (IS_ROOT(parent)) break; parent = parent->d_parent; } -no_parent: ret = 0; +end_trans: + if (ret < 0) { + BUG_ON(ret != -ENOSPC); + root->fs_info->last_trans_log_full_commit = trans->transid; + ret = 1; + } btrfs_end_log_trans(root); end_no_trans: return ret; @@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) path = btrfs_alloc_path(); BUG_ON(!path); - trans = btrfs_start_transaction(fs_info->tree_root, 1); + trans = btrfs_start_transaction(fs_info->tree_root, 0); wc.trans = trans; wc.pin = 1; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 0776eac..3dfae84 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -25,6 +25,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8db7b14..d6e3af8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, if (!path) return -ENOMEM; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; @@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); lock_chunks(root); device->barriers = 1; @@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* step one, relocate all the extents inside this chunk */ ret = btrfs_relocate_block_group(extent_root, chunk_offset); - BUG_ON(ret); + if (ret) + return ret; - trans = btrfs_start_transaction(root, 1); + trans = btrfs_start_transaction(root, 0); BUG_ON(!trans); lock_chunks(root); @@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root) break; BUG_ON(ret); - trans = btrfs_start_transaction(dev_root, 1); + trans = btrfs_start_transaction(dev_root, 0); BUG_ON(!trans); ret = btrfs_grow_device(trans, device, old_size); @@ -2094,11 +2095,7 @@ again: } /* Shrinking succeeded, else we would be at "done". */ - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto done; - } + trans = btrfs_start_transaction(root, 0); lock_chunks(root); device->disk_total_bytes = new_size; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 59acd3e..88ecbb2 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, if (trans) return do_setxattr(trans, inode, name, value, size, flags); - ret = btrfs_reserve_metadata_space(root, 2); - if (ret) - return ret; + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); - trans = btrfs_start_transaction(root, 1); - if (!trans) { - ret = -ENOMEM; - goto out; - } btrfs_set_trans_block_group(trans, inode); ret = do_setxattr(trans, inode, name, value, size, flags); @@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, BUG_ON(ret); out: btrfs_end_transaction_throttle(trans, root); - btrfs_unreserve_metadata_space(root, 2); return ret; } diff --git a/fs/direct-io.c b/fs/direct-io.c index e82adc2..da111aa 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -82,6 +82,8 @@ struct dio { int reap_counter; /* rate limit reaping */ get_block_t *get_block; /* block mapping function */ dio_iodone_t *end_io; /* IO completion function */ + dio_submit_t *submit_io; /* IO submition function */ + loff_t logical_offset_in_bio; /* current first logical block in bio */ sector_t final_block_in_bio; /* current final block in bio + 1 */ sector_t next_block_for_io; /* next block to be put under IO, in dio_blocks units */ @@ -96,6 +98,7 @@ struct dio { unsigned cur_page_offset; /* Offset into it, in bytes */ unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ sector_t cur_page_block; /* Where it starts */ + loff_t cur_page_fs_offset; /* Offset in file */ /* BIO completion state */ spinlock_t bio_lock; /* protects BIO fields below */ @@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); } +/** + * dio_end_io - handle the end io action for the given bio + * @bio: The direct io bio thats being completed + * @error: Error if there was one + * + * This is meant to be called by any filesystem that uses their own dio_submit_t + * so that the DIO specific endio actions are dealt with after the filesystem + * has done it's completion work. + */ +void dio_end_io(struct bio *bio, int error) +{ + struct dio *dio = bio->bi_private; + + if (dio->is_async) + dio_bio_end_aio(bio, error); + else + dio_bio_end_io(bio, error); +} +EXPORT_SYMBOL_GPL(dio_end_io); + static int dio_bio_alloc(struct dio *dio, struct block_device *bdev, sector_t first_sector, int nr_vecs) @@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev, bio->bi_end_io = dio_bio_end_io; dio->bio = bio; + dio->logical_offset_in_bio = dio->cur_page_fs_offset; return 0; } @@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio) if (dio->is_async && dio->rw == READ) bio_set_pages_dirty(bio); - submit_bio(dio->rw, bio); + if (dio->submit_io) + dio->submit_io(dio->rw, bio, dio->inode, + dio->logical_offset_in_bio); + else + submit_bio(dio->rw, bio); dio->bio = NULL; dio->boundary = 0; + dio->logical_offset_in_bio = 0; } /* @@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio) int ret = 0; if (dio->bio) { + loff_t cur_offset = dio->block_in_file << dio->blkbits; + loff_t bio_next_offset = dio->logical_offset_in_bio + + dio->bio->bi_size; + /* - * See whether this new request is contiguous with the old + * See whether this new request is contiguous with the old. + * + * Btrfs cannot handl having logically non-contiguous requests + * submitted. For exmple if you have + * + * Logical: [0-4095][HOLE][8192-12287] + * Phyiscal: [0-4095] [4096-8181] + * + * We cannot submit those pages together as one BIO. So if our + * current logical offset in the file does not equal what would + * be the next logical offset in the bio, submit the bio we + * have. */ - if (dio->final_block_in_bio != dio->cur_page_block) + if (dio->final_block_in_bio != dio->cur_page_block || + cur_offset != bio_next_offset) dio_bio_submit(dio); /* * Submit now if the underlying fs is about to perform a @@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page, dio->cur_page_offset = offset; dio->cur_page_len = len; dio->cur_page_block = blocknr; + dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits; out: return ret; } @@ -935,7 +981,7 @@ static ssize_t direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, const struct iovec *iov, loff_t offset, unsigned long nr_segs, unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, - struct dio *dio) + dio_submit_t submit_io, struct dio *dio) { unsigned long user_addr; unsigned long flags; @@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->get_block = get_block; dio->end_io = end_io; + dio->submit_io = submit_io; dio->final_block_in_bio = -1; dio->next_block_for_io = -1; @@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, } } /* end iovec loop */ - if (ret == -ENOTBLK && (rw & WRITE)) { + if (ret == -ENOTBLK) { /* * The remaining part of the request will be * be handled by buffered I/O when we return @@ -1110,7 +1157,7 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int flags) + dio_submit_t submit_io, int flags) { int seg; size_t size; @@ -1197,7 +1244,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, (end > i_size_read(inode))); retval = direct_io_worker(rw, iocb, inode, iov, offset, - nr_segs, blkbits, get_block, end_io, dio); + nr_segs, blkbits, get_block, end_io, + submit_io, dio); /* * In case of error extending write may have instantiated a few diff --git a/include/linux/fs.h b/include/linux/fs.h index 9682d52..85e823a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2251,10 +2251,15 @@ static inline int xip_truncate_page(struct address_space *mapping, loff_t from) #endif #ifdef CONFIG_BLOCK +struct bio; +typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode, + loff_t file_offset); +void dio_end_io(struct bio *bio, int error); + ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, - int lock_type); + dio_submit_t submit_io, int lock_type); enum { /* need locking between buffered and direct access */ @@ -2270,7 +2275,7 @@ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, + nr_segs, get_block, end_io, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } @@ -2280,7 +2285,7 @@ static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_block, end_io, 0); + nr_segs, get_block, end_io, NULL, 0); } #endif diff --git a/mm/filemap.c b/mm/filemap.c index 35e12d1..45a2d18 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1275,7 +1275,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg; + unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; @@ -1302,21 +1302,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); } - if (retval > 0) + if (retval > 0) { *ppos = pos + retval; - if (retval) { + count -= retval; + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } + count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; + loff_t offset = 0; + + /* + * If we did a short DIO read we need to skip the section of the + * iov that we've already read data into. + */ + if (count) { + if (count > iov[seg].iov_len) { + count -= iov[seg].iov_len; + continue; + } + offset = count; + count = 0; + } desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; + desc.arg.buf = iov[seg].iov_base + offset; + desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0; |