aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/extent-tree.c306
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h1
4 files changed, 265 insertions, 45 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ed1e25d..1d3e926 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1533,6 +1533,11 @@ out:
* struct refsort is used to match byte number to slot in the btree block.
* we sort based on the byte number and then use the slot to actually
* find the item.
+ *
+ * struct refsort is smaller than strcut btrfs_item and smaller than
+ * struct btrfs_key_ptr. Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
*/
struct refsort {
u64 bytenr;
@@ -3457,36 +3462,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
{
u64 leaf_owner;
u64 leaf_generation;
+ struct refsort *sorted;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
int i;
int nritems;
int ret;
+ int refi = 0;
+ int slot;
BUG_ON(!btrfs_is_leaf(leaf));
nritems = btrfs_header_nritems(leaf);
leaf_owner = btrfs_header_owner(leaf);
leaf_generation = btrfs_header_generation(leaf);
+ sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+ /* we do this loop twice. The first time we build a list
+ * of the extents we have a reference on, then we sort the list
+ * by bytenr. The second time around we actually do the
+ * extent freeing.
+ */
for (i = 0; i < nritems; i++) {
u64 disk_bytenr;
cond_resched();
btrfs_item_key_to_cpu(leaf, &key, i);
+
+ /* only extents have references, skip everything else */
if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
continue;
+
fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+ /* inline extents live in the btree, they don't have refs */
if (btrfs_file_extent_type(leaf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
- /*
- * FIXME make sure to insert a trans record that
- * repeats the snapshot del on crash
- */
+
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+ /* holes don't have refs */
if (disk_bytenr == 0)
continue;
+ sorted[refi].bytenr = disk_bytenr;
+ sorted[refi].slot = i;
+ refi++;
+ }
+
+ if (refi == 0)
+ goto out;
+
+ sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+ for (i = 0; i < refi; i++) {
+ u64 disk_bytenr;
+
+ disk_bytenr = sorted[i].bytenr;
+ slot = sorted[i].slot;
+
+ cond_resched();
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+ continue;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
ret = __btrfs_free_extent(trans, root, disk_bytenr,
btrfs_file_extent_disk_num_bytes(leaf, fi),
leaf->start, leaf_owner, leaf_generation,
@@ -3497,6 +3539,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
wake_up(&root->fs_info->transaction_throttle);
cond_resched();
}
+out:
+ kfree(sorted);
return 0;
}
@@ -3506,9 +3550,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
{
int i;
int ret;
- struct btrfs_extent_info *info = ref->extents;
+ struct btrfs_extent_info *info;
+ struct refsort *sorted;
+
+ if (ref->nritems == 0)
+ return 0;
+ sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
+ for (i = 0; i < ref->nritems; i++) {
+ sorted[i].bytenr = ref->extents[i].bytenr;
+ sorted[i].slot = i;
+ }
+ sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+ /*
+ * the items in the ref were sorted when the ref was inserted
+ * into the ref cache, so this is already in order
+ */
for (i = 0; i < ref->nritems; i++) {
+ info = ref->extents + sorted[i].slot;
ret = __btrfs_free_extent(trans, root, info->bytenr,
info->num_bytes, ref->bytenr,
ref->owner, ref->generation,
@@ -3566,6 +3626,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
}
/*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order. Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path)
+{
+ u64 bytenr;
+ u64 root_owner;
+ u64 root_gen;
+ struct extent_buffer *eb = path->nodes[1];
+ struct extent_buffer *leaf;
+ struct btrfs_leaf_ref *ref;
+ struct refsort *sorted = NULL;
+ int nritems = btrfs_header_nritems(eb);
+ int ret;
+ int i;
+ int refi = 0;
+ int slot = path->slots[1];
+ u32 blocksize = btrfs_level_size(root, 0);
+ u32 refs;
+
+ if (nritems == 0)
+ goto out;
+
+ root_owner = btrfs_header_owner(eb);
+ root_gen = btrfs_header_generation(eb);
+ sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+ /*
+ * step one, sort all the leaf pointers so we don't scribble
+ * randomly into the extent allocation tree
+ */
+ for (i = slot; i < nritems; i++) {
+ sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+ sorted[refi].slot = i;
+ refi++;
+ }
+
+ /*
+ * nritems won't be zero, but if we're picking up drop_snapshot
+ * after a crash, slot might be > 0, so double check things
+ * just in case.
+ */
+ if (refi == 0)
+ goto out;
+
+ sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+ /*
+ * the first loop frees everything the leaves point to
+ */
+ for (i = 0; i < refi; i++) {
+ u64 ptr_gen;
+
+ bytenr = sorted[i].bytenr;
+
+ /*
+ * check the reference count on this leaf. If it is > 1
+ * we just decrement it below and don't update any
+ * of the refs the leaf points to.
+ */
+ ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ BUG_ON(ret);
+ if (refs != 1)
+ continue;
+
+ ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+ /*
+ * the leaf only had one reference, which means the
+ * only thing pointing to this leaf is the snapshot
+ * we're deleting. It isn't possible for the reference
+ * count to increase again later
+ *
+ * The reference cache is checked for the leaf,
+ * and if found we'll be able to drop any refs held by
+ * the leaf without needing to read it in.
+ */
+ ref = btrfs_lookup_leaf_ref(root, bytenr);
+ if (ref && ref->generation != ptr_gen) {
+ btrfs_free_leaf_ref(root, ref);
+ ref = NULL;
+ }
+ if (ref) {
+ ret = cache_drop_leaf_ref(trans, root, ref);
+ BUG_ON(ret);
+ btrfs_remove_leaf_ref(root, ref);
+ btrfs_free_leaf_ref(root, ref);
+ } else {
+ /*
+ * the leaf wasn't in the reference cache, so
+ * we have to read it.
+ */
+ leaf = read_tree_block(root, bytenr, blocksize,
+ ptr_gen);
+ ret = btrfs_drop_leaf_ref(trans, root, leaf);
+ BUG_ON(ret);
+ free_extent_buffer(leaf);
+ }
+ atomic_inc(&root->fs_info->throttle_gen);
+ wake_up(&root->fs_info->transaction_throttle);
+ cond_resched();
+ }
+
+ /*
+ * run through the loop again to free the refs on the leaves.
+ * This is faster than doing it in the loop above because
+ * the leaves are likely to be clustered together. We end up
+ * working in nice chunks on the extent allocation tree.
+ */
+ for (i = 0; i < refi; i++) {
+ bytenr = sorted[i].bytenr;
+ ret = __btrfs_free_extent(trans, root, bytenr,
+ blocksize, eb->start,
+ root_owner, root_gen, 0, 1);
+ BUG_ON(ret);
+
+ atomic_inc(&root->fs_info->throttle_gen);
+ wake_up(&root->fs_info->transaction_throttle);
+ cond_resched();
+ }
+out:
+ kfree(sorted);
+
+ /*
+ * update the path to show we've processed the entire level 1
+ * node. This will get saved into the root's drop_snapshot_progress
+ * field so these drops are not repeated again if this transaction
+ * commits.
+ */
+ path->slots[1] = nritems;
+ return 0;
+}
+
+/*
* helper function for drop_snapshot, this walks down the tree dropping ref
* counts as it goes.
*/
@@ -3580,7 +3786,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct extent_buffer *next;
struct extent_buffer *cur;
struct extent_buffer *parent;
- struct btrfs_leaf_ref *ref;
u32 blocksize;
int ret;
u32 refs;
@@ -3607,17 +3812,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
if (path->slots[*level] >=
btrfs_header_nritems(cur))
break;
+
+ /* the new code goes down to level 1 and does all the
+ * leaves pointed to that node in bulk. So, this check
+ * for level 0 will always be false.
+ *
+ * But, the disk format allows the drop_snapshot_progress
+ * field in the root to leave things in a state where
+ * a leaf will need cleaning up here. If someone crashes
+ * with the old code and then boots with the new code,
+ * we might find a leaf here.
+ */
if (*level == 0) {
ret = btrfs_drop_leaf_ref(trans, root, cur);
BUG_ON(ret);
break;
}
+
+ /*
+ * once we get to level one, process the whole node
+ * at once, including everything below it.
+ */
+ if (*level == 1) {
+ ret = drop_level_one_refs(trans, root, path);
+ BUG_ON(ret);
+ break;
+ }
+
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
blocksize = btrfs_level_size(root, *level - 1);
ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
BUG_ON(ret);
+
+ /*
+ * if there is more than one reference, we don't need
+ * to read that node to drop any references it has. We
+ * just drop the ref we hold on that node and move on to the
+ * next slot in this level.
+ */
if (refs != 1) {
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
@@ -3636,46 +3870,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
continue;
}
+
/*
- * at this point, we have a single ref, and since the
- * only place referencing this extent is a dead root
- * the reference count should never go higher.
- * So, we don't need to check it again
+ * we need to keep freeing things in the next level down.
+ * read the block and loop around to process it
*/
- if (*level == 1) {
- ref = btrfs_lookup_leaf_ref(root, bytenr);
- if (ref && ref->generation != ptr_gen) {
- btrfs_free_leaf_ref(root, ref);
- ref = NULL;
- }
- if (ref) {
- ret = cache_drop_leaf_ref(trans, root, ref);
- BUG_ON(ret);
- btrfs_remove_leaf_ref(root, ref);
- btrfs_free_leaf_ref(root, ref);
- *level = 0;
- break;
- }
- }
- next = btrfs_find_tree_block(root, bytenr, blocksize);
- if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
- free_extent_buffer(next);
-
- next = read_tree_block(root, bytenr, blocksize,
- ptr_gen);
- cond_resched();
-#if 0
- /*
- * this is a debugging check and can go away
- * the ref should never go all the way down to 1
- * at this point
- */
- ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
- &refs);
- BUG_ON(ret);
- WARN_ON(refs != 1);
-#endif
- }
+ next = read_tree_block(root, bytenr, blocksize, ptr_gen);
WARN_ON(*level <= 0);
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
@@ -3700,11 +3900,16 @@ out:
root_owner = btrfs_header_owner(parent);
root_gen = btrfs_header_generation(parent);
+ /*
+ * cleanup and free the reference on the last node
+ * we processed
+ */
ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
parent->start, root_owner, root_gen,
*level, 1);
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
+
*level += 1;
BUG_ON(ret);
@@ -3824,6 +4029,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
struct extent_buffer *node;
struct btrfs_disk_key disk_key;
+
+ /*
+ * there is more work to do in this level.
+ * Update the drop_progress marker to reflect
+ * the work we've done so far, and then bump
+ * the slot number
+ */
node = path->nodes[i];
path->slots[i]++;
*level = i;
@@ -3835,6 +4047,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
return 0;
} else {
struct extent_buffer *parent;
+
+ /*
+ * this whole node is done, free our reference
+ * on it and go up one level
+ */
if (path->nodes[*level] == root->node)
parent = path->nodes[*level];
else
@@ -4849,6 +5066,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
ref->bytenr = buf->start;
ref->owner = btrfs_header_owner(buf);
ref->generation = btrfs_header_generation(buf);
+
ret = btrfs_add_leaf_ref(root, ref, 0);
WARN_ON(ret);
btrfs_free_leaf_ref(root, ref);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ebd7d6c..95ea58c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2441,6 +2441,8 @@ next_node:
ref->generation = leaf_gen;
ref->nritems = 0;
+ btrfs_sort_leaf_ref(ref);
+
ret = btrfs_add_leaf_ref(root, ref, 0);
WARN_ON(ret);
btrfs_free_leaf_ref(root, ref);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4..d0cc62b 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
*/
#include <linux/sched.h>
+#include <linux/sort.h>
#include "ctree.h"
#include "ref-cache.h"
#include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183..bc283ad 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
int shared);
int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-
#endif