From 27d20fddc8af539464fc3ba499d6a830054c3bd6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 11 Nov 2010 14:05:19 -0800 Subject: radix-tree: fix RCU bug Salman Qazi describes the following radix-tree bug: In the following case, we get can get a deadlock: 0. The radix tree contains two items, one has the index 0. 1. The reader (in this case find_get_pages) takes the rcu_read_lock. 2. The reader acquires slot(s) for item(s) including the index 0 item. 3. The non-zero index item is deleted, and as a consequence the other item is moved to the root of the tree. The place where it used to be is queued for deletion after the readers finish. 3b. The zero item is deleted, removing it from the direct slot, it remains in the rcu-delayed indirect node. 4. The reader looks at the index 0 slot, and finds that the page has 0 ref count 5. The reader looks at it again, hoping that the item will either be freed or the ref count will increase. This never happens, as the slot it is looking at will never be updated. Also, this slot can never be reclaimed because the reader is holding rcu_read_lock and is in an infinite loop. The fix is to re-use the same "indirect" pointer case that requires a slot lookup retry into a general "retry the lookup" bit. Signed-off-by: Nick Piggin Reported-by: Salman Qazi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 83 +++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 25 deletions(-) (limited to 'lib') diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 6f412ab4..5086bb9 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -82,6 +82,16 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline void *ptr_to_indirect(void *ptr) +{ + return (void *)((unsigned long)ptr | RADIX_TREE_INDIRECT_PTR); +} + +static inline void *indirect_to_ptr(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); +} + static inline gfp_t root_gfp_mask(struct radix_tree_root *root) { return root->gfp_mask & __GFP_BITS_MASK; @@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) return -ENOMEM; /* Increase the height. */ - node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); + node->slots[0] = indirect_to_ptr(root->rnode); /* Propagate the aggregated tag info into the new root */ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { @@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) newheight = root->height+1; node->height = newheight; node->count = 1; - node = radix_tree_ptr_to_indirect(node); + node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node); root->height = newheight; } while (height > root->height); @@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root, return error; } - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT; @@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - rcu_assign_pointer(root->rnode, - radix_tree_ptr_to_indirect(slot)); + rcu_assign_pointer(root->rnode, ptr_to_indirect(slot)); } /* Go a level down */ @@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, return NULL; return is_slot ? (void *)&root->rnode : node; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, height--; } while (height > 0); - return is_slot ? (void *)slot:node; + return is_slot ? (void *)slot : indirect_to_ptr(node); } /** @@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, height = root->height; BUG_ON(index > radix_tree_maxindex(height)); - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; while (height > 0) { @@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); while (height > 0) { int offset; @@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, if (!radix_tree_is_indirect_ptr(node)) return (index == 0); - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); height = node->height; if (index > radix_tree_maxindex(height)) @@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, } shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = radix_tree_indirect_to_ptr(root->rnode); + slot = indirect_to_ptr(root->rnode); /* * we fill the path from (root->height - 2) to 0, leaving the index at @@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, results[0] = node; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, slot = *(((void ***)results)[ret + i]); if (!slot) continue; - results[ret + nr_found] = rcu_dereference_raw(slot); + results[ret + nr_found] = + indirect_to_ptr(rcu_dereference_raw(slot)); nr_found++; } ret += nr_found; @@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, results[0] = (void **)&root->rnode; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, results[0] = node; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, slot = *(((void ***)results)[ret + i]); if (!slot) continue; - results[ret + nr_found] = rcu_dereference_raw(slot); + results[ret + nr_found] = + indirect_to_ptr(rcu_dereference_raw(slot)); nr_found++; } ret += nr_found; @@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, results[0] = (void **)&root->rnode; return 1; } - node = radix_tree_indirect_to_ptr(node); + node = indirect_to_ptr(node); max_index = radix_tree_maxindex(node->height); @@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) void *newptr; BUG_ON(!radix_tree_is_indirect_ptr(to_free)); - to_free = radix_tree_indirect_to_ptr(to_free); + to_free = indirect_to_ptr(to_free); /* * The candidate node has more than one child, or its child @@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /* * We don't need rcu_assign_pointer(), since we are simply - * moving the node from one part of the tree to another. If - * it was safe to dereference the old pointer to it + * moving the node from one part of the tree to another: if it + * was safe to dereference the old pointer to it * (to_free->slots[0]), it will be safe to dereference the new - * one (root->rnode). + * one (root->rnode) as far as dependent read barriers go. */ newptr = to_free->slots[0]; if (root->height > 1) - newptr = radix_tree_ptr_to_indirect(newptr); + newptr = ptr_to_indirect(newptr); root->rnode = newptr; root->height--; + + /* + * We have a dilemma here. The node's slot[0] must not be + * NULLed in case there are concurrent lookups expecting to + * find the item. However if this was a bottom-level node, + * then it may be subject to the slot pointer being visible + * to callers dereferencing it. If item corresponding to + * slot[0] is subsequently deleted, these callers would expect + * their slot to become empty sooner or later. + * + * For example, lockless pagecache will look up a slot, deref + * the page pointer, and if the page is 0 refcount it means it + * was concurrently deleted from pagecache so try the deref + * again. Fortunately there is already a requirement for logic + * to retry the entire slot lookup -- the indirect pointer + * problem (replacing direct root node with an indirect pointer + * also results in a stale slot). So tag the slot as indirect + * to force callers to retry. + */ + if (root->height == 0) + *((unsigned long *)&to_free->slots[0]) |= + RADIX_TREE_INDIRECT_PTR; + radix_tree_node_free(to_free); } } @@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) root->rnode = NULL; goto out; } - slot = radix_tree_indirect_to_ptr(slot); + slot = indirect_to_ptr(slot); shift = (height - 1) * RADIX_TREE_MAP_SHIFT; pathp->node = NULL; @@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) radix_tree_node_free(to_free); if (pathp->node->count) { - if (pathp->node == - radix_tree_indirect_to_ptr(root->rnode)) + if (pathp->node == indirect_to_ptr(root->rnode)) radix_tree_shrink(root); goto out; } -- cgit v1.1 From bcb38ceb225f5d5b2198a2755277cd441ed1e82b Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 30 Nov 2010 09:15:46 +1000 Subject: Revert "debug_locks: set oops_in_progress if we will log messages." This reverts commit e0fdace10e75dac67d906213b780ff1b1a4cc360. On-list discussion seems to suggest that the robustness fixes for printk make this unnecessary and DaveM has also agreed in person at Kernel Summit and on list. The main problem with this code is once we hit a lockdep splat we always keep oops_in_progress set, the console layer uses oops_in_progress with KMS to decide when it should be showing the oops and not showing X, so it causes problems around suspend/resume time when a userspace resume can cause a console switch away from X, only if oops_in_progress is set (which is what we want if an oops actually is in progress, but not because we had a lockdep splat 2 days prior). Cc: David S Miller Cc: Ingo Molnar Signed-off-by: Dave Airlie Signed-off-by: Linus Torvalds --- lib/debug_locks.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'lib') diff --git a/lib/debug_locks.c b/lib/debug_locks.c index 5bf0020..b1c1773 100644 --- a/lib/debug_locks.c +++ b/lib/debug_locks.c @@ -8,7 +8,6 @@ * * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ -#include #include #include #include @@ -39,7 +38,6 @@ int debug_locks_off(void) { if (__debug_locks_off()) { if (!debug_locks_silent) { - oops_in_progress = 1; console_verbose(); return 1; } -- cgit v1.1