From 4eaf3f64397c3db3c5785eee508270d62a9fabd9 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Mon, 24 May 2010 14:32:52 -0700 Subject: mem-hotplug: fix potential race while building zonelist for new populated zone Add global mutex zonelists_mutex to fix the possible race: CPU0 CPU1 CPU2 (1) zone->present_pages += online_pages; (2) build_all_zonelists(); (3) alloc_page(); (4) free_page(); (5) build_all_zonelists(); (6) __build_all_zonelists(); (7) zone->pageset = alloc_percpu(); In step (3,4), zone->pageset still points to boot_pageset, so bad things may happen if 2+ nodes are in this state. Even if only 1 node is accessing the boot_pageset, (3) may still consume too much memory to fail the memory allocations in step (7). Besides, atomic operation ensures alloc_percpu() in step (7) will never fail since there is a new fresh memory block added in step(6). [haicheng.li@linux.intel.com: hold zonelists_mutex when build_all_zonelists] Signed-off-by: Haicheng Li Signed-off-by: Wu Fengguang Reviewed-by: Andi Kleen Cc: Christoph Lameter Cc: Mel Gorman Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21c52d2..08b3499 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2571,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write, strncpy((char*)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) + } else if (oldval != user_zonelist_order) { + mutex_lock(&zonelists_mutex); build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } } out: mutex_unlock(&zl_order_mutex); @@ -2924,6 +2927,12 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static void setup_zone_pageset(struct zone *zone); +/* + * Global mutex to protect against size modification of zonelists + * as well as to serialize pageset setup for the new populated zone. + */ +DEFINE_MUTEX(zonelists_mutex); + /* return values int ....just for stop_machine() */ static __init_refok int __build_all_zonelists(void *data) { @@ -2967,6 +2976,10 @@ static __init_refok int __build_all_zonelists(void *data) return 0; } +/* + * Called with zonelists_mutex held always + * unless system_state == SYSTEM_BOOTING. + */ void build_all_zonelists(void *data) { set_zonelist_order(); -- cgit v1.1