From 517a92c4e19fcea815332d3155e9fb7723251274 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Feb 2008 09:02:13 +0100 Subject: panic: print more informative messages on stackprotect failure pointed out by pageexec@freemail.hu: we just simply panic() when there's a stackprotector attack - giving the attacked person no information about what kernel code the attack went against. print out the attacked function. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 425567f..f236001 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -327,7 +327,8 @@ EXPORT_SYMBOL(warn_on_slowpath); */ void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted"); + panic("stack-protector: Kernel stack is corrupted in: %p\n", + __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); #endif -- cgit v1.1 From 5cb273013e182a35e7db614d3e20a144cba71e53 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Feb 2008 09:07:01 +0100 Subject: panic: print out stacktrace if DEBUG_BUGVERBOSE if CONFIG_DEBUG_BUGVERBOSE is set then the user most definitely wanted to see as much information about kernel crashes as possible - so give them at least a stack dump. this is particularly useful for stackprotector related panics, where the stacktrace can give us the exact location of the (attempted) attack. Pointed out by pageexec@freemail.hu in the stackprotector breakage threads. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index f236001..17aad57 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...) vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); +#ifdef CONFIG_DEBUG_BUGVERBOSE + dump_stack(); +#endif bust_spinlocks(0); /* -- cgit v1.1 From 54371a43a66f4477889769b4fa00df936855dc8f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Feb 2008 15:33:12 -0800 Subject: x86: add CONFIG_CC_STACKPROTECTOR self-test This patch adds a simple self-test capability to the stackprotector feature. The test deliberately overflows a stack buffer and then checks if the canary trap function gets called. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 17aad57..50cf925 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,14 +324,82 @@ EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR + +static unsigned long __stack_check_testing; +/* + * Self test function for the stack-protector feature. + * This test requires that the local variable absolutely has + * a stack slot, hence the barrier()s. + */ +static noinline void __stack_chk_test_func(void) +{ + unsigned long foo; + barrier(); + /* + * we need to make sure we're not about to clobber the return address, + * while real exploits do this, it's unhealthy on a running system. + * Besides, if we would, the test is already failed anyway so + * time to pull the emergency brake on it. + */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+1)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#ifdef CONFIG_FRAME_POINTER + /* We also don't want to clobber the frame pointer */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+2)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#endif + barrier(); + if (current->stack_canary == *(((unsigned long *)&foo)+1)) + *(((unsigned long *)&foo)+1) = 0; + else + printk(KERN_ERR "No -fstack-protector canary found\n"); + barrier(); +} + +static int __stack_chk_test(void) +{ + printk(KERN_INFO "Testing -fstack-protector-all feature\n"); + __stack_check_testing = (unsigned long)&__stack_chk_test_func; + __stack_chk_test_func(); + if (__stack_check_testing) { + printk(KERN_ERR "-fstack-protector-all test failed\n"); + WARN_ON(1); + } + return 0; +} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { + if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { + long delta; + + delta = (unsigned long)__builtin_return_address(0) - + __stack_check_testing; + /* + * The test needs to happen inside the test function, so + * check if the return address is close to that function. + * The function is only 2 dozen bytes long, but keep a wide + * safety margin to avoid panic()s for normal users regardless + * of the quality of the compiler. + */ + if (delta >= 0 && delta <= 400) { + __stack_check_testing = 0; + return; + } + } panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); + +late_initcall(__stack_chk_test); #endif -- cgit v1.1 From b719ac56c0032bc1602914c6ea70b0f1581b08c7 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Mon, 14 Apr 2008 10:03:50 -0700 Subject: panic.c: fix whitespace additions trivial: remove white space addition in stack protector Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 50cf925..866be9b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -341,14 +341,14 @@ static noinline void __stack_chk_test_func(void) * Besides, if we would, the test is already failed anyway so * time to pull the emergency brake on it. */ - if ((unsigned long)__builtin_return_address(0) == + if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+1)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); return; } #ifdef CONFIG_FRAME_POINTER /* We also don't want to clobber the frame pointer */ - if ((unsigned long)__builtin_return_address(0) == + if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+2)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); return; -- cgit v1.1 From b40a4392a3c262e0d1b5379b4e142a8eefa63439 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 18 Apr 2008 06:16:45 -0700 Subject: stackprotector: turn not having the right gcc into a #warning If the user selects the stack-protector config option, but does not have a gcc that has the right bits enabled (for example because it isn't build with a glibc that supports TLS, as is common for cross-compilers, but also because it may be too old), then the runtime test fails right now. This patch adds a warning message for this scenario. This warning accomplishes two goals 1) the user is informed that the security option he selective isn't available 2) the user is suggested to turn of the CONFIG option that won't work for him, and would make the runtime test fail anyway. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 866be9b..6729e3f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -325,6 +325,9 @@ EXPORT_SYMBOL(warn_on_slowpath); #ifdef CONFIG_CC_STACKPROTECTOR +#ifndef GCC_HAS_SP +#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. +#endif static unsigned long __stack_check_testing; /* * Self test function for the stack-protector feature. -- cgit v1.1 From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed. This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/exit.c | 5 +---- kernel/fork.c | 5 +++++ kernel/sched.c | 7 +------ 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e..fb8de6c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -899,12 +899,9 @@ static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); unsigned long free; - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); + free = stack_not_used(current); if (free >= lowest_to_date) return; diff --git a/kernel/fork.c b/kernel/fork.c index 19908b2..d428336 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; + unsigned long *stackend; + int err; prepare_to_copy(orig); @@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a..a964ed9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } + free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); -- cgit v1.1 From aa92db14270b79f0f91a9060b547a46f9e2639da Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 11 Jul 2008 05:09:55 -0700 Subject: stackprotector: better self-test check stackprotector functionality by manipulating the canary briefly during bootup. far more robust than trying to overflow the stack. (which is architecture dependent, etc.) Signed-off-by: Ingo Molnar --- kernel/panic.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6729e3f..28153ae 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -347,22 +347,18 @@ static noinline void __stack_chk_test_func(void) if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+1)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - return; } #ifdef CONFIG_FRAME_POINTER /* We also don't want to clobber the frame pointer */ if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+2)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - return; } #endif - barrier(); - if (current->stack_canary == *(((unsigned long *)&foo)+1)) - *(((unsigned long *)&foo)+1) = 0; - else + if (current->stack_canary != *(((unsigned long *)&foo)+1)) printk(KERN_ERR "No -fstack-protector canary found\n"); - barrier(); + + current->stack_canary = ~current->stack_canary; } static int __stack_chk_test(void) @@ -373,7 +369,8 @@ static int __stack_chk_test(void) if (__stack_check_testing) { printk(KERN_ERR "-fstack-protector-all test failed\n"); WARN_ON(1); - } + }; + current->stack_canary = ~current->stack_canary; return 0; } /* -- cgit v1.1 From af9ff7868f0f76d3364351b1641b9dfa99588e77 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 12 Jul 2008 09:36:38 -0700 Subject: x86: simplify stackprotector self-check Clean up the code by removing no longer needed code; make sure the pda is updated and kept in sync Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- kernel/panic.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 28153ae..87445a8 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -328,37 +328,21 @@ EXPORT_SYMBOL(warn_on_slowpath); #ifndef GCC_HAS_SP #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. #endif + static unsigned long __stack_check_testing; + /* * Self test function for the stack-protector feature. * This test requires that the local variable absolutely has - * a stack slot, hence the barrier()s. + * a stack slot. */ static noinline void __stack_chk_test_func(void) { - unsigned long foo; - barrier(); - /* - * we need to make sure we're not about to clobber the return address, - * while real exploits do this, it's unhealthy on a running system. - * Besides, if we would, the test is already failed anyway so - * time to pull the emergency brake on it. - */ - if ((unsigned long)__builtin_return_address(0) == - *(((unsigned long *)&foo)+1)) { - printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - } -#ifdef CONFIG_FRAME_POINTER - /* We also don't want to clobber the frame pointer */ - if ((unsigned long)__builtin_return_address(0) == - *(((unsigned long *)&foo)+2)) { - printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - } -#endif - if (current->stack_canary != *(((unsigned long *)&foo)+1)) - printk(KERN_ERR "No -fstack-protector canary found\n"); + unsigned long dummy_buffer[64]; /* force gcc to use the canary */ current->stack_canary = ~current->stack_canary; + refresh_stack_canary(); + dummy_buffer[3] = 1; /* fool gcc into keeping the variable */ } static int __stack_chk_test(void) @@ -371,6 +355,7 @@ static int __stack_chk_test(void) WARN_ON(1); }; current->stack_canary = ~current->stack_canary; + refresh_stack_canary(); return 0; } /* -- cgit v1.1 From 4f962d4d65923d7b722192e729840cfb79af0a5a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Jul 2008 21:42:44 +0200 Subject: stackprotector: remove self-test turns out gcc generates such stackprotector-failure sequences in certain circumstances: movq -8(%rbp), %rax # D.16032, xorq %gs:40, %rax #, jne .L17 #, leave ret .L17: call __stack_chk_fail # .size __stack_chk_test_func, .-__stack_chk_test_func .section .init.text,"ax",@progbits .type panic_setup, @function panic_setup: pushq %rbp # note that there's no jump back to the failing context after the call to __stack_chk_fail - i.e. it has a ((noreturn)) attribute. Which is fair enough in the normal case but kills the self-test. (as we cannot reliably return in the self-test) Signed-off-by: Ingo Molnar --- kernel/panic.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 87445a8..c35c9ec 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -329,62 +329,15 @@ EXPORT_SYMBOL(warn_on_slowpath); #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. #endif -static unsigned long __stack_check_testing; - -/* - * Self test function for the stack-protector feature. - * This test requires that the local variable absolutely has - * a stack slot. - */ -static noinline void __stack_chk_test_func(void) -{ - unsigned long dummy_buffer[64]; /* force gcc to use the canary */ - - current->stack_canary = ~current->stack_canary; - refresh_stack_canary(); - dummy_buffer[3] = 1; /* fool gcc into keeping the variable */ -} - -static int __stack_chk_test(void) -{ - printk(KERN_INFO "Testing -fstack-protector-all feature\n"); - __stack_check_testing = (unsigned long)&__stack_chk_test_func; - __stack_chk_test_func(); - if (__stack_check_testing) { - printk(KERN_ERR "-fstack-protector-all test failed\n"); - WARN_ON(1); - }; - current->stack_canary = ~current->stack_canary; - refresh_stack_canary(); - return 0; -} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { - if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { - long delta; - - delta = (unsigned long)__builtin_return_address(0) - - __stack_check_testing; - /* - * The test needs to happen inside the test function, so - * check if the return address is close to that function. - * The function is only 2 dozen bytes long, but keep a wide - * safety margin to avoid panic()s for normal users regardless - * of the quality of the compiler. - */ - if (delta >= 0 && delta <= 400) { - __stack_check_testing = 0; - return; - } - } panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); -late_initcall(__stack_chk_test); #endif -- cgit v1.1 From 7f7ace0cda64c99599c23785f8979a072e118058 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:08 -0800 Subject: cpumask: update irq_desc to use cpumask_var_t Impact: reduce memory usage, use new cpumask API. Replace the affinity and pending_masks with cpumask_var_t's. This adds to the significant size reduction done with the SPARSE_IRQS changes. The added functions (init_alloc_desc_masks & init_copy_desc_masks) are in the include file so they can be inlined (and optimized out for the !CONFIG_CPUMASKS_OFFSTACK case.) [Naming chosen to be consistent with the other init*irq functions, as well as the backwards arg declaration of "from, to" instead of the more common "to, from" standard.] Includes a slight change to the declaration of struct irq_desc to embed the pending_mask within ifdef(CONFIG_SMP) to be consistent with other references, and some small changes to Xen. Tested: sparse/non-sparse/cpumask_offstack/non-cpumask_offstack/nonuma/nosmp on x86_64 Signed-off-by: Mike Travis Cc: Chris Wright Cc: Jeremy Fitzhardinge Cc: KOSAKI Motohiro Cc: Venkatesh Pallipadi Cc: virtualization@lists.osdl.org Cc: xen-devel@lists.xensource.com Cc: Yinghai Lu --- kernel/irq/chip.c | 5 ++++- kernel/irq/handle.c | 26 ++++++++++++++------------ kernel/irq/manage.c | 12 ++++++------ kernel/irq/migration.c | 12 ++++++------ kernel/irq/numa_migrate.c | 12 +++++++++++- kernel/irq/proc.c | 4 ++-- 6 files changed, 43 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f63c706..c248eba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpumask_setall(&desc->affinity); + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c20db0b..b8fa135 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -64,9 +64,6 @@ static struct irq_desc irq_desc_init = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif }; void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) @@ -88,6 +85,8 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { + int node = cpu_to_node(cpu); + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); @@ -101,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } arch_init_chip_data(desc, cpu); } @@ -119,9 +122,6 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; @@ -141,7 +141,7 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - + init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; } @@ -188,6 +188,10 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } init_one_irq_desc(irq, desc, cpu); irq_desc_ptrs[irq] = desc; @@ -207,9 +211,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; @@ -222,9 +223,10 @@ int __init early_irq_init(void) desc = irq_desc; count = ARRAY_SIZE(irq_desc); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { desc[i].irq = i; - + init_alloc_desc_masks(&desc[i], 0, true); + } return arch_early_irq_init(); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cd0cd8d..b98739a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -98,14 +98,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - cpumask_copy(&desc->affinity, cpumask); + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(&desc->pending_mask, cpumask); + cpumask_copy(desc->pending_mask, cpumask); } #else - cpumask_copy(&desc->affinity, cpumask); + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -127,16 +127,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(&desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, &desc->affinity); + desc->chip->set_affinity(irq, desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index bd72329..e05ad9b 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -18,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpumask_empty(&desc->pending_mask))) + if (unlikely(cpumask_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,13 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. */ - if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) { - cpumask_and(&desc->affinity, - &desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, &desc->affinity); + cpumask_and(desc->affinity, + desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, desc->affinity); } - cpumask_clear(&desc->pending_mask); + cpumask_clear(desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ecf765c..f001a4e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -46,6 +46,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); } @@ -76,11 +77,20 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); + printk(KERN_ERR "irq %d: can not get new irq_desc " + "for migration.\n", irq); /* still use old one */ desc = old_desc; goto out_unlock; } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + /* still use old one */ + kfree(desc); + desc = old_desc; + goto out_unlock; + } init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index aae3f74..692363d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = &desc->affinity; + const struct cpumask *mask = desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) - mask = &desc->pending_mask; + mask = desc->pending_mask; #endif seq_cpumask(m, mask); seq_putc(m, '\n'); -- cgit v1.1 From 802bf931f2688ad125b73db597ce63cc842fb27a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:09 -0800 Subject: cpumask: fix bug in use cpumask_var_t in irq_desc Impact: fix bug where new irq_desc uses old cpumask pointers which are freed. As Yinghai pointed out, init_copy_one_irq_desc() copies the old desc to the new desc overwriting the cpumask pointers. Since the old_desc and the cpumask pointers are freed, then memory corruption will occur if these old pointers are used. Move the allocation of these pointers to after the copy. Signed-off-by: Mike Travis Cc: Yinghai Lu --- kernel/irq/handle.c | 8 +------- kernel/irq/numa_migrate.c | 13 ++++++++----- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b8fa135..f01c0a3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -85,8 +85,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { - int node = cpu_to_node(cpu); - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); @@ -100,7 +98,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, node, false)) { + if (!init_alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } @@ -188,10 +186,6 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } init_one_irq_desc(irq, desc, cpu); irq_desc_ptrs[irq] = desc; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index f001a4e..666260e 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -38,16 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->kstat_irqs = NULL; } -static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, +static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); + if (!init_alloc_desc_masks(desc, cpu, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + return false; + } spin_lock_init(&desc->lock); desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); + return true; } static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) @@ -83,15 +89,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); + if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { /* still use old one */ kfree(desc); desc = old_desc; goto out_unlock; } - init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; -- cgit v1.1 From d38b223c86db3162dc85b5a1997ac8a210e1660b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:11 -0800 Subject: cpumask: reduce stack usage in find_lowest_rq Impact: reduce stack usage, cleanup Use a cpumask_var_t in find_lowest_rq() and clean up other old cpumask_t calls. Signed-off-by: Mike Travis --- kernel/sched_rt.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 954e1a8..da932f4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -960,16 +960,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +static inline int pick_optimal_cpu(int this_cpu, + const struct cpumask *mask) { int first; /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) return this_cpu; - first = first_cpu(*mask); - if (first != NR_CPUS) + first = cpumask_first(mask); + if (first < nr_cpu_ids) return first; return -1; @@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task) struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu == cpu) this_cpu = -1; /* Skip this_cpu opt if the same */ - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - cpumask_t domain_mask; - int best_cpu; + if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - cpumask_and(&domain_mask, sched_domain_span(sd), - lowest_mask); + cpumask_and(domain_mask, + sched_domain_span(sd), + lowest_mask); - best_cpu = pick_optimal_cpu(this_cpu, - &domain_mask); - if (best_cpu != -1) - return best_cpu; + best_cpu = pick_optimal_cpu(this_cpu, + domain_mask); + + if (best_cpu != -1) { + free_cpumask_var(domain_mask); + return best_cpu; + } + } } + free_cpumask_var(domain_mask); } /* -- cgit v1.1 From 9594949b060efe86ecaa1a66839232a3b9800bc9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: change references from NR_IRQS to nr_irqs Impact: preparation, cleanup, add KERN_INFO printk Modify references from NR_IRQS to nr_irqs as the later will become variable-sized based on nr_cpu_ids when CONFIG_SPARSE_IRQS=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f01c0a3..790c5fa 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,6 +132,8 @@ int __init early_irq_init(void) int legacy_count; int i; + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); + desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); @@ -143,7 +145,7 @@ int __init early_irq_init(void) irq_desc_ptrs[i] = desc + i; } - for (i = legacy_count; i < NR_IRQS; i++) + for (i = legacy_count; i < nr_irqs; i++) irq_desc_ptrs[i] = NULL; return arch_early_irq_init(); @@ -151,7 +153,7 @@ int __init early_irq_init(void) struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; + return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) @@ -160,9 +162,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) unsigned long flags; int node; - if (irq >= NR_IRQS) { - printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", - irq, NR_IRQS); + if (irq >= nr_irqs) { + printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n", + irq, nr_irqs); WARN_ON(1); return NULL; } @@ -214,6 +216,8 @@ int __init early_irq_init(void) int count; int i; + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + desc = irq_desc; count = ARRAY_SIZE(irq_desc); -- cgit v1.1 From e2f4d06545ec1f29b0e838ee34cbf3500ea5b9a4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: use WARN() instead of WARN_ON(). Impact: cleanup WARN msg. Ingo requested: > While at it, could you please also convert this to a WARN() construct > instead? (in a separate commit) ... and it shall be done. ;-) Signed-off-by: Mike Travis --- kernel/irq/handle.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 790c5fa..fd1ef16 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -163,9 +163,8 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) int node; if (irq >= nr_irqs) { - printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n", - irq, nr_irqs); - WARN_ON(1); + WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", + irq, nr_irqs); return NULL; } -- cgit v1.1 From 0fa0ebbf15addc1be8f73325d809c8547a9de304 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: allocate irq_desc_ptrs array based on nr_irqs Impact: allocate irq_desc_ptrs in preparation for making it variable-sized. This addresses this memory usage bump when NR_CPUS bumped from 128 to 4096: 34816 +229376 264192 +658% irq_desc_ptrs(.data.read_mostly) The patch is split into two parts, the first simply allocates the irq_desc_ptrs array. Then next will deal with making it variable. This is only when CONFIG_SPARSE_IRQS=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 11 +++++++++-- kernel/irq/internals.h | 7 +++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fd1ef16..d0b8f7e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internals.h" @@ -110,7 +111,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) */ DEFINE_SPINLOCK(sparse_irq_lock); -struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; +struct irq_desc **irq_desc_ptrs __read_mostly; static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -137,6 +138,9 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + /* allocate irq_desc_ptrs array based on nr_irqs */ + irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; @@ -153,7 +157,10 @@ int __init early_irq_init(void) struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL; + if (irq_desc_ptrs && irq < nr_irqs) + return irq_desc_ptrs[irq]; + + return NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e6d0a43..40416a8 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); extern spinlock_t sparse_irq_lock; + +#ifdef CONFIG_SPARSE_IRQ +/* irq_desc_ptrs allocated at boot time */ +extern struct irq_desc **irq_desc_ptrs; +#else +/* irq_desc_ptrs is a fixed size array */ extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +#endif #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); -- cgit v1.1 From 9332fccdedf8e09448f3b69b624211ae879f6c45 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:07 -0800 Subject: irq: initialize nr_irqs based on nr_cpu_ids Impact: Reduce memory usage. This is the second half of the changes to make the irq_desc_ptrs be variable sized based on nr_cpu_ids. This is done by adding a new "max_nr_irqs" macro to irq_vectors.h (and a dummy in irqnr.h) to return a max NR_IRQS value based on NR_CPUS or nr_cpu_ids. This necessitated moving the define of MAX_IO_APICS to a separate file (asm/apicnum.h) so it could be included without the baggage of the other asm/apicdef.h declarations. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d0b8f7e..ebba7a1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,6 +133,9 @@ int __init early_irq_init(void) int legacy_count; int i; + /* initialize nr_irqs based on nr_cpu_ids */ + nr_irqs = max_nr_irqs(nr_cpu_ids); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); desc = irq_desc_legacy; -- cgit v1.1 From 542d865bbed4ce1f050f586e53cf1cfadda93766 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:07 -0800 Subject: kstat: modify kstat_irqs_legacy to be variable sized Impact: reduce memory usage. Allocate kstat_irqs_legacy based on nr_cpu_ids to deal with this memory usage bump when NR_CPUS bumped from 128 to 4096: 8192 +253952 262144 +3100% kstat_irqs_legacy(.bss) This is only when CONFIG_SPARSE_IRQS=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ebba7a1..b39f32a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -124,8 +124,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm } }; -/* FIXME: use bootmem alloc ...*/ -static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; +static unsigned int *kstat_irqs_legacy; int __init early_irq_init(void) { @@ -144,9 +143,14 @@ int __init early_irq_init(void) /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + /* allocate based on nr_cpu_ids */ + /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ + kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; - desc[i].kstat_irqs = kstat_irqs_legacy[i]; + desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; -- cgit v1.1 From 92296c6d6e908c35fca287a21af27be814af9c75 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sun, 11 Jan 2009 09:22:58 -0800 Subject: cpumask, irq: non-x86 build failures Ingo Molnar wrote: > All non-x86 architectures fail to build: > > In file included from /home/mingo/tip/include/linux/random.h:11, > from /home/mingo/tip/include/linux/stackprotector.h:6, > from /home/mingo/tip/init/main.c:17: > /home/mingo/tip/include/linux/irqnr.h:26:63: error: asm/irq_vectors.h: No such file or directory Do not include asm/irq_vectors.h in generic code - it's not available on all architectures. Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b39f32a..04d3e46 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -58,6 +58,11 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ + +#ifndef max_nr_irqs +#define max_nr_irqs(nr_cpus) NR_IRQS +#endif + static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, -- cgit v1.1 From 4a046d1754ee6ebb6f399696805ed61ea0444d4c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 12 Jan 2009 17:39:24 -0800 Subject: x86: arch_probe_nr_irqs Impact: save RAM with large NR_CPUS, get smaller nr_irqs Signed-off-by: Yinghai Lu Signed-off-by: Mike Travis --- kernel/irq/handle.c | 9 ++------- kernel/softirq.c | 5 +++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 04d3e46..375d68c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -59,10 +59,6 @@ EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ -#ifndef max_nr_irqs -#define max_nr_irqs(nr_cpus) NR_IRQS -#endif - static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, @@ -137,9 +133,8 @@ int __init early_irq_init(void) int legacy_count; int i; - /* initialize nr_irqs based on nr_cpu_ids */ - nr_irqs = max_nr_irqs(nr_cpu_ids); - + /* initialize nr_irqs based on nr_cpu_ids */ + arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); desc = irq_desc_legacy; diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de..0365b48 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -795,6 +795,11 @@ int __init __weak early_irq_init(void) return 0; } +int __init __weak arch_probe_nr_irqs(void) +{ + return 0; +} + int __init __weak arch_early_irq_init(void) { return 0; -- cgit v1.1 From 68564a46976017496c2227660930d81240f82355 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: don't try to get_online_cpus() in work_on_cpu. Impact: remove potential circular lock dependency with cpu hotplug lock This has caused more problems than it solved, with a pile of cpu hotplug locking issues. Followup patches will get_online_cpus() in callers that need it, but if they don't do it they're no worse than before when they were using set_cpus_allowed without locking. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/workqueue.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f44583..a35afdb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w) * @fn: the function to run * @arg: the function arg * - * This will return -EINVAL in the cpu is not online, or the return value - * of @fn otherwise. + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { @@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - get_online_cpus(); - if (unlikely(!cpu_online(cpu))) - wfc.ret = -EINVAL; - else { - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - } - put_online_cpus(); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } -- cgit v1.1 From e1d9ec6246a2668a5d037f529877efb7cf176af8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: Use our own workqueue. Impact: remove potential clashes with generic kevent workqueue Annoyingly, some places we want to use work_on_cpu are already in workqueues. As per Ingo's suggestion, we create a different workqueue for work_on_cpu. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/workqueue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a35afdb..1f0c509 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -971,6 +971,8 @@ undo: } #ifdef CONFIG_SMP +static struct workqueue_struct *work_on_cpu_wq __read_mostly; + struct work_for_cpu { struct work_struct work; long (*fn)(void *); @@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - schedule_work_on(cpu, &wfc.work); + queue_work_on(cpu, work_on_cpu_wq, &wfc.work); flush_work(&wfc.work); return wfc.ret; @@ -1019,4 +1021,8 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); +#ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +#endif } -- cgit v1.1 From 483b4ee60edbefdfbff0dd538fb81f368d9e7c0d Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 4 Feb 2009 11:59:44 -0800 Subject: sched: fix nohz load balancer on cpu offline Christian Borntraeger reports: > After a logical cpu offline, even on a complete idle system, there > is one cpu with full ticks. It turns out that nohz.cpu_mask has the > the offlined cpu still set. > > In select_nohz_load_balancer() we check if the system is completely > idle to turn of load balancing. We compare cpu_online_map with > nohz.cpu_mask. Since cpu_online_map is updated on cpu unplug, > but nohz.cpu_mask is not, the check fails and the scheduler believes > that we need an "idle load balancer" even on a fully idle system. > Since the ilb cpu does not deactivate the timer tick this breaks NOHZ. Fix the select_nohz_load_balancer() to not set the nohz.cpu_mask while a cpu is going offline. Reported-by: Christian Borntraeger Signed-off-by: Suresh Siddha Tested-by: Christian Borntraeger Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 242d0d4..e1fc67d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3890,19 +3890,24 @@ int select_nohz_load_balancer(int stop_tick) int cpu = smp_processor_id(); if (stop_tick) { - cpumask_set_cpu(cpu, nohz.cpu_mask); cpu_rq(cpu)->in_nohz_recently = 1; - /* - * If we are going offline and still the leader, give up! - */ - if (!cpu_active(cpu) && - atomic_read(&nohz.load_balancer) == cpu) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return 0; + + /* + * If we are going offline and still the leader, + * give up! + */ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) BUG(); + return 0; } + cpumask_set_cpu(cpu, nohz.cpu_mask); + /* time for ilb owner also to sleep */ if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) -- cgit v1.1 From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:15 +0100 Subject: signal: re-add dead task accumulation stats. We're going to split the process wide cpu accounting into two parts: - clocks; which can take all the time they want since they run from user context. - timers; which need constant time tracing but can affort the overhead because they're default off -- and rare. The clock readout will go back to a full sum of the thread group, for this we need to re-add the exit stats that were removed in the initial itimer rework (f06febc9: timers: fix itimer/many thread hang). Furthermore, since that full sum can be rather slow for large thread groups and we have the complete dead task stats, revert the do_notify_parent time computation. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/exit.c | 3 +++ kernel/fork.c | 3 ++- kernel/signal.c | 8 ++++---- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f80dec3..efd30cc 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ + sig->utime = cputime_add(sig->utime, task_utime(tsk)); + sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } diff --git a/kernel/fork.c b/kernel/fork.c index 242a706..e8e854a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->cutime = sig->cstime = cputime_zero; + sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); + sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); task_lock(current->group_leader); diff --git a/kernel/signal.c b/kernel/signal.c index b6b3676..2a74fe8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; - struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - thread_group_cputime(tsk, &cputime); - info.si_utime = cputime_to_jiffies(cputime.utime); - info.si_stime = cputime_to_jiffies(cputime.stime); + info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, + tsk->signal->utime)); + info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, + tsk->signal->stime)); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) -- cgit v1.1 From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2009 12:24:16 +0100 Subject: timers: split process wide cpu clocks/timers Change the process wide cpu timers/clocks so that we: 1) don't mess up the kernel with too many threads, 2) don't have a per-cpu allocation for each process, 3) have no impact when not used. In order to accomplish this we're going to split it into two parts: - clocks; which can take all the time they want since they run from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID) - timers; which need constant time sampling but since they're explicity used, the user can pay the overhead. The clock readout will go back to a full sum of the thread group, while the timers will run of a global 'clock' that only runs when needed, so only programs that make use of the facility pay the price. Signed-off-by: Peter Zijlstra Reviewed-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/itimer.c | 4 +- kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched_stats.h | 45 ++++++++++++---------- 3 files changed, 119 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index 6a5fe93..58762f7 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime cputime; cputime_t utime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); @@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value) struct task_cputime times; cputime_t ptime; - thread_group_cputime(tsk, ×); + thread_group_cputimer(tsk, ×); ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index fa07da9..db107c9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, return 0; } +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct sighand_struct *sighand; + struct signal_struct *sig; + struct task_struct *t; + + *times = INIT_CPUTIME; + + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); + if (!sighand) + goto out; + + sig = tsk->signal; + + t = tsk; + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); + times->sum_exec_runtime += t->se.sum_exec_runtime; + + t = next_thread(t); + } while (t != tsk); + + times->utime = cputime_add(times->utime, sig->utime); + times->stime = cputime_add(times->stime, sig->stime); + times->sum_exec_runtime += sig->sum_sched_runtime; +out: + rcu_read_unlock(); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. @@ -476,6 +507,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) } /* + * Enable the process wide cpu timer accounting. + * + * serialized using ->sighand->siglock + */ +static void start_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 1; + barrier(); +} + +/* + * Release the process wide timer accounting -- timer stops ticking when + * nobody cares about it. + * + * serialized using ->sighand->siglock + */ +static void stop_process_timers(struct task_struct *tsk) +{ + tsk->signal->cputimer.running = 0; + barrier(); +} + +/* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held * for reading, and interrupts disabled. @@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); + if (!CPUCLOCK_PERTHREAD(timer->it_clock)) + start_process_timers(p); + listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk, sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it_virt_expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) + list_empty(&timers[CPUCLOCK_SCHED])) { + stop_process_timers(tsk); return; + } /* * Collect the current process totals. */ - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); utime = cputime.utime; ptime = cputime_add(utime, cputime.stime); sum_sched_runtime = cputime.sum_exec_runtime; @@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; - thread_group_cputime(tsk, &group_sample); + thread_group_cputimer(tsk, &group_sample); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1329,6 +1388,33 @@ void run_posix_cpu_timers(struct task_struct *tsk) } /* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + +/* * Set one of the process-wide special case CPU timers. * The tsk->sighand->siglock must be held by the caller. * The *newval argument is relative and we update it to be absolute, *oldval @@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group(clock_idx, tsk, &now); + start_process_timers(tsk); + cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8ab0cef..a8f93dd 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; - spin_lock(×->lock); - times->utime = cputime_add(times->utime, cputime); - spin_unlock(×->lock); + if (!cputimer->running) + return; + + spin_lock(&cputimer->lock); + cputimer->cputime.utime = + cputime_add(cputimer->cputime.utime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct task_cputime *times; - struct signal_struct *sig; + struct thread_group_cputimer *cputimer; /* tsk == current, ensure it is safe to use ->signal */ if (unlikely(tsk->exit_state)) return; - sig = tsk->signal; - times = &sig->cputime.totals; + cputimer = &tsk->signal->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->stime = cputime_add(times->stime, cputime); - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.stime = + cputime_add(cputimer->cputime.stime, cputime); + spin_unlock(&cputimer->lock); } /** @@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct task_cputime *times; + struct thread_group_cputimer *cputimer; struct signal_struct *sig; sig = tsk->signal; @@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (unlikely(!sig)) return; - times = &sig->cputime.totals; + cputimer = &sig->cputimer; + + if (!cputimer->running) + return; - spin_lock(×->lock); - times->sum_exec_runtime += ns; - spin_unlock(×->lock); + spin_lock(&cputimer->lock); + cputimer->cputime.sum_exec_runtime += ns; + spin_unlock(&cputimer->lock); } -- cgit v1.1 From 6cd61c0baa8bce32271226198b46c67a7a05d108 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: elf: add ELF_CORE_COPY_KERNEL_REGS() ELF core dump is used for both user land core dump and kernel crash dump. Depending on architecture, register might need to be accessed differently for userland and kernel. Allow architectures to define ELF_CORE_COPY_KERNEL_REGS() and use different operation for kernel register dump. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 8a6d7b0..795e7b6 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); -- cgit v1.1 From 5d707e9c8ef2a3596ed5c975c6ff05cec890c2b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: stackprotector: update make rules Impact: no default -fno-stack-protector if stackp is enabled, cleanup Stackprotector make rules had the following problems. * cc support test and warning are scattered across makefile and kernel/panic.c. * -fno-stack-protector was always added regardless of configuration. Update such that cc support test and warning are contained in makefile and -fno-stack-protector is added iff stackp is turned off. While at it, prepare for 32bit support. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 33cab3d..32fe4ef 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -359,10 +359,6 @@ EXPORT_SYMBOL(warn_slowpath); #ifdef CONFIG_CC_STACKPROTECTOR -#ifndef GCC_HAS_SP -#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. -#endif - /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value -- cgit v1.1 From acd895795d35d7c6405f20301a846d16998795ec Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 9 Feb 2009 19:20:50 +0000 Subject: profiling: fix broken profiling regression Impact: fix broken /proc/profile on UP machines Commit c309b917cab55799ea489d7b5f1b77025d9f8462 "cpumask: convert kernel/profile.c" broke profiling. prof_cpu_mask was previously initialized to CPU_MASK_ALL, but left uninitialized in that commit. We need to copy cpu_possible_mask (cpu_online_mask is not enough). Signed-off-by: Hugh Dickins Signed-off-by: Ingo Molnar --- kernel/profile.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 784933a..7724e04 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -114,12 +114,15 @@ int __ref profile_init(void) if (!slab_is_available()) { prof_buffer = alloc_bootmem(buffer_bytes); alloc_bootmem_cpumask_var(&prof_cpu_mask); + cpumask_copy(prof_cpu_mask, cpu_possible_mask); return 0; } if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) return -ENOMEM; + cpumask_copy(prof_cpu_mask, cpu_possible_mask); + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); if (prof_buffer) return 0; -- cgit v1.1 From 06eb23b1ba39c61ee5d5faeb42a097635693e370 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 9 Feb 2009 02:02:33 +0100 Subject: ptrace, x86: fix the usage of ptrace_fork() I noticed by pure accident we have ptrace_fork() and friends. This was added by "x86, bts: add fork and exit handling", commit bf53de907dfdaac178c92d774aae7370d7b97d20. I can't test this, ds_request_bts() returns -EOPNOTSUPP, but I strongly believe this needs the fix. I think something like this program int main(void) { int pid = fork(); if (!pid) { ptrace(PTRACE_TRACEME, 0, NULL, NULL); kill(getpid(), SIGSTOP); fork(); } else { struct ptrace_bts_config bts = { .flags = PTRACE_BTS_O_ALLOC, .size = 4 * 4096, }; wait(NULL); ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACEFORK); ptrace(PTRACE_BTS_CONFIG, pid, &bts, sizeof(bts)); ptrace(PTRACE_CONT, pid, NULL, NULL); sleep(1); } return 0; } should crash the kernel. If the task is traced by its natural parent ptrace_reparented() returns 0 but we should clear ->btsxxx anyway. Signed-off-by: Oleg Nesterov Acked-by: Markus Metzger Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 242a706..43c039d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1093,7 +1093,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(ptrace_reparented(current))) + if (unlikely(current->ptrace)) ptrace_fork(p, clone_flags); /* Perform scheduler related setup. Assign this task to a CPU. */ -- cgit v1.1 From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 Feb 2009 16:37:31 +0100 Subject: timers: split process wide cpu clocks/timers, fix To decrease the chance of a missed enable, always enable the timer when we sample it, we'll always disable it when we find that there are no active timers in the jiffy tick. This fixes a flood of warnings reported by Mike Galbraith. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index db107c9..e5d7bfd 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) { struct task_cputime cputime; - thread_group_cputime(tsk, &cputime); + thread_group_cputimer(tsk, &cputime); cleanup_timers(tsk->signal->cpu_timers, cputime.utime, cputime.stime, cputime.sum_exec_runtime); } @@ -507,29 +507,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) } /* - * Enable the process wide cpu timer accounting. - * - * serialized using ->sighand->siglock - */ -static void start_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 1; - barrier(); -} - -/* - * Release the process wide timer accounting -- timer stops ticking when - * nobody cares about it. - * - * serialized using ->sighand->siglock - */ -static void stop_process_timers(struct task_struct *tsk) -{ - tsk->signal->cputimer.running = 0; - barrier(); -} - -/* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held * for reading, and interrupts disabled. @@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) BUG_ON(!irqs_disabled()); spin_lock(&p->sighand->siglock); - if (!CPUCLOCK_PERTHREAD(timer->it_clock)) - start_process_timers(p); - listpos = head; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { list_for_each_entry(next, head, entry) { @@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk, } } +static void stop_process_timers(struct task_struct *tsk) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + unsigned long flags; + + if (!cputimer->running) + return; + + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - start_process_timers(tsk); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { -- cgit v1.1 From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 11:30:27 +0100 Subject: timers: fix TIMER_ABSTIME for process wide cpu timers The POSIX timer interface allows for absolute time expiry values through the TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock every time we start it. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e5d7bfd..2313a4c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -261,6 +261,40 @@ out: rcu_read_unlock(); } +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (cputime_gt(b->utime, a->utime)) + a->utime = b->utime; + + if (cputime_gt(b->stime, a->stime)) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + spin_lock_irqsave(&cputimer->lock, flags); + if (!cputimer->running) { + cputimer->running = 1; + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. + */ + thread_group_cputime(tsk, &sum); + update_gt_cputime(&cputimer->cputime, &sum); + } + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. -- cgit v1.1 From fc631c82e1734d718ff0832558f64c8f5d185f26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2009 14:27:17 +0100 Subject: sched: revert recent sync wakeup changes Intel reported a 10% regression (mysql+sysbench) on a 16-way machine with these patches: 1596e29: sched: symmetric sync vs avg_overlap d942fb6: sched: fix sync wakeups Revert them. Reported-by: "Zhang, Yanmin" Bisected-by: Lin Ming Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ---------- kernel/sched_fair.c | 11 +++++++++-- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e1fc67d..f11c02b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2266,16 +2266,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (!sched_feat(SYNC_WAKEUPS)) sync = 0; - if (!sync) { - if (current->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - sync = 1; - } else { - if (current->se.avg_overlap >= sysctl_sched_migration_cost || - p->se.avg_overlap >= sysctl_sched_migration_cost) - sync = 0; - } - #ifdef CONFIG_SMP if (sched_feat(LB_WAKEUP_UPDATE)) { struct sched_domain *sd; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a7e50ba..0566f2a 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1191,15 +1191,20 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) { + struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; - struct task_group *tg; unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || + p->se.avg_overlap > sysctl_sched_migration_cost)) + sync = 0; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1426,7 +1431,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && sync) { + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } -- cgit v1.1 From fc3501d411d34823fb9be248a95a0c44f945866f Mon Sep 17 00:00:00 2001 From: Sven Wegener Date: Wed, 11 Feb 2009 13:04:23 -0800 Subject: mm: fix dirty_bytes/dirty_background_bytes sysctls on 64bit arches We need to pass an unsigned long as the minimum, because it gets casted to an unsigned long in the sysctl handler. If we pass an int, we'll access four more bytes on 64bit arches, resulting in a random minimum value. [rientjes@google.com: fix type of `old_bytes'] Signed-off-by: Sven Wegener Cc: Peter Zijlstra Cc: Dave Chinner Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 790f9d7..c5ef44f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,6 +101,7 @@ static int two = 2; static int zero; static int one = 1; +static unsigned long one_ul = 1; static int one_hundred = 100; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ @@ -974,7 +975,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_background_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &one_ul, }, { .ctl_name = VM_DIRTY_RATIO, @@ -995,7 +996,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &dirty_bytes_handler, .strategy = &sysctl_intvec, - .extra1 = &one, + .extra1 = &one_ul, }, { .procname = "dirty_writeback_centisecs", -- cgit v1.1 From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 11 Feb 2009 13:04:36 -0800 Subject: cgroups: fix lockdep subclasses overflow I enabled all cgroup subsystems when compiling kernel, and then: # mount -t cgroup -o net_cls xxx /mnt # mkdir /mnt/0 This showed up immediately: BUG: MAX_LOCKDEP_SUBCLASSES too low! turning off the locking correctness validator. It's caused by the cgroup hierarchy lock: for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) mutex_lock_nested(&ss->hierarchy_mutex, i); } Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but MAX_LOCKDEP_SUBCLASSES is 8. This patch uses different lockdep keys for different subsystems. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a54ff4..e14db9c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->root == root) - mutex_lock_nested(&ss->hierarchy_mutex, i); + mutex_lock(&ss->hierarchy_mutex); } } @@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; } -- cgit v1.1 From fb5ae64fdde29236e1a15e0366946df7060f41f2 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Fri, 13 Feb 2009 14:04:21 +0000 Subject: User namespaces: Only put the userns when we unhash the uid uids in namespaces other than init don't get a sysfs entry. For those in the init namespace, while we're waiting to remove the sysfs entry for the uid the uid is still hashed, and alloc_uid() may re-grab that uid without getting a new reference to the user_ns, which we've already put in free_user before scheduling remove_user_sysfs_dir(). Reported-and-tested-by: KOSAKI Motohiro Signed-off-by: Serge E. Hallyn Acked-by: David Howells Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/user.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 477b666..3551ac7 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -72,6 +72,7 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) static void uid_hash_remove(struct user_struct *up) { hlist_del_init(&up->uidhash_node); + put_user_ns(up->user_ns); } static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) @@ -334,7 +335,6 @@ static void free_user(struct user_struct *up, unsigned long flags) atomic_inc(&up->__count); spin_unlock_irqrestore(&uidhash_lock, flags); - put_user_ns(up->user_ns); INIT_WORK(&up->work, remove_user_sysfs_dir); schedule_work(&up->work); } @@ -357,7 +357,6 @@ static void free_user(struct user_struct *up, unsigned long flags) sched_destroy_user(up); key_put(up->uid_keyring); key_put(up->session_keyring); - put_user_ns(up->user_ns); kmem_cache_free(uid_cachep, up); } -- cgit v1.1