From 490da40d82b31c0562d3f5edb37810f492ca1c34 Mon Sep 17 00:00:00 2001
From: Tao Ma
Date: Wed, 19 Jan 2011 10:51:44 +0800
Subject: blktrace: Don't output messages if NOTIFY isn't set.

When blktrace is enabled, cfq outputs a large number of messages to the
trace buffer. That is fine if we don't specify any action mask. But if I
run:

blktrace /dev/sdb -a issue -a complete -o - | blkparse -i -

I only want to see 'D' and 'C', while with the following command

dd if=/mnt/ocfs2/test of=/dev/null bs=4k count=1 iflag=direct

I will get (with a 2.6.37 vanilla kernel):

8,16 0 0 0.000000000 0 m N cfq3805 alloced
8,16 0 0 0.000004126 0 m N cfq3805 insert_request
8,16 0 0 0.000004884 0 m N cfq3805 add_to_rr
8,16 0 0 0.000008417 0 m N cfq workload slice:300
8,16 0 0 0.000009557 0 m N cfq3805 set_active wl_prio:0 wl_type:2
8,16 0 0 0.000010640 0 m N cfq3805 fifo= (null)
8,16 0 0 0.000011193 0 m N cfq3805 dispatch_insert
8,16 0 0 0.000012221 0 m N cfq3805 dispatched a request
8,16 0 0 0.000012802 0 m N cfq3805 activate rq, drv=1
8,16 0 1 0.000013181 3805 D R 114759 + 8 [dd]
8,16 0 2 0.000164244 0 C R 114759 + 8 [0]
8,16 0 0 0.000167997 0 m N cfq3805 complete rqnoidle 0
8,16 0 0 0.000168782 0 m N cfq3805 set_slice=100
8,16 0 0 0.000169874 0 m N cfq3805 arm_idle: 8 group_idle: 0
8,16 0 0 0.000170189 0 m N cfq schedule dispatch
8,16 0 0 0.000397938 0 m N cfq3805 slice expired t=0
8,16 0 0 0.000399763 0 m N cfq3805 sl_used=1 disp=1 charge=1 iops=0 sect=8
8,16 0 0 0.000400227 0 m N cfq3805 del_from_rr
8,16 0 0 0.000400882 0 m N cfq3805 put_queue

That is 19 lines of output where only 2 were asked for, which is not
appropriate for a user. So this patch disables all note messages if
BLK_TC_NOTIFY isn't set in the action mask. Now the output for the same
command looks like:

8,16 0 1 0.000000000 4908 D R 114759 + 8 [dd]
8,16 0 2 0.000146827 0 C R 114759 + 8 [0]

Yes, this is exactly what I want to see.

Cc: Steven Rostedt
Cc: Jeff Moyer
Signed-off-by: Tao Ma
Signed-off-by: Jens Axboe
---
 kernel/trace/blktrace.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 153562d..d95721f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 		     !blk_tracer_enabled))
 		return;
 
+	/*
+	 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
+	 * message to the trace.
+	 */
+	if (!(bt->act_mask & BLK_TC_NOTIFY))
+		return;
+
 	local_irq_save(flags);
 	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
 	va_start(args, fmt);
-- cgit v1.1

From 88d4f0db7fa8785859c1d637f9aac210932b6216 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 25 Jan 2011 19:40:51 +0100
Subject: perf: Fix alloc_callchain_buffers()

Commit 927c7a9e92c4 ("perf: Fix race in callchains") introduced a
mismatch in the sizing of struct callchain_cpus_entries.

nr_cpu_ids must be used instead of num_possible_cpus(), or we might get
out-of-bounds memory accesses on some machines.
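To see why nr_cpu_ids is the right bound: struct callchain_cpus_entries
ends in a flexible array indexed by CPU number, and CPU numbering may be
sparse, so the array must be sized by the highest possible CPU index,
not by the count of possible CPUs. A minimal userspace sketch of the
sizing rule (the struct layout and values here are illustrative, not
the kernel's):

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-in for struct callchain_cpus_entries: a header
 * followed by one pointer slot per possible CPU *index*. */
struct cpus_entries {
        int     refcount;
        void    *cpu_entries[];         /* indexed by CPU number */
};

int main(void)
{
        /* With a sparse possible-CPU map such as {0, 2, 5}, there are
         * only 3 possible CPUs but the highest valid index is 5, so
         * nr_cpu_ids (6 here) slots are needed, not 3. */
        int nr_cpu_ids = 6;
        size_t size = offsetof(struct cpus_entries, cpu_entries)
                    + nr_cpu_ids * sizeof(void *);

        printf("allocation size: %zu bytes\n", size);
        return 0;
}

The kernel spells the same computation as offsetof(struct
callchain_cpus_entries, cpu_entries[nr_cpu_ids]). Sizing with
num_possible_cpus() undercounts whenever the CPU map has holes, and a
store to cpu_entries[5] in the example above would then land past the
end of the allocation.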
Signed-off-by: Eric Dumazet
Cc: Frederic Weisbecker
Cc: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: David Miller
Cc: Stephane Eranian
CC: stable@kernel.org
LKML-Reference: <1295980851.3588.351.camel@edumazet-laptop>
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 126a302..852ae8c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1999,8 +1999,7 @@ static int alloc_callchain_buffers(void)
 	 * accessed from NMI. Use a temporary manual per cpu allocation
 	 * until that gets sorted out.
 	 */
-	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
-		num_possible_cpus();
+	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
 
 	entries = kzalloc(size, GFP_KERNEL);
 	if (!entries)
-- cgit v1.1

From 4135038a582c20ffdadfcf6564852e0b72a20968 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz
Date: Fri, 28 Jan 2011 11:00:31 -0500
Subject: watchdog: Fix broken nowatchdog logic

Passing nowatchdog to the kernel disables two things: the creation of
the watchdog threads AND the initialization of the per-cpu
watchdog_hrtimer. As the hrtimers are initialized only at boot, it is
not possible to enable the watchdog later; for me, all watchdog threads
started to eat 100% of CPU time, but they could just as well have
crashed.

Additionally, even if those threads had started properly,
watchdog_disable_all_cpus() was guarded by a no_watchdog check, so the
watchdog could not be disabled either.

To fix this, remove the no_watchdog variable and use the already
existing watchdog_enabled variable.

Signed-off-by: Marcin Slusarz
[ removed another no_watchdog instance ]
Signed-off-by: Don Zickus
Cc: Stephane Eranian
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Cc:
LKML-Reference: <1296230433-6261-1-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar
---
 kernel/watchdog.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7ebdf4..d9961ea 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
 #include
 #include
 
-int watchdog_enabled;
+int watchdog_enabled = 1;
 int __read_mostly softlockup_thresh = 60;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
 
-static int no_watchdog;
-
-
 /* boot commands */
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str)
 	if (!strncmp(str, "panic", 5))
 		hardlockup_panic = 1;
 	else if (!strncmp(str, "0", 1))
-		no_watchdog = 1;
+		watchdog_enabled = 0;
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
 
 static int __init nowatchdog_setup(char *str)
 {
-	no_watchdog = 1;
+	watchdog_enabled = 0;
 	return 1;
 }
 __setup("nowatchdog", nowatchdog_setup);
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
 /* deprecated */
 static int __init nosoftlockup_setup(char *str)
 {
-	no_watchdog = 1;
+	watchdog_enabled = 0;
 	return 1;
 }
 __setup("nosoftlockup", nosoftlockup_setup);
@@ -476,9 +473,6 @@ static void watchdog_disable_all_cpus(void)
 {
 	int cpu;
 
-	if (no_watchdog)
-		return;
-
 	for_each_online_cpu(cpu)
 		watchdog_disable(cpu);
 
@@ -530,7 +524,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		err = watchdog_enable(hotcpu);
+		if (watchdog_enabled)
+			err = watchdog_enable(hotcpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
@@ -555,9 +550,6 @@ void __init lockup_detector_init(void)
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	if (no_watchdog)
-		return;
-
 	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
 	WARN_ON(notifier_to_errno(err));
-- cgit v1.1

From 397357666de6b5b6adb5fa99f9758ec8cf30ac34 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz
Date: Fri, 28 Jan 2011 11:00:32 -0500
Subject: watchdog: Fix sysctl consistency

If it was not possible to enable the watchdog on any CPU, switch
watchdog_enabled back to 0, because it is visible via the
kernel.watchdog sysctl.

Signed-off-by: Marcin Slusarz
Signed-off-by: Don Zickus
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Cc:
LKML-Reference: <1296230433-6261-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar
---
 kernel/watchdog.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d9961ea..c7e0049 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -429,9 +429,6 @@ static int watchdog_enable(int cpu)
 		wake_up_process(p);
 	}
 
-	/* if any cpu succeeds, watchdog is considered enabled for the system */
-	watchdog_enabled = 1;
-
 	return 0;
 }
@@ -459,12 +456,16 @@ static void watchdog_disable(int cpu)
 static void watchdog_enable_all_cpus(void)
 {
 	int cpu;
-	int result = 0;
+
+	watchdog_enabled = 0;
 
 	for_each_online_cpu(cpu)
-		result += watchdog_enable(cpu);
+		if (!watchdog_enable(cpu))
+			/* if any cpu succeeds, watchdog is considered
+			   enabled for the system */
+			watchdog_enabled = 1;
 
-	if (result)
+	if (!watchdog_enabled)
 		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
 }
-- cgit v1.1

From 9ffdc6c37df131f89d52001e0ef03091b158826f Mon Sep 17 00:00:00 2001
From: Marcin Slusarz
Date: Fri, 28 Jan 2011 11:00:33 -0500
Subject: watchdog: Don't change watchdog state on read of sysctl

Signed-off-by: Marcin Slusarz
[ add {}'s to fix a warning ]
Signed-off-by: Don Zickus
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Cc:
LKML-Reference: <1296230433-6261-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar
---
 kernel/watchdog.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c7e0049..f37f974 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -493,10 +493,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
 {
 	proc_dointvec(table, write, buffer, length, ppos);
 
-	if (watchdog_enabled)
-		watchdog_enable_all_cpus();
-	else
-		watchdog_disable_all_cpus();
+	if (write) {
+		if (watchdog_enabled)
+			watchdog_enable_all_cpus();
+		else
+			watchdog_disable_all_cpus();
+	}
 	return 0;
 }
-- cgit v1.1

From f1a06390d013244e721372b3f9b66e39b6429c71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 28 Jan 2011 08:47:15 +0100
Subject: genirq: Prevent irq storm on migration

move_native_irq() masks and unmasks the interrupt line unconditionally,
but the interrupt line might be masked due to a threaded oneshot
handler in progress. Unmasking the line in that case can lead to
interrupt storms. Observed on PREEMPT_RT.
Originally-from: Ingo Molnar
Signed-off-by: Thomas Gleixner
Cc: stable@kernel.org
---
 kernel/irq/migration.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d25419..441fd62 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
 void move_native_irq(int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
+	bool masked;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -63,8 +64,15 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->irq_data.chip->irq_mask(&desc->irq_data);
+	/*
+	 * Be careful vs. already masked interrupts. If this is a
+	 * threaded interrupt with ONESHOT set, we can end up with an
+	 * interrupt storm.
+	 */
+	masked = desc->status & IRQ_MASKED;
+	if (!masked)
+		desc->irq_data.chip->irq_mask(&desc->irq_data);
 	move_masked_irq(irq);
-	desc->irq_data.chip->irq_unmask(&desc->irq_data);
+	if (!masked)
+		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 }
-
-- cgit v1.1

From e4a9ea5ee7c8812a7bf0c3fb725ceeaa3d4c2fcc Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 27 Jan 2011 09:15:30 -0500
Subject: tracing: Replace trace_event struct array with pointer array

Currently the trace_event structures are placed in the _ftrace_events
section, and at link time, the linker makes one large array of all the
trace_event structures. On boot up, this array is read (much like the
initcall sections) and the events are processed.

The problem is that there is no guarantee that gcc will place complex
structures nicely together in an array format. Two structures in the
same file may be placed awkwardly, because gcc has no clue that they
are supposed to be in an array.

A hack was used previously to force the alignment to 4, to pack the
structures together. But this caused alignment issues with other
architectures (sparc).

Instead of packing the structures into an array, the structures'
addresses are now put into the _ftrace_events section. As pointers are
always the natural alignment, gcc should always pack them tightly
together (otherwise initcall, extable, etc would also fail).

By having the pointers to the structures in the section, we can still
iterate the trace_events without causing unnecessary alignment problems
with other architectures, or depending on the current behaviour of gcc
that will likely change in the future just to tick us kernel developers
off a little more.

The _ftrace_events section is also moved into the .init.data section as
it is now only needed at boot up.

Suggested-by: David Miller
Cc: Mathieu Desnoyers
Acked-by: David S.
Miller Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++------ kernel/trace/trace_export.c | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 35fde09..5f499e0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod) static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; - struct ftrace_event_call *call, *start, *end; + struct ftrace_event_call **call, **start, **end; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod) return; for_each_event(call, start, end) { - __trace_add_event_call(call, mod, + __trace_add_event_call(*call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); } @@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = { .priority = 0, }; -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; +extern struct ftrace_event_call *__start_ftrace_events[]; +extern struct ftrace_event_call *__stop_ftrace_events[]; static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; @@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event); static __init int event_trace_init(void) { - struct ftrace_event_call *call; + struct ftrace_event_call **call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -1430,7 +1430,7 @@ static __init int event_trace_init(void) pr_warning("tracing: Failed to allocate common fields"); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4b74d71..bbeec31 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = { \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ }; \ \ -struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ +struct ftrace_event_call __used event_##call = { \ .name = #call, \ .event.type = etype, \ .class = &event_class_ftrace_##call, \ .print_fmt = print, \ }; \ +struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #include "trace_entries.h" -- cgit v1.1 From 542e72fc90f5ed9eecb574f80f70868c7f296093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 26 Jan 2011 15:38:35 +0100 Subject: perf: Fix reading in perf_event_read() It is quite possible for the event to have been disabled between perf_event_read() sending the IPI and the CPU servicing the IPI and calling __perf_event_read(), hence revalidate the state. 
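The same check-then-act hazard appears in any cross-CPU request scheme:
whatever the requester observed before sending the IPI can be stale by
the time the handler runs on the target CPU, so the handler has to
revalidate under the lock. A toy userspace analogue using pthreads (all
names are invented for illustration, not kernel API):

#include <pthread.h>
#include <stdbool.h>

struct event {
        pthread_mutex_t lock;
        bool            active;
        unsigned long   count;
};

/* Stands in for __perf_event_read() running on the target CPU. The
 * requester checked 'active' before poking us, but that observation
 * may be stale: recheck under the lock before touching state that is
 * only meaningful while the event is active. */
static void remote_read(struct event *ev)
{
        pthread_mutex_lock(&ev->lock);
        if (ev->active)
                ev->count++;    /* stands in for event->pmu->read() */
        pthread_mutex_unlock(&ev->lock);
}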
Reported-by: Stephane Eranian
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 852ae8c..999835b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1901,11 +1901,12 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	update_context_time(ctx);
+	if (ctx->is_active)
+		update_context_time(ctx);
 	update_event_times(event);
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		event->pmu->read(event);
 	raw_spin_unlock(&ctx->lock);
-
-	event->pmu->read(event);
 }
 
 static inline u64 perf_event_count(struct perf_event *event)
-- cgit v1.1

From 06c3bc655697b19521901f9254eb0bbb2c67e7e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 2 Feb 2011 13:19:48 +0100
Subject: sched: Fix update_curr_rt()

cpu_stopper_thread()
  migration_cpu_stop()
    __migrate_task()
      deactivate_task()
        dequeue_task()
          dequeue_task_rt()
            update_curr_rt()

will call update_curr_rt() on rq->curr, which at that time is rq->stop.
The problem is that rq->stop.prio matches an RT prio, and update_curr_rt()
thus falsely assumes it is dealing with an rt_sched_class task.

Reported-Debugged-Tested-Acked-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra
LKML-Reference:
Cc: stable@kernel.org # .37
Signed-off-by: Ingo Molnar
---
 kernel/sched_rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec7..ad62677 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -625,7 +625,7 @@ static void update_curr_rt(struct rq *rq)
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	u64 delta_exec;
 
-	if (!task_has_rt_policy(curr))
+	if (curr->sched_class != &rt_sched_class)
 		return;
 
 	delta_exec = rq->clock_task - curr->se.exec_start;
-- cgit v1.1

From 654986462939cd7ec18f276c6379a334dac106a7 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Wed, 26 Jan 2011 17:26:22 -0500
Subject: tracepoints: Fix section alignment using pointer array

Make the tracepoints more robust, making them solid enough to handle
compiler changes by not relying on anything based on compiler-specific
behavior with respect to structure alignment. Implement an approach
proposed by David Miller: use an array of const pointers to refer to
the individual structures, and export this pointer array through the
linker script rather than the structures per se. It will consume 32
extra bytes per tracepoint (24 for structure padding and 8 for the
pointers), but is less likely to break due to compiler changes.

History:

commit 7e066fb8 ("tracepoints: add DECLARE_TRACE() and DEFINE_TRACE()")
added the aligned(32) type and variable attribute to the tracepoint
structures to deal with gcc happily aligning statically defined
structures on 32-byte multiples.

One attempt was to use an 8-byte alignment for tracepoint structures by
applying both the variable and type attribute to tracepoint structure
definitions and declarations. It worked fine with gcc 4.5.1, but broke
with gcc 4.4.4 and 4.4.5.

The reason is that the "aligned" attribute only specifies the _minimum_
alignment for a structure, leaving both the compiler and the linker free
to align on larger multiples. Because tracepoint.c expects the
structures to be placed as an array within each section, up-alignment
causes NULL-pointer exceptions due to the extra unexpected padding.

(this patch applies on top of -tip)

Signed-off-by: Mathieu Desnoyers
Acked-by: David S.
Miller LKML-Reference: <20110126222622.GA10794@Krystal> CC: Frederic Weisbecker CC: Ingo Molnar CC: Thomas Gleixner CC: Andrew Morton CC: Peter Zijlstra CC: Rusty Russell Signed-off-by: Steven Rostedt --- kernel/module.c | 16 ++++++++-------- kernel/tracepoint.c | 31 ++++++++++++++++--------------- 2 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 34e00b7..efa290e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info) #endif #ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(info, "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); + mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", + sizeof(*mod->tracepoints_ptrs), + &mod->num_tracepoints); #endif #ifdef HAVE_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", @@ -3393,7 +3393,7 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, - struct tracepoint *tp) + struct tracepoint * const *tp) { } EXPORT_SYMBOL(module_layout); @@ -3407,8 +3407,8 @@ void module_update_tracepoints(void) mutex_lock(&module_mutex); list_for_each_entry(mod, &modules, list) if (!mod->taints) - tracepoint_update_probe_range(mod->tracepoints, - mod->tracepoints + mod->num_tracepoints); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); mutex_unlock(&module_mutex); } @@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter) else if (iter_mod > iter->module) iter->tracepoint = NULL; found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints, - iter_mod->tracepoints + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + iter_mod->num_tracepoints); if (found) { iter->module = iter_mod; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e95ee7f..68187af 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -27,8 +27,8 @@ #include #include -extern struct tracepoint __start___tracepoints[]; -extern struct tracepoint __stop___tracepoints[]; +extern struct tracepoint * const __start___tracepoints_ptrs[]; +extern struct tracepoint * const __stop___tracepoints_ptrs[]; /* Set to 1 to enable tracepoint debug output */ static const int tracepoint_debug; @@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem) * * Updates the probe callback corresponding to a range of tracepoints. 
 */
-void
-tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+				   struct tracepoint * const *end)
 {
-	struct tracepoint *iter;
+	struct tracepoint * const *iter;
 	struct tracepoint_entry *mark_entry;
 
 	if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
-		mark_entry = get_tracepoint(iter->name);
+		mark_entry = get_tracepoint((*iter)->name);
 		if (mark_entry) {
-			set_tracepoint(&mark_entry, iter,
+			set_tracepoint(&mark_entry, *iter,
 					!!mark_entry->refcount);
 		} else {
-			disable_tracepoint(iter);
+			disable_tracepoint(*iter);
 		}
 	}
 	mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 static void tracepoint_update_probes(void)
 {
 	/* Core kernel tracepoints */
-	tracepoint_update_probe_range(__start___tracepoints,
-				      __stop___tracepoints);
+	tracepoint_update_probe_range(__start___tracepoints_ptrs,
+				      __stop___tracepoints_ptrs);
 	/* tracepoints in modules. */
 	module_update_tracepoints();
 }
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
 * Will return the first tracepoint in the range if the input tracepoint is
 * NULL.
 */
-int tracepoint_get_iter_range(struct tracepoint **tracepoint,
-	struct tracepoint *begin, struct tracepoint *end)
+int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+	struct tracepoint * const *begin, struct tracepoint * const *end)
 {
 	if (!*tracepoint && begin != end) {
 		*tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
 	/* Core kernel tracepoints */
 	if (!iter->module) {
 		found = tracepoint_get_iter_range(&iter->tracepoint,
-			__start___tracepoints, __stop___tracepoints);
+			__start___tracepoints_ptrs,
+			__stop___tracepoints_ptrs);
 		if (found)
 			goto end;
 	}
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
 	switch (val) {
 	case MODULE_STATE_COMING:
 	case MODULE_STATE_GOING:
-		tracepoint_update_probe_range(mod->tracepoints,
-			mod->tracepoints + mod->num_tracepoints);
+		tracepoint_update_probe_range(mod->tracepoints_ptrs,
+			mod->tracepoints_ptrs + mod->num_tracepoints);
 		break;
 	}
 	return 0;
-- cgit v1.1

From 3d56e331b6537671c66f1b510bed0f1e0331dfc8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Wed, 2 Feb 2011 17:06:09 -0500
Subject: tracing: Replace syscall_meta_data struct array with pointer array

Currently the syscall_meta structures for the syscall tracepoints are
placed in the __syscall_metadata section, and at link time, the linker
makes one large array of all these syscall metadata structures. On boot
up, this array is read (much like the initcall sections) and the
syscall data is processed.

The problem is that there is no guarantee that gcc will place complex
structures nicely together in an array format. Two structures in the
same file may be placed awkwardly, because gcc has no clue that they
are supposed to be in an array.

A hack was used previously to force the alignment to 4, to pack the
structures together. But this caused alignment issues with other
architectures (sparc).

Instead of packing the structures into an array, the structures'
addresses are now put into the __syscall_metadata section. As pointers
are always the natural alignment, gcc should always pack them tightly
together (otherwise initcall, extable, etc would also fail).
By having the pointers to the structures in the section, we can still iterate the trace_events without causing unnecessary alignment problems with other architectures, or depending on the current behaviour of gcc that will likely change in the future just to tick us kernel developers off a little more. The __syscall_metadata section is also moved into the .init.data section as it is now only needed at boot up. Suggested-by: David Miller Acked-by: David S. Miller Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index b706529..5c9fe08 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -55,20 +55,21 @@ struct ftrace_event_class event_class_syscall_exit = { .raw_init = init_syscall_trace, }; -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; static struct syscall_metadata **syscalls_metadata; -static struct syscall_metadata *find_syscall_meta(unsigned long syscall) +static __init struct syscall_metadata * +find_syscall_meta(unsigned long syscall) { - struct syscall_metadata *start; - struct syscall_metadata *stop; + struct syscall_metadata **start; + struct syscall_metadata **stop; char str[KSYM_SYMBOL_LEN]; - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { @@ -78,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall) * with "SyS" instead of "sys", leading to an unwanted * mismatch. */ - if (start->name && !strcmp(start->name + 3, str + 3)) - return start; + if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) + return *start; } return NULL; } -- cgit v1.1 From f266a5110d453b7987194460ac7edd31f1a5426c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Feb 2011 15:09:41 +0100 Subject: lockdep, timer: Fix del_timer_sync() annotation Calling local_bh_enable() will want to actually start processing softirqs, which isn't a good idea since this can get called with IRQs disabled. Cure this by using _local_bh_enable() which doesn't start processing softirqs, and use raw_local_irq_save() to avoid any softirqs from happening without letting lockdep think IRQs are in fact disabled. 
Reported-by: Nick Bowler Signed-off-by: Peter Zijlstra Reviewed-by: Yong Zhang LKML-Reference: <20110203141548.039540914@chello.nl> Signed-off-by: Thomas Gleixner --- kernel/timer.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 43ca993..d53ce66 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -969,10 +969,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); int del_timer_sync(struct timer_list *timer) { #ifdef CONFIG_LOCKDEP + unsigned long flags; + + raw_local_irq_save(flags); local_bh_disable(); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - local_bh_enable(); + _local_bh_enable(); + raw_local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it -- cgit v1.1 From 2edeaa34a6e3f2c43b667f6c4f7b27944b811695 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 7 Feb 2011 13:36:10 +0000 Subject: CRED: Fix BUG() upon security_cred_alloc_blank() failure In cred_alloc_blank() since 2.6.32, abort_creds(new) is called with new->security == NULL and new->magic == 0 when security_cred_alloc_blank() returns an error. As a result, BUG() will be triggered if SELinux is enabled or CONFIG_DEBUG_CREDENTIALS=y. If CONFIG_DEBUG_CREDENTIALS=y, BUG() is called from __invalid_creds() because cred->magic == 0. Failing that, BUG() is called from selinux_cred_free() because selinux_cred_free() is not expecting cred->security == NULL. This does not affect smack_cred_free(), tomoyo_cred_free() or apparmor_cred_free(). Fix these bugs by (1) Set new->magic before calling security_cred_alloc_blank(). (2) Handle null cred->security in creds_are_invalid() and selinux_cred_free(). Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 6a1aa00..fcf104b 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void) #endif atomic_set(&new->usage, 1); +#ifdef CONFIG_DEBUG_CREDENTIALS + new->magic = CRED_MAGIC; +#endif if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) goto error; -#ifdef CONFIG_DEBUG_CREDENTIALS - new->magic = CRED_MAGIC; -#endif return new; error: @@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred) if (cred->magic != CRED_MAGIC) return true; #ifdef CONFIG_SECURITY_SELINUX - if (selinux_is_enabled()) { + /* + * cred->security == NULL if security_cred_alloc_blank() or + * security_prepare_creds() returned an error. + */ + if (selinux_is_enabled() && cred->security) { if ((unsigned long) cred->security < PAGE_SIZE) return true; if ((*(u32 *)cred->security & 0xffffff00) == -- cgit v1.1 From fb2b2a1d37f80cc818fd4487b510f4e11816e5e1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 7 Feb 2011 13:36:16 +0000 Subject: CRED: Fix memory and refcount leaks upon security_prepare_creds() failure In prepare_kernel_cred() since 2.6.29, put_cred(new) is called without assigning new->usage when security_prepare_creds() returned an error. As a result, memory for new and refcount for new->{user,group_info,tgcred} are leaked because put_cred(new) won't call __put_cred() unless old->usage == 1. Fix these leaks by assigning new->usage (and new->subscribers which was added in 2.6.32) before calling security_prepare_creds(). 
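The underlying rule generalizes to any refcounted allocation: the count
must reach its final-owner value before any error path that drops a
reference, or the drop never hits zero and the object leaks. A toy
illustration in plain C (all names invented; the kernel's
put_cred()/__put_cred() pair follows the same shape):

#include <stdlib.h>

struct obj {
        int usage;              /* toy refcount */
};

static void obj_put(struct obj *o)
{
        if (--o->usage == 0)    /* final put actually frees */
                free(o);
}

int fallible_init(struct obj *o);   /* may fail, like security_prepare_creds() */

static struct obj *obj_prepare(const struct obj *old)
{
        struct obj *o = malloc(sizeof(*o));

        if (!o)
                return NULL;
        *o = *old;              /* copies old->usage, typically > 1 */
        o->usage = 1;           /* reset BEFORE anything can fail */

        if (fallible_init(o) < 0) {
                obj_put(o);     /* 1 -> 0: really freed, no leak */
                return NULL;
        }
        return o;
}

Had o->usage kept the copied value, the error-path obj_put() would only
decrement it, free() would never run, and o (plus everything it pins)
would leak, which is exactly the prepare_kernel_cred() bug above.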
Signed-off-by: Tetsuo Handa Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/cred.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index fcf104b..3a9d6dd 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) validate_creds(old); *new = *old; + atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); get_uid(new->user); get_group_info(new->group_info); @@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (security_prepare_creds(new, old, GFP_KERNEL) < 0) goto error; - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); put_cred(old); validate_creds(new); return new; -- cgit v1.1 From 7ff207928eb0761fa6b6c39eda82ac07a5241acf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Feb 2011 15:18:00 +0100 Subject: Revert "lockdep, timer: Fix del_timer_sync() annotation" Both attempts at trying to allow softirq usage for del_timer_sync() failed (produced bogus warnings), so revert the commit for this release: f266a5110d45: lockdep, timer: Fix del_timer_sync() annotation and try again later. Reported-by: Borislav Petkov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Yong Zhang Cc: Thomas Gleixner LKML-Reference: <1297174680.13327.107.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/timer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index d53ce66..d645992 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -959,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync); * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from - * hardirq contexts. The caller must not hold locks which would prevent + * interrupt contexts. The caller must not hold locks which would prevent * completion of the timer's handler. The timer's handler must not call * add_timer_on(). Upon exit the timer is not queued and the handler is * not running on any CPU. @@ -971,12 +971,10 @@ int del_timer_sync(struct timer_list *timer) #ifdef CONFIG_LOCKDEP unsigned long flags; - raw_local_irq_save(flags); - local_bh_disable(); + local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); - _local_bh_enable(); - raw_local_irq_restore(flags); + local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it -- cgit v1.1 From 5651f7f47dbb1cf2b95a60582546db4ff508e2b4 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 9 Feb 2011 14:02:33 -0500 Subject: watchdog, nmi: Lower the severity of error messages During boot if the hardlockup detector fails to initialize, it complains very loudly. Some failures should be expected under certain situations, ie no lapics, or resource in-use. Tone those error messages down a bit. Keep the rest at a high level. 
Reported-by: Paul Bolle Tested-by: Paul Bolle Signed-off-by: Don Zickus Cc: Peter Zijlstra LKML-Reference: <1297278153-21111-1-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f37f974..18bb157 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -363,8 +363,14 @@ static int watchdog_nmi_enable(int cpu) goto out_save; } - printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + else + printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ -- cgit v1.1 From ee24aebffb75a7f940cf52c8cf6910947b3130c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 10 Feb 2011 17:53:55 -0800 Subject: cap_syslog: accept CAP_SYS_ADMIN for now In commit ce6ada35bdf7 ("security: Define CAP_SYSLOG") Serge Hallyn introduced CAP_SYSLOG, but broke backwards compatibility by no longer accepting CAP_SYS_ADMIN as an override (it would cause a warning and then reject the operation). Re-instate CAP_SYS_ADMIN - but keeping the warning - as an acceptable capability until any legacy applications have been updated. There are apparently applications out there that drop all capabilities except for CAP_SYS_ADMIN in order to access the syslog. (This is a re-implementation of a patch by Serge, cleaning the logic up and making the code more readable) Acked-by: Serge Hallyn Reviewed-by: James Morris Signed-off-by: Linus Torvalds --- kernel/printk.c | 54 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 2ddbdc7..3623152 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -262,25 +262,47 @@ int dmesg_restrict = 1; int dmesg_restrict; #endif +static int syslog_action_restricted(int type) +{ + if (dmesg_restrict) + return 1; + /* Unless restricted, we allow "read all" and "get buffer size" for everybody */ + return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER; +} + +static int check_syslog_permissions(int type, bool from_file) +{ + /* + * If this is from /proc/kmsg and we've already opened it, then we've + * already done the capabilities checks at open time. + */ + if (from_file && type != SYSLOG_ACTION_OPEN) + return 0; + + if (syslog_action_restricted(type)) { + if (capable(CAP_SYSLOG)) + return 0; + /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ + if (capable(CAP_SYS_ADMIN)) { + WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " + "but no CAP_SYSLOG (deprecated).\n"); + return 0; + } + return -EPERM; + } + return 0; +} + int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; - int error = 0; + int error; - /* - * If this is from /proc/kmsg we only do the capabilities checks - * at open time. 
- */ - if (type == SYSLOG_ACTION_OPEN || !from_file) { - if (dmesg_restrict && !capable(CAP_SYSLOG)) - goto warn; /* switch to return -EPERM after 2.6.39 */ - if ((type != SYSLOG_ACTION_READ_ALL && - type != SYSLOG_ACTION_SIZE_BUFFER) && - !capable(CAP_SYSLOG)) - goto warn; /* switch to return -EPERM after 2.6.39 */ - } + error = check_syslog_permissions(type, from_file); + if (error) + goto out; error = security_syslog(type); if (error) @@ -423,12 +445,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } out: return error; -warn: - /* remove after 2.6.39 */ - if (capable(CAP_SYS_ADMIN)) - WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " - "but no CAP_SYSLOG (deprecated and denied).\n"); - return -EPERM; } SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) -- cgit v1.1 From 6037b715d6fab139742c3df8851db4c823081561 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Wed, 9 Feb 2011 22:11:51 -0800 Subject: security: add cred argument to security_capable() Expand security_capable() to include cred, so that it can be usable in a wider range of call sites. Signed-off-by: Chris Wright Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/capability.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 2f05303..9e9385f 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -306,7 +306,7 @@ int capable(int cap) BUG(); } - if (security_capable(cap) == 0) { + if (security_capable(current_cred(), cap) == 0) { current->flags |= PF_SUPERPRIV; return 1; } -- cgit v1.1 From 01e05e9a90b8f4c3997ae0537e87720eb475e532 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 10 Feb 2011 15:01:22 -0800 Subject: ptrace: use safer wake up on ptrace_detach() The wake_up_process() call in ptrace_detach() is spurious and not interlocked with the tracee state. IOW, the tracee could be running or sleeping in any place in the kernel by the time wake_up_process() is called. This can lead to the tracee waking up unexpectedly which can be dangerous. The wake_up is spurious and should be removed but for now reduce its toxicity by only waking up if the tracee is in TRACED or STOPPED state. This bug can possibly be used as an attack vector. I don't think it will take too much effort to come up with an attack which triggers oops somewhere. Most sleeps are wrapped in condition test loops and should be safe but we have quite a number of places where sleep and wakeup conditions are expected to be interlocked. Although the window of opportunity is tiny, ptrace can be used by non-privileged users and with some loading the window can definitely be extended and exploited. 
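For context, in kernels of this era wake_up_process() is a wrapper
around try_to_wake_up() with the catch-all TASK_ALL state mask, so it
can yank a task out of any sleep, including an uninterruptible wait
deep inside the kernel. A sketch of the distinction (prototypes
simplified):

/* Wakes the task regardless of why it is sleeping (TASK_ALL):
 * dangerous here, since the tracee may be mid-wait on an entirely
 * unrelated condition. */
wake_up_process(child);

/* Wakes the task only if it is actually sitting in one of the named
 * states; a tracee sleeping elsewhere is left alone. */
wake_up_state(child, TASK_TRACED | TASK_STOPPED);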
Signed-off-by: Tejun Heo Acked-by: Roland McGrath Acked-by: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 99bbaa3..1708b1e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) child->exit_code = data; dead = __ptrace_detach(current, child); if (!child->exit_state) - wake_up_process(child); + wake_up_state(child, TASK_TRACED | TASK_STOPPED); } write_unlock_irq(&tasklist_lock); -- cgit v1.1 From f590308536db432e4747f562b29e5858123938e9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 11 Feb 2011 19:21:25 -0800 Subject: timer debug: Hide kernel addresses via %pK in /proc/timer_list In the continuing effort to avoid kernel addresses leaking to unprivileged users, this patch switches to %pK for /proc/timer_list reporting. Signed-off-by: Kees Cook Cc: John Stultz Cc: Dan Rosenberg Cc: Eugene Teo Cc: Linus Torvalds LKML-Reference: <20110212032125.GA23571@outflux.net> Signed-off-by: Ingo Molnar --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 32a19f9..3258455 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym) char symname[KSYM_NAME_LEN]; if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%p>", sym); + SEQ_printf(m, "<%pK>", sym); else SEQ_printf(m, "%s", symname); } @@ -112,7 +112,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %p\n", base); + SEQ_printf(m, " .base: %pK\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", -- cgit v1.1 From 7576958a9d5a4a677ad7dd40901cdbb6c1110c98 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 14 Feb 2011 14:04:46 +0100 Subject: workqueue: wake up a worker when a rescuer is leaving a gcwq After executing the matching works, a rescuer leaves the gcwq whether there are more pending works or not. This may decrease the concurrency level to zero and stall execution until a new work item is queued on the gcwq. Make rescuer wake up a regular worker when it leaves a gcwq if there are more works to execute, so that execution isn't stalled. Signed-off-by: Tejun Heo Reported-by: Ray Jui Cc: stable@kernel.org --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 11869fa..90a17ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2047,6 +2047,15 @@ repeat: move_linked_works(work, scheduled, &n); process_scheduled_works(rescuer); + + /* + * Leave this gcwq. If keep_working() is %true, notify a + * regular worker; otherwise, we end up with 0 concurrency + * and stalling the execution. + */ + if (keep_working(gcwq)) + wake_up_worker(gcwq); + spin_unlock_irq(&gcwq->lock); } -- cgit v1.1 From 58a69cb47ec6991bf006a3e5d202e8571b0327a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 09:25:31 +0100 Subject: workqueue, freezer: unify spelling of 'freeze' + 'able' to 'freezable' There are two spellings in use for 'freeze' + 'able' - 'freezable' and 'freezeable'. The former is the more prominent one. 
The latter is mostly used by workqueue and in a few other odd places. Unify the spelling to 'freezable'. Signed-off-by: Tejun Heo Reported-by: Alan Stern Acked-by: "Rafael J. Wysocki" Acked-by: Greg Kroah-Hartman Acked-by: Dmitry Torokhov Cc: David Woodhouse Cc: Alex Dubov Cc: "David S. Miller" Cc: Steven Whitehouse --- kernel/power/main.c | 2 +- kernel/power/process.c | 6 +++--- kernel/workqueue.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 7b5db6a..7018530 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { - pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } diff --git a/kernel/power/process.c b/kernel/power/process.c index d6d2a10..0cf3a27 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -22,7 +22,7 @@ */ #define TIMEOUT (20 * HZ) -static inline int freezeable(struct task_struct * p) +static inline int freezable(struct task_struct * p) { if ((p == current) || (p->flags & PF_NOFREEZE) || @@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - if (frozen(p) || !freezeable(p)) + if (frozen(p) || !freezable(p)) continue; if (!freeze_task(p, sig_only)) @@ -167,7 +167,7 @@ static void thaw_tasks(bool nosig_only) read_lock(&tasklist_lock); do_each_thread(g, p) { - if (!freezeable(p)) + if (!freezable(p)) continue; if (nosig_only && should_send_signal(p)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 90a17ca..88a3e34 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2965,7 +2965,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, */ spin_lock(&workqueue_lock); - if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + if (workqueue_freezing && wq->flags & WQ_FREEZABLE) for_each_cwq_cpu(cpu, wq) get_cwq(cpu, wq)->max_active = 0; @@ -3077,7 +3077,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) spin_lock_irq(&gcwq->lock); - if (!(wq->flags & WQ_FREEZEABLE) || + if (!(wq->flags & WQ_FREEZABLE) || !(gcwq->flags & GCWQ_FREEZING)) get_cwq(gcwq->cpu, wq)->max_active = max_active; @@ -3327,7 +3327,7 @@ static int __cpuinit trustee_thread(void *__gcwq) * want to get it over with ASAP - spam rescuers, wake up as * many idlers as necessary and create new ones till the * worklist is empty. Note that if the gcwq is frozen, there - * may be frozen works in freezeable cwqs. Don't declare + * may be frozen works in freezable cwqs. Don't declare * completion while frozen. */ while (gcwq->nr_workers != gcwq->nr_idle || @@ -3585,9 +3585,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu); /** * freeze_workqueues_begin - begin freezing workqueues * - * Start freezing workqueues. After this function returns, all - * freezeable workqueues will queue new works to their frozen_works - * list instead of gcwq->worklist. + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their frozen_works list instead of + * gcwq->worklist. * * CONTEXT: * Grabs and releases workqueue_lock and gcwq->lock's. 
@@ -3613,7 +3613,7 @@ void freeze_workqueues_begin(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (cwq && wq->flags & WQ_FREEZEABLE) + if (cwq && wq->flags & WQ_FREEZABLE) cwq->max_active = 0; } @@ -3624,7 +3624,7 @@ void freeze_workqueues_begin(void) } /** - * freeze_workqueues_busy - are freezeable workqueues still busy? + * freeze_workqueues_busy - are freezable workqueues still busy? * * Check whether freezing is complete. This function must be called * between freeze_workqueues_begin() and thaw_workqueues(). @@ -3633,8 +3633,8 @@ void freeze_workqueues_begin(void) * Grabs and releases workqueue_lock. * * RETURNS: - * %true if some freezeable workqueues are still busy. %false if - * freezing is complete. + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. */ bool freeze_workqueues_busy(void) { @@ -3654,7 +3654,7 @@ bool freeze_workqueues_busy(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZABLE)) continue; BUG_ON(cwq->nr_active < 0); @@ -3699,7 +3699,7 @@ void thaw_workqueues(void) list_for_each_entry(wq, &workqueues, list) { struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); - if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + if (!cwq || !(wq->flags & WQ_FREEZABLE)) continue; /* restore max_active and repopulate worklist */ -- cgit v1.1 From 3233cdbd9fa347a6d6897a94cc6ed0302ae83c4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 16 Feb 2011 18:10:19 +0100 Subject: workqueue: make sure MAYDAY_INITIAL_TIMEOUT is at least 2 jiffies long MAYDAY_INITIAL_TIMEOUT is defined as HZ / 100 and depending on configuration may end up 0 or 1. Even when it's 1, depending on when the mayday timer is added in the current jiffy interval, it may expire way before a jiffy has passed. Make sure MAYDAY_INITIAL_TIMEOUT is at least two to guarantee that at least a full jiffy has passed before calling rescuers. Signed-off-by: Tejun Heo Reported-by: Ray Jui Cc: stable@kernel.org --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 88a3e34..ee6578b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -79,7 +79,9 @@ enum { MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ - MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, + /* call for help after 10ms + (min two ticks) */ MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ CREATE_COOLDOWN = HZ, /* time to breath after fail */ TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ -- cgit v1.1 From 2e725a065b0153f0c449318da1923a120477633d Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Sat, 12 Feb 2011 21:06:51 +0100 Subject: PM / Hibernate: Return error code when alloc_image_page() fails Currently we return 0 in swsusp_alloc() when alloc_image_page() fails. Fix that. Also remove unneeded "error" variable since the only useful value of error is -ENOMEM. [rjw: Fixed up the changelog and changed subject.] Signed-off-by: Stanislaw Gruszka Cc: stable@kernel.org Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 0dac75e..64db648 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1519,11 +1519,8 @@ static int swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, unsigned int nr_pages, unsigned int nr_highmem) { - int error = 0; - if (nr_highmem > 0) { - error = get_highmem_buffer(PG_ANY); - if (error) + if (get_highmem_buffer(PG_ANY)) goto err_out; if (nr_highmem > alloc_highmem) { nr_highmem -= alloc_highmem; @@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, err_out: swsusp_free(); - return error; + return -ENOMEM; } asmlinkage int swsusp_save(void) -- cgit v1.1
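The class of bug fixed here is worth spelling out: an error variable
that is only assigned on the first failure path silently turns later
failures into success. A minimal sketch (toy names, not the swsusp
code):

#include <errno.h>

int step_a(void);               /* hypothetical fallible steps */
int step_b_alloc(void);
void cleanup(void);

/* Before: 'error' is set only by the first fallible step, so a
 * failure in a later step falls through to 'return error' while it
 * still holds 0, and the caller sees success. */
int setup_before(void)
{
        int error = 0;

        if (step_a())
                error = -ENOMEM;
        if (error)
                goto err_out;
        if (!step_b_alloc())    /* failure leaves error == 0 */
                goto err_out;
        return 0;
 err_out:
        cleanup();
        return error;           /* may wrongly return 0 */
}

/* After: there is only one failure reason, so drop the variable and
 * return the code explicitly, as the patch above does. */
int setup_after(void)
{
        if (step_a())
                goto err_out;
        if (!step_b_alloc())
                goto err_out;
        return 0;
 err_out:
        cleanup();
        return -ENOMEM;
}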