From 1087e9b4ff708976499b4de541d9e1d57d49b60a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sun, 4 Oct 2009 02:20:11 +0200 Subject: HWPOISON: Clean up PR_MCE_KILL interface While writing the manpage I noticed some shortcomings in the current interface. - Define symbolic names for all the different values - Boundary check the kill mode values - For symmetry add a get interface too. This allows library code to get/set the current state. - For consistency define a PR_MCE_KILL_DEFAULT value Signed-off-by: Andi Kleen --- kernel/sys.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 255475d..f6afe07 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1546,24 +1546,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg4 | arg5) return -EINVAL; switch (arg2) { - case 0: + case PR_MCE_KILL_CLEAR: if (arg3 != 0) return -EINVAL; current->flags &= ~PF_MCE_PROCESS; break; - case 1: + case PR_MCE_KILL_SET: current->flags |= PF_MCE_PROCESS; - if (arg3 != 0) + if (arg3 == PR_MCE_KILL_EARLY) current->flags |= PF_MCE_EARLY; - else + else if (arg3 == PR_MCE_KILL_LATE) current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; break; default: return -EINVAL; } error = 0; break; - + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; default: error = -EINVAL; break; -- cgit v1.1 From d58e6576b0deec6f0b9ff8450fe282da18c50883 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 20:40:43 +0200 Subject: futex: Handle spurious wake up The futex code does not handle spurious wake up in futex_wait and futex_wait_requeue_pi. The code assumes that any wake up which was not caused by futex_wake / requeue or by a timeout was caused by a signal wake up and returns one of the syscall restart error codes. In case of a spurious wake up the signal delivery code which deals with the restart error codes is not invoked and we return that error code to user space. That causes applications which actually check the return codes to fail. Blaise reported that on preempt-rt a python test program run into a exception trap. -rt exposed that due to a built in spurious wake up accelerator :) Solve this by checking signal_pending(current) in the wake up path and handle the spurious wake up case w/o returning to user space. Reported-by: Blaise Gassend Debugged-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: --- kernel/futex.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4949d33..5c88839 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1791,6 +1791,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1808,9 +1809,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2118,9 +2124,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, */ plist_del(&q->list, &q->list.plist); + /* Handle spurious wakeups gracefully */ + ret = -EAGAIN; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; @@ -2198,6 +2206,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; +retry: key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2292,6 +2301,9 @@ out_put_keys: out_key2: put_futex_key(fshared, &key2); + /* Spurious wakeup ? */ + if (ret == -EAGAIN) + goto retry; out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.1 From 92f6a5e37a2e2d3342dafb2b39c2f8bc340bbf84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2009 12:43:07 +0200 Subject: sched: Do less agressive buddy clearing Yanmin reported a hackbench regression due to: > commit de69a80be32445b0a71e8e3b757e584d7beb90f7 > Author: Peter Zijlstra > Date: Thu Sep 17 09:01:20 2009 +0200 > > sched: Stop buddies from hogging the system I really liked de69a80b, and it affecting hackbench shows I wasn't crazy ;-) So hackbench is a multi-cast, with one sender spraying multiple receivers, who in their turn don't spray back. This would be exactly the scenario that patch 'cures'. Previously we would not clear the last buddy after running the next task, allowing the sender to get back to work sooner than it otherwise ought to have been, increasing latencies for other tasks. Now, since those receivers don't poke back, they don't enforce the buddy relation, which means there's nothing to re-elect the sender. Cure this by less agressively clearing the buddy stats. Only clear buddies when they were not chosen. It should still avoid a buddy sticking around long after its served its time. Reported-by: "Zhang, Yanmin" Signed-off-by: Peter Zijlstra CC: Mike Galbraith LKML-Reference: <1255084986.8802.46.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b4..c32c3e6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *buddy; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) - return cfs_rq->next; + if (cfs_rq->next) { + buddy = cfs_rq->next; + cfs_rq->next = NULL; + if (wakeup_preempt_entity(buddy, se) < 1) + return buddy; + } - if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) - return cfs_rq->last; + if (cfs_rq->last) { + buddy = cfs_rq->last; + cfs_rq->last = NULL; + if (wakeup_preempt_entity(buddy, se) < 1) + return buddy; + } return se; } @@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) do { se = pick_next_entity(cfs_rq); - /* - * If se was a buddy, clear it so that it will have to earn - * the favour again. - * - * If se was not a buddy, clear the buddies because neither - * was elegible to run, let them earn it again. - * - * IOW. unconditionally clear buddies. - */ - __clear_buddies(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); -- cgit v1.1 From 2bc872036e1c5948b5b02942810bbdd8dbdb9812 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Wed, 14 Oct 2009 10:12:39 -0700 Subject: futex: Check for NULL keys in match_futex If userspace tries to perform a requeue_pi on a non-requeue_pi waiter, it will find the futex_q->requeue_pi_key to be NULL and OOPS. Check for NULL in match_futex() instead of doing explicit NULL pointer checks on all call sites. While match_futex(NULL, NULL) returning false is a little odd, it's still correct as we expect valid key references. Signed-off-by: Darren Hart Cc: Peter Zijlstra Cc: Ingo Molnar CC: Eric Dumazet CC: Dinakar Guniguntala CC: John Stultz Cc: stable@kernel.org LKML-Reference: <4AD60687.10306@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5c88839..06938e5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key) */ static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return (key1->both.word == key2->both.word + return (key1 && key2 + && key1->both.word == key2->both.word && key1->both.ptr == key2->both.ptr && key1->both.offset == key2->both.offset); } -- cgit v1.1 From 8c53e46314562fe814b0afef6cfcbd2f562b017c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 14 Oct 2009 09:16:42 -0700 Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed work It basically turns a delayed work into an immediate work, and then waits for it to finish, thus allowing you to force (and wait for) an immediate flush of a delayed work. We'll want to use this in the tty layer to clean up tty_flush_to_ldisc(). Acked-by: Oleg Nesterov [ Fixed to use 'del_timer_sync()' as noted by Oleg ] Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2d..47cdd7e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -640,6 +640,24 @@ int schedule_delayed_work(struct delayed_work *dwork, EXPORT_SYMBOL(schedule_delayed_work); /** + * flush_delayed_work - block until a dwork_struct's callback has terminated + * @dwork: the delayed work which is to be flushed + * + * Any timeout is cancelled, and any pending work is run immediately. + */ +void flush_delayed_work(struct delayed_work *dwork) +{ + if (del_timer_sync(&dwork->timer)) { + struct cpu_workqueue_struct *cwq; + cwq = wq_per_cpu(keventd_wq, get_cpu()); + __queue_work(cwq, &dwork->work); + put_cpu(); + } + flush_work(&dwork->work); +} +EXPORT_SYMBOL(flush_delayed_work); + +/** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done -- cgit v1.1 From 37c72e56f6b234ea7387ba530434a80abf2658d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Oct 2009 10:15:55 -0700 Subject: rcu: Prevent RCU IPI storms in presence of high call_rcu() load As the number of callbacks on a given CPU rises, invoke force_quiescent_state() only every blimit number of callbacks (defaults to 10,000), and even then only if no other CPU has invoked force_quiescent_state() in the meantime. This should fix the performance regression reported by Nick. Reported-by: Nick Piggin Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: jens.axboe@oracle.com LKML-Reference: <12555405592133-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 29 ++++++++++++++++++++++++----- kernel/rcutree.h | 4 ++++ 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 705f02a..ddbf111 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -958,7 +958,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -1011,6 +1011,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1224,7 +1231,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1288,10 +1295,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1523,6 +1540,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b40ac57..599161f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -167,6 +167,10 @@ struct rcu_data { struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ #ifdef CONFIG_NO_HZ -- cgit v1.1 From 019129d595caaa5bd0b41d128308da1be6a91869 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Oct 2009 10:15:56 -0700 Subject: rcu: Stopgap fix for synchronize_rcu_expedited() for TREE_PREEMPT_RCU For the short term, map synchronize_rcu_expedited() to synchronize_rcu() for TREE_PREEMPT_RCU and to synchronize_sched_expedited() for TREE_RCU. Longer term, there needs to be a real expedited grace period for TREE_PREEMPT_RCU, but candidate patches to date are considerably more complex and intrusive. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: npiggin@suse.de Cc: jens.axboe@oracle.com LKML-Reference: <12555405592331-git-send-email-> Signed-off-by: Ingo Molnar --- kernel/rcutree_plugin.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c0cb783..ebd20ee 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -393,6 +393,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period. We are supposed to expedite the + * grace period, but this is the crude slow compatability hack, so just + * invoke synchronize_rcu(). + */ +void synchronize_rcu_expedited(void) +{ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* * Check to see if there is any immediate preemptable-RCU-related work * to be done. */ @@ -565,6 +576,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) EXPORT_SYMBOL_GPL(call_rcu); /* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptable RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* * Because preemptable RCU does not exist, it never has any work to do. */ static int rcu_preempt_pending(int cpu) -- cgit v1.1 From 237c80c5c8fb7ec128cf2a756b550dc41ad7eac7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Oct 2009 09:26:14 -0700 Subject: rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang If the following sequence of events occurs, then TREE_PREEMPT_RCU will hang waiting for a grace period to complete, eventually OOMing the system: o A TREE_PREEMPT_RCU build of the kernel is booted on a system with more than 64 physical CPUs present (32 on a 32-bit system). Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted with RCU_FANOUT set to a sufficiently small value that the physical CPUs populate two or more leaf rcu_node structures. o A task is preempted in an RCU read-side critical section while running on a CPU corresponding to a given leaf rcu_node structure. o All CPUs corresponding to this same leaf rcu_node structure record quiescent states for the current grace period. o All of these same CPUs go offline (hence the need for enough physical CPUs to populate more than one leaf rcu_node structure). This causes the preempted task to be moved to the root rcu_node structure. At this point, there is nothing left to cause the quiescent state to be propagated up the rcu_node tree, so the current grace period never completes. The simplest fix, especially after considering the deadlock possibilities, is to detect this situation when the last CPU is offlined, and to set that CPU's ->qsmask bit in its leaf rcu_node structure. This will cause the next invocation of force_quiescent_state() to end the grace period. Without this fix, this hang can be triggered in an hour or so on some machines with rcutorture and random CPU onlining/offlining. With this fix, these same machines pass a full 10 hours of this sort of abuse. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 15 ++++++++++++++- kernel/rcutree.h | 6 +++--- kernel/rcutree_plugin.h | 25 +++++++++++++++++-------- 3 files changed, 34 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ddbf111..0536125 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + + /* + * If there was a task blocking the current grace period, + * and if all CPUs have checked in, we need to propagate + * the quiescent state up the rcu_node hierarchy. But that + * is inconvenient at the moment due to deadlock issues if + * this should end the current grace period. So set the + * offlined CPU's bit in ->qsmask in order to force the + * next force_quiescent_state() invocation to clean up this + * mess in a deadlock-free manner. + */ + if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask) + rnp->qsmask |= mask; + mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 599161f..1823c6e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -306,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp); +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_preempt_check_callbacks(int cpu); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index ebd20ee..ef2a58c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) * parent is to remove the need for rcu_read_unlock_special() to * make more than two attempts to acquire the target rcu_node's lock. * + * Returns 1 if there was previously a task blocking the current grace + * period on the specified rcu_node structure. + * * The caller must hold rnp->lock with irqs disabled. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { int i; struct list_head *lp; struct list_head *lp_root; + int retval = rcu_preempted_readers(rnp); struct rcu_node *rnp_root = rcu_get_root(rsp); struct task_struct *tp; if (rnp == rnp_root) { WARN_ONCE(1, "Last CPU thought to be offlined?"); - return; /* Shouldn't happen: at least one CPU online. */ + return 0; /* Shouldn't happen: at least one CPU online. */ } WARN_ON_ONCE(rnp != rdp->mynode && (!list_empty(&rnp->blocked_tasks[0]) || @@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp, spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } + + return retval; } /* @@ -532,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) /* * Because preemptable RCU does not exist, it never needs to migrate - * tasks that were blocked within RCU read-side critical sections. + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. */ -static void rcu_preempt_offline_tasks(struct rcu_state *rsp, - struct rcu_node *rnp, - struct rcu_data *rdp) +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) { + return 0; } /* -- cgit v1.1 From 89061d3d58e1f0742139605dc6a7950aa1ecc019 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 15 Oct 2009 15:30:48 -0700 Subject: futex: Move drop_futex_key_refs out of spinlock'ed region When requeuing tasks from one futex to another, the reference held by the requeued task to the original futex location needs to be dropped eventually. Dropping the reference may ultimately lead to a call to "iput_final" and subsequently call into filesystem- specific code - which may be non-atomic. It is therefore safer to defer this drop operation until after the futex_hash_bucket spinlock has been dropped. Originally-From: Helge Bahmann Signed-off-by: Darren Hart Cc: Cc: Peter Zijlstra Cc: Eric Dumazet Cc: Dinakar Guniguntala Cc: John Stultz Cc: Sven-Thorsten Dietrich Cc: John Kacur LKML-Reference: <4AD7A298.5040802@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/futex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 06938e5..642f3bb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1029,7 +1029,6 @@ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, struct futex_hash_bucket *hb) { - drop_futex_key_refs(&q->key); get_futex_key_refs(key); q->key = *key; @@ -1227,6 +1226,7 @@ retry_private: */ if (ret == 1) { WARN_ON(pi_state); + drop_count++; task_count++; ret = get_futex_value_locked(&curval2, uaddr2); if (!ret) @@ -1305,6 +1305,7 @@ retry_private: if (ret == 1) { /* We got the lock. */ requeue_pi_wake_futex(this, &key2, hb2); + drop_count++; continue; } else if (ret) { /* -EDEADLK */ -- cgit v1.1 From 65a64464349883891e21e74af16c05d6e1eeb4e9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 14 Oct 2009 06:22:47 +0200 Subject: HWPOISON: Allow schedule_on_each_cpu() from keventd Right now when calling schedule_on_each_cpu() from keventd there is a deadlock because it tries to schedule a work item on the current CPU too. This happens via lru_add_drain_all() in hwpoison. Just call the function for the current CPU in this case. This is actually faster too. Debugging with Fengguang Wu & Max Asbock Signed-off-by: Andi Kleen --- kernel/workqueue.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index addfe2d..f61a2fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -667,21 +667,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; + int orig = -1; struct work_struct *works; works = alloc_percpu(struct work_struct); if (!works) return -ENOMEM; + /* + * when running in keventd don't schedule a work item on itself. + * Can just call directly because the work queue is already bound. + * This also is faster. + * Make this a generic parameter for other workqueues? + */ + if (current_is_keventd()) { + orig = raw_smp_processor_id(); + INIT_WORK(per_cpu_ptr(works, orig), func); + func(per_cpu_ptr(works, orig)); + } + get_online_cpus(); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); + if (cpu == orig) + continue; INIT_WORK(work, func); schedule_work_on(cpu, work); } - for_each_online_cpu(cpu) - flush_work(per_cpu_ptr(works, cpu)); + for_each_online_cpu(cpu) { + if (cpu != orig) + flush_work(per_cpu_ptr(works, cpu)); + } put_online_cpus(); free_percpu(works); return 0; -- cgit v1.1 From 04bf7539c08d64184736cdc5e4ad617eda77eb0f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 20 Oct 2009 06:45:02 +0200 Subject: PM: Make warning in suspend_test_finish() less likely to happen Increase TEST_SUSPEND_SECONDS to 10 so the warning in suspend_test_finish() doesn't annoy the users of slower systems so much. Also, make the warning print the suspend-resume cycle time, so that we know why the warning actually triggered. Patch prepared during the hacking session at the Kernel Summit in Tokyo. Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- kernel/power/suspend_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 17d8bb1..25596e4 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -19,7 +19,7 @@ * The time it takes is system-specific though, so when we test this * during system bootup we allow a LOT of time. */ -#define TEST_SUSPEND_SECONDS 5 +#define TEST_SUSPEND_SECONDS 10 static unsigned long suspend_test_start_time; @@ -49,7 +49,8 @@ void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... */ - WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), + "Component: %s, time: %u\n", label, msec); } /* -- cgit v1.1 From 721a669b7225edeeb0ca8e2bf71b83882326a71b Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Tue, 15 Sep 2009 14:33:08 +0200 Subject: perf events: Fix swevent hrtimer sampling by keeping track of remaining time when enabling/disabling swevent hrtimers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the hrtimer based events work for sysprof. Whenever a swevent is scheduled out, the hrtimer is canceled. When it is scheduled back in, the timer is restarted. This happens every scheduler tick, which means the timer never expired because it was getting repeatedly restarted over and over with the same period. To fix that, save the remaining time when disabling; when reenabling, use that saved time as the period instead of the user-specified sampling period. Also, move the starting and stopping of the hrtimers to helper functions instead of duplicating the code. Signed-off-by: Søren Sandmann Pedersen LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 61 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index afb7ef3..33ff019 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3969,6 +3969,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) return ret; } +static void perf_swevent_start_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period; + + if (hwc->remaining) { + if (hwc->remaining < 0) + period = 10000; + else + period = hwc->remaining; + hwc->remaining = 0; + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } +} + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + hwc->remaining = ktime_to_ns(remaining); + + hrtimer_cancel(&hwc->hrtimer); + } +} + /* * Software event: cpu wall time clock */ @@ -3991,22 +4027,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + perf_swevent_start_hrtimer(event); return 0; } static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); cpu_clock_perf_event_update(event); } @@ -4043,22 +4071,15 @@ static int task_clock_perf_event_enable(struct perf_event *event) now = event->ctx->time; atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } + + perf_swevent_start_hrtimer(event); return 0; } static void task_clock_perf_event_disable(struct perf_event *event) { - if (event->hw.sample_period) - hrtimer_cancel(&event->hw.hrtimer); + perf_swevent_cancel_hrtimer(event); task_clock_perf_event_update(event, event->ctx->time); } -- cgit v1.1 From 54f4407608c59712a8f5ec1e10dfac40bef5a2e7 Mon Sep 17 00:00:00 2001 From: Soeren Sandmann Date: Thu, 22 Oct 2009 18:34:08 +0200 Subject: perf events: Don't generate events for the idle task when exclude_idle is set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Getting samples for the idle task is often not interesting, so don't generate them when exclude_idle is set for the event in question. Signed-off-by: Søren Sandmann Pedersen Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 33ff019..7f29643 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3959,8 +3959,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; } period = max_t(u64, 10000, event->hw.sample_period); -- cgit v1.1 From cf8517cf905b5cd31d5790250b9ac39f7cb8aa53 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:16 -0400 Subject: tracing: Update *ppos instead of filp->f_pos Instead of directly updating filp->f_pos we should update the *ppos argument. The filp->f_pos gets updated within the file_pos_write() function called from sys_write(). Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.399670810@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 37ba67e..9c451a1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -740,7 +740,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&ftrace_profile_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c820b03..b20d3ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, return ret; } - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, } mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + *ppos += cnt; return cnt; } @@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (err) return err; - filp->f_pos += ret; + *ppos += ret; return ret; } @@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } } - filp->f_pos += cnt; + *ppos += cnt; /* If check pages failed, return ENOMEM */ if (tracing_disabled) -- cgit v1.1 From 3e69533b51930a7169235db2caf703884e6e3bbb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:17 -0400 Subject: tracing: Fix trace_seq_printf() return value trace_seq_printf() return value is a little ambiguous. It currently returns the length of the space available in the buffer. printf usually returns the amount written. This is not adequate here, because: trace_seq_printf(s, ""); is perfectly legal, and returning 0 would indicate that it failed. We can always see the amount written by looking at the before and after values of s->len. This is not quite the same use as printf. We only care if the string was successfully written to the buffer or not. Make trace_seq_printf() return 0 if the trace oversizes the buffer's free space, 1 otherwise. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.631787612@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed17565..b6c12c6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) * @s: trace sequence descriptor * @fmt: printf format string * + * It returns 0 if the trace oversizes the buffer's free + * space, 1 otherwise. + * * The tracer may use either sequence operations or its own * copy to user routines. To simplify formating of a trace * trace_seq_printf is used to store strings into a special @@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) s->len += ret; - return len; + return 1; } EXPORT_SYMBOL_GPL(trace_seq_printf); -- cgit v1.1 From 67b394f7f26d84edb7294cc6528ab7ca6daa2ad1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:18 -0400 Subject: tracing: Fix comment typo and documentation example Trivial patch to fix a documentation example and to fix a comment. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233646.871719877@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d4ff019..217f699 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2681,7 +2681,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_entries); /** - * ring_buffer_overrun_cpu - get the number of overruns in buffer + * ring_buffer_overruns - get the number of overruns in buffer * @buffer: The ring buffer * * Returns the total number of overruns in the ring buffer -- cgit v1.1 From 6d3f1e12f46a2f9a1bb7e7aa433df8dd31ce5647 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Oct 2009 19:36:19 -0400 Subject: tracing: Remove cpu arg from the rb_time_stamp() function The cpu argument is not used inside the rb_time_stamp() function. Plus fix a typo. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <20091023233647.118547500@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 217f699..3ffa502 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -483,7 +483,7 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 -static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +static inline u64 rb_time_stamp(struct ring_buffer *buffer) { /* shift to debug/test normalization and TIME_EXTENTS */ return buffer->clock() << DEBUG_SHIFT; @@ -494,7 +494,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) u64 time; preempt_disable_notrace(); - time = rb_time_stamp(buffer, cpu); + time = rb_time_stamp(buffer); preempt_enable_no_resched_notrace(); return time; @@ -599,7 +599,7 @@ static struct list_head *rb_list_head(struct list_head *list) } /* - * rb_is_head_page - test if the give page is the head page + * rb_is_head_page - test if the given page is the head page * * Because the reader may move the head_page pointer, we can * not trust what the head page is (it may be pointing to @@ -1868,7 +1868,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, * Nested commits always have zero deltas, so * just reread the time stamp */ - *ts = rb_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer); next_page->page->time_stamp = *ts; } @@ -2111,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) goto out_fail; - ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer); /* * Only the first commit can update the timestamp. -- cgit v1.1 From 4a6cc4bd32e580722882115d4c8b964d732c11e4 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 29 Oct 2009 00:26:00 +0900 Subject: sched: move rq_weight data array out of .percpu Commit 34d76c41 introduced percpu array update_shares_data, size of which being proportional to NR_CPUS. Unfortunately this blows up ia64 for large NR_CPUS configuration, as ia64 allows only 64k for .percpu section. Fix this by allocating this array dynamically and keep only pointer to it percpu. The per-cpu handling doesn't impose significant performance penalty on potentially contented path in tg_shares_up(). ... ffffffff8104337c: 65 48 8b 14 25 20 cd mov %gs:0xcd20,%rdx ffffffff81043383: 00 00 ffffffff81043385: 48 c7 c0 00 e1 00 00 mov $0xe100,%rax ffffffff8104338c: 48 c7 45 a0 00 00 00 movq $0x0,-0x60(%rbp) ffffffff81043393: 00 ffffffff81043394: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp) ffffffff8104339b: 00 ffffffff8104339c: 48 01 d0 add %rdx,%rax ffffffff8104339f: 49 8d 94 24 08 01 00 lea 0x108(%r12),%rdx ffffffff810433a6: 00 ffffffff810433a7: b9 ff ff ff ff mov $0xffffffff,%ecx ffffffff810433ac: 48 89 45 b0 mov %rax,-0x50(%rbp) ffffffff810433b0: bb 00 04 00 00 mov $0x400,%ebx ffffffff810433b5: 48 89 55 c0 mov %rdx,-0x40(%rbp) ... After: ... ffffffff8104337c: 65 8b 04 25 28 cd 00 mov %gs:0xcd28,%eax ffffffff81043383: 00 ffffffff81043384: 48 98 cltq ffffffff81043386: 49 8d bc 24 08 01 00 lea 0x108(%r12),%rdi ffffffff8104338d: 00 ffffffff8104338e: 48 8b 15 d3 7f 76 00 mov 0x767fd3(%rip),%rdx # ffffffff817ab368 ffffffff81043395: 48 8b 34 c5 00 ee 6d mov -0x7e921200(,%rax,8),%rsi ffffffff8104339c: 81 ffffffff8104339d: 48 c7 45 a0 00 00 00 movq $0x0,-0x60(%rbp) ffffffff810433a4: 00 ffffffff810433a5: b9 ff ff ff ff mov $0xffffffff,%ecx ffffffff810433aa: 48 89 7d c0 mov %rdi,-0x40(%rbp) ffffffff810433ae: 48 c7 45 a8 00 00 00 movq $0x0,-0x58(%rbp) ffffffff810433b5: 00 ffffffff810433b6: bb 00 04 00 00 mov $0x400,%ebx ffffffff810433bb: 48 01 f2 add %rsi,%rdx ffffffff810433be: 48 89 55 b0 mov %rdx,-0x50(%rbp) ... Signed-off-by: Jiri Kosina Acked-by: Ingo Molnar Signed-off-by: Tejun Heo --- kernel/sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ee61f45..526d237 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1563,11 +1563,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -struct update_shares_data { - unsigned long rq_weight[NR_CPUS]; -}; - -static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); +static __read_mostly unsigned long *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1577,12 +1573,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight, - struct update_shares_data *usd) + unsigned long *usd_rq_weight) { unsigned long shares, rq_weight; int boost = 0; - rq_weight = usd->rq_weight[cpu]; + rq_weight = usd_rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; @@ -1617,7 +1613,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, static int tg_shares_up(struct task_group *tg, void *data) { unsigned long weight, rq_weight = 0, shares = 0; - struct update_shares_data *usd; + unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; int i; @@ -1626,11 +1622,11 @@ static int tg_shares_up(struct task_group *tg, void *data) return 0; local_irq_save(flags); - usd = &__get_cpu_var(update_shares_data); + usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); for_each_cpu(i, sched_domain_span(sd)) { weight = tg->cfs_rq[i]->load.weight; - usd->rq_weight[i] = weight; + usd_rq_weight[i] = weight; /* * If there are currently no tasks on the cpu pretend there @@ -1651,7 +1647,7 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, usd); + update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); local_irq_restore(flags); @@ -9406,6 +9402,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_GROUP_SCHED */ +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP + update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), + __alignof__(unsigned long)); +#endif for_each_possible_cpu(i) { struct rq *rq; -- cgit v1.1 From 11df6dddcbc38affb7473aad3d962baf8414a947 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 28 Oct 2009 20:26:48 +0100 Subject: futex: Fix spurious wakeup for requeue_pi really The requeue_pi path doesn't use unqueue_me() (and the racy lock_ptr == NULL test) nor does it use the wake_list of futex_wake() which where the reason for commit 41890f2 (futex: Handle spurious wake up) See debugging discussing on LKML Message-ID: <4AD4080C.20703@us.ibm.com> The changes in this fix to the wait_requeue_pi path were considered to be a likely unecessary, but harmless safety net. But it turns out that due to the fact that for unknown $@#!*( reasons EWOULDBLOCK is defined as EAGAIN we built an endless loop in the code path which returns correctly EWOULDBLOCK. Spurious wakeups in wait_requeue_pi code path are unlikely so we do the easy solution and return EWOULDBLOCK^WEAGAIN to user space and let it deal with the spurious wakeup. Cc: Darren Hart Cc: Peter Zijlstra Cc: Eric Dumazet Cc: John Stultz Cc: Dinakar Guniguntala LKML-Reference: <4AE23C74.1090502@us.ibm.com> Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/futex.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 642f3bb..fb65e82 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2127,7 +2127,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, plist_del(&q->list, &q->list.plist); /* Handle spurious wakeups gracefully */ - ret = -EAGAIN; + ret = -EWOULDBLOCK; if (timeout && !timeout->task) ret = -ETIMEDOUT; else if (signal_pending(current)) @@ -2208,7 +2208,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, debug_rt_mutex_init_waiter(&rt_waiter); rt_waiter.task = NULL; -retry: key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2303,9 +2302,6 @@ out_put_keys: out_key2: put_futex_key(fshared, &key2); - /* Spurious wakeup ? */ - if (ret == -EAGAIN) - goto retry; out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.1 From 65afac7d80ab3bc9f81e75eafb71eeb92a3ebdef Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 08:56:16 -0600 Subject: param: fix lots of bugs with writing charp params from sysfs, by leaking mem. e180a6b7759a "param: fix charp parameters set via sysfs" fixed the case where charp parameters written via sysfs were freed, leaving drivers accessing random memory. Unfortunately, storing a flag in the kparam struct was a bad idea: it's rodata so setting it causes an oops on some archs. But that's not all: 1) module_param_array() on charp doesn't work reliably, since we use an uninitialized temporary struct kernel_param. 2) there's a fundamental race if a module uses this parameter and then it's changed: they will still access the old, freed, memory. The simplest fix (ie. for 2.6.32) is to never free the memory. This prevents all these problems, at cost of a memory leak. In practice, there are only 18 places where a charp is writable via sysfs, and all are root-only writable. Reported-by: Takashi Iwai Cc: Sitsofe Wheeler Cc: Frederic Weisbecker Cc: Christof Schmitt Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/params.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 9da58ea..95ef27c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -218,13 +218,9 @@ int param_set_charp(const char *val, struct kernel_param *kp) return -ENOSPC; } - if (kp->flags & KPARAM_KMALLOCED) - kfree(*(char **)kp->arg); - /* This is a hack. We can't need to strdup in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - kp->flags |= KPARAM_KMALLOCED; *(char **)kp->arg = kstrdup(val, GFP_KERNEL); if (!kp->arg) return -ENOMEM; @@ -605,11 +601,7 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - unsigned int i; - - for (i = 0; i < num; i++) - if (params[i].flags & KPARAM_KMALLOCED) - kfree(*(char **)params[i].arg); + /* FIXME: This should free kmalloced charp parameters. It doesn't. */ } static void __init kernel_add_sysfs_param(const char *name, -- cgit v1.1 From d553ad864e3b3dde3f1038d491e207021b2d6293 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 08:56:17 -0600 Subject: param: fix NULL comparison on oom kp->arg is always true: it's the contents of that pointer we care about. Reported-by: Takashi Iwai Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 95ef27c..00520c4 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -222,7 +222,7 @@ int param_set_charp(const char *val, struct kernel_param *kp) * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { *(char **)kp->arg = kstrdup(val, GFP_KERNEL); - if (!kp->arg) + if (!*(char **)kp->arg) return -ENOMEM; } else *(const char **)kp->arg = val; -- cgit v1.1 From 3c7d76e371ac1a3802ae1673f5c63554af59325c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 08:56:19 -0600 Subject: param: fix setting arrays of bool We create a dummy struct kernel_param on the stack for parsing each array element, but we didn't initialize the flags word. This matters for arrays of type "bool", where the flag indicates if it really is an array of bools or unsigned int (old-style). Reported-by: Takashi Iwai Signed-off-by: Rusty Russell Cc: stable@kernel.org --- kernel/params.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 00520c4..d656c27 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -300,6 +300,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, struct kernel_param *kp), + u16 flags, unsigned int *num) { int ret; @@ -309,6 +310,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; + kp.flags = flags; /* No equals sign? */ if (!val) { @@ -354,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, arr->num ?: &temp_num); + arr->elemsize, arr->set, kp->flags, + arr->num ?: &temp_num); } int param_array_get(char *buffer, struct kernel_param *kp) -- cgit v1.1 From 0d0df599f1f11f12d589318bacb59a50fb5c0310 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 26 Oct 2009 16:49:34 -0700 Subject: connector: fix regression introduced by sid connector Since commit 02b51df1b07b4e9ca823c89284e704cadb323cd1 (proc connector: add event for process becoming session leader) we have the following warning: Badness at kernel/softirq.c:143 [...] Krnl PSW : 0404c00180000000 00000000001481d4 (local_bh_enable+0xb0/0xe0) [...] Call Trace: ([<000000013fe04100>] 0x13fe04100) [<000000000048a946>] sk_filter+0x9a/0xd0 [<000000000049d938>] netlink_broadcast+0x2c0/0x53c [<00000000003ba9ae>] cn_netlink_send+0x272/0x2b0 [<00000000003baef0>] proc_sid_connector+0xc4/0xd4 [<0000000000142604>] __set_special_pids+0x58/0x90 [<0000000000159938>] sys_setsid+0xb4/0xd8 [<00000000001187fe>] sysc_noemu+0x10/0x16 [<00000041616cb266>] 0x41616cb266 The warning is ---> WARN_ON_ONCE(in_irq() || irqs_disabled()); The network code must not be called with disabled interrupts but sys_setsid holds the tasklist_lock with spinlock_irq while calling the connector. After a discussion we agreed that we can move proc_sid_connector from __set_special_pids to sys_setsid. We also agreed that it is sufficient to change the check from task_session(curr) != pid into err > 0, since if we don't change the session, this means we were already the leader and return -EPERM. One last thing: There is also daemonize(), and some people might want to get a notification in that case. Since daemonize() is only needed if a user space does kernel_thread this does not look important (and there seems to be no consensus if this connector should be called in daemonize). If we really want this, we can add proc_sid_connector to daemonize() in an additional patch (Scott?) Signed-off-by: Christian Borntraeger Cc: Scott James Remnant Cc: Matt Helsley Cc: David S. Miller Acked-by: Oleg Nesterov Acked-by: Evgeniy Polyakov Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +--- kernel/sys.c | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index e61891f..f7864ac 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -359,10 +359,8 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) { + if (task_session(curr) != pid) change_pid(curr, PIDTYPE_SID, pid); - proc_sid_connector(curr); - } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); diff --git a/kernel/sys.c b/kernel/sys.c index 255475d..1828f8d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid) err = session; out: write_unlock_irq(&tasklist_lock); + if (err > 0) + proc_sid_connector(group_leader); return err; } -- cgit v1.1 From 478988d3b28e98a31e0cfe24e011e28ba0f3cf0d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 26 Oct 2009 16:49:36 -0700 Subject: cgroup: fix strstrip() misuse cgroup_write_X64() and cgroup_write_string() ignore the return value of strstrip(). it makes small inconsistent behavior. example: ========================= # cd /mnt/cgroup/hoge # cat memory.swappiness 60 # echo "59 " > memory.swappiness # cat memory.swappiness 59 # echo " 58" > memory.swappiness bash: echo: write error: Invalid argument This patch fixes it. Cc: Li Zefan Acked-by: Paul Menage Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ca83b73..0249f4b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); if (cft->write_u64) { - u64 val = simple_strtoull(buffer, &end, 0); + u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_u64(cgrp, cft, val); } else { - s64 val = simple_strtoll(buffer, &end, 0); + s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; retval = cft->write_s64(cgrp, cft, val); @@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - strstrip(buffer); - retval = cft->write_string(cgrp, cft, buffer); + retval = cft->write_string(cgrp, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: -- cgit v1.1 From 8c85dd8730bfb696e691145335f884c7baef8277 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 26 Oct 2009 16:50:07 -0700 Subject: sysctl: fix false positives when PROC_SYSCTL=n Having ->procname but not ->proc_handler is valid when PROC_SYSCTL=n, people use such combination to reduce ifdefs with non-standard handlers. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=14408 Signed-off-by: Alexey Dobriyan Reported-by: Peter Teoh Cc: "Eric W. Biederman" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl_check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423c..b6e7aae 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->ctl_name && table->strategy) set_fail(&fail, table, "Strategy without ctl_name"); #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); #endif -- cgit v1.1 From 76b57e613f6006ff525a17876c89326d127cadc9 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 7 Oct 2009 22:37:35 +0200 Subject: PM / Hibernate: Fix blkdev refleaks While cruising through the swsusp code I found few blkdev reference leaks of resume_bdev. swsusp_read: remove blkdev_put altogether. Some fail paths do not do that. swsusp_check: make sure we always put a reference on fail paths software_resume: all fail paths between swsusp_check and swsusp_read omit swsusp_close. Add it in those cases. And since swsusp_read doesn't drop the reference anymore, do it here unconditionally. [rjw: Fixed a small coding style issue.] Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 11 ++++++++--- kernel/power/swap.c | 8 ++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 04b3a83..04a9e90 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,21 +693,22 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { error = -EBUSY; + swsusp_close(FMODE_READ); goto Unlock; } pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto Finish; + goto close_finish; error = usermodehelper_disable(); if (error) - goto Finish; + goto close_finish; error = create_basic_memory_bitmaps(); if (error) - goto Finish; + goto close_finish; pr_debug("PM: Preparing processes for restore.\n"); error = prepare_processes(); @@ -719,6 +720,7 @@ static int software_resume(void) pr_debug("PM: Reading hibernation image.\n"); error = swsusp_read(&flags); + swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); @@ -737,6 +739,9 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Resume from disk failed.\n"); return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; } late_initcall(software_resume); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b101cdc..a438862 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -572,8 +572,6 @@ int swsusp_read(unsigned int *flags_p) error = load_image(&handle, &snapshot, header->pages - 1); release_swap_reader(&handle); - blkdev_put(resume_bdev, FMODE_READ); - if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -596,7 +594,7 @@ int swsusp_check(void) error = bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) - return error; + goto put; if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); @@ -604,8 +602,10 @@ int swsusp_check(void) error = bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { - return -EINVAL; + error = -EINVAL; } + +put: if (error) blkdev_put(resume_bdev, FMODE_READ); else -- cgit v1.1 From 4ff277f9e42fa16314045bd124a61519286094c0 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 28 Oct 2009 22:55:33 +0100 Subject: PM / Hibernate: Fix error handling in save_image() There are too many retval variables in save_image(). Thus error return value from snapshot_read_next() may be ignored and only part of the snapshot (successfully) written. Remove 'error' variable, invert the condition in the do-while loop and convert the loop to use only 'ret' variable. Switch the rest of the function to consider only 'ret'. Also make sure we end printed line by \n if an error occurs. Signed-off-by: Jiri Slaby Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a438862..afa052b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle, { unsigned int m; int ret; - int error = 0; int nr_pages; int err2; struct bio *bio; @@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle, nr_pages = 0; bio = NULL; do_gettimeofday(&start); - do { + while (1) { ret = snapshot_read_next(snapshot, PAGE_SIZE); - if (ret > 0) { - error = swap_write_page(handle, data_of(*snapshot), - &bio); - if (error) - break; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; - } - } while (ret > 0); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } err2 = wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) + if (!ret) + ret = err2; + if (!ret) printk("\b\b\b\bdone\n"); + else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - return error; + return ret; } /** -- cgit v1.1 From bf9fd67a0328d56eff6022f80d4eb88ba6614119 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 28 Oct 2009 22:55:42 +0100 Subject: PM / Hibernate: Add newline to load_image() fail path Finish a line by \n when load_image fails in the middle of loading. Signed-off-by: Jiri Slaby Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/swap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index afa052b..890f6b1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle, snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) error = -ENODATA; - } + } else + printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); return error; } -- cgit v1.1 From 1d510750941a53a1d3049c1d33c75d6dfcd78618 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 3 Nov 2009 10:11:14 +0000 Subject: Correct nr_processes() when CPUs have been unplugged nr_processes() returns the sum of the per cpu counter process_counts for all online CPUs. This counter is incremented for the current CPU on fork() and decremented for the current CPU on exit(). Since a process does not necessarily fork and exit on the same CPU the process_count for an individual CPU can be either positive or negative and effectively has no meaning in isolation. Therefore calculating the sum of process_counts over only the online CPUs omits the processes which were started or stopped on any CPU which has since been unplugged. Only the sum of process_counts across all possible CPUs has meaning. The only caller of nr_processes() is proc_root_getattr() which calculates the number of links to /proc as stat->nlink = proc_root.nlink + nr_processes(); You don't have to be all that unlucky for the nr_processes() to return a negative value leading to a negative number of links (or rather, an apparently enormous number of links). If this happens then you can get failures where things like "ls /proc" start to fail because they got an -EOVERFLOW from some stat() call. Example with some debugging inserted to show what goes on: # ps haux|wc -l nr_processes: CPU0: 90 nr_processes: CPU1: 1030 nr_processes: CPU2: -900 nr_processes: CPU3: -136 nr_processes: TOTAL: 84 proc_root_getattr. nlink 12 + nr_processes() 84 = 96 84 # echo 0 >/sys/devices/system/cpu/cpu1/online # ps haux|wc -l nr_processes: CPU0: 85 nr_processes: CPU2: -901 nr_processes: CPU3: -137 nr_processes: TOTAL: -953 proc_root_getattr. nlink 12 + nr_processes() -953 = -941 75 # stat /proc/ nr_processes: CPU0: 84 nr_processes: CPU2: -901 nr_processes: CPU3: -137 nr_processes: TOTAL: -954 proc_root_getattr. nlink 12 + nr_processes() -954 = -942 File: `/proc/' Size: 0 Blocks: 0 IO Block: 1024 directory Device: 3h/3d Inode: 1 Links: 4294966354 Access: (0555/dr-xr-xr-x) Uid: ( 0/ root) Gid: ( 0/ root) Access: 2009-11-03 09:06:55.000000000 +0000 Modify: 2009-11-03 09:06:55.000000000 +0000 Change: 2009-11-03 09:06:55.000000000 +0000 I'm not 100% convinced that the per_cpu regions remain valid for offline CPUs, although my testing suggests that they do. If not then I think the correct solution would be to aggregate the process_count for a given CPU into a global base value in cpu_down(). This bug appears to pre-date the transition to git and it looks like it may even have been present in linux-2.6.0-test7-bk3 since it looks like the code Rusty patched in http://lwn.net/Articles/64773/ was already wrong. Signed-off-by: Ian Campbell Cc: Andrew Morton Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 4c20fff..166b8c4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,7 +91,7 @@ int nr_processes(void) int cpu; int total = 0; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; -- cgit v1.1