From 6f6f21eceec3a60d2066bb3b5e31dc14f9168fa7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 10 May 2012 13:01:45 -0700 Subject: namespaces, pid_ns: fix leakage on fork() failure commit 5e2bf0142231194d36fdc9596b36a261ed2b9fe7 upstream. Fork() failure post namespace creation for a child cloned with CLONE_NEWPID leaks pid_namespace/mnt_cache due to proc being mounted during creation, but not unmounted during cleanup. Call pid_ns_release_proc() during cleanup. Signed-off-by: Mike Galbraith Acked-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Cyrill Gorcunov Cc: Louis Rilling Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a4e453b..4712e3e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -1378,6 +1379,8 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: + if (unlikely(clone_flags & CLONE_NEWPID)) + pid_ns_release_proc(p->nsproxy->pid_ns); exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { -- cgit v1.1 From 5c17daa89308cc028fe336af58cf9d4e4d83298d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 10 May 2012 10:04:36 -0300 Subject: compat: Fix RT signal mask corruption via sigprocmask commit b7dafa0ef3145c31d7753be0a08b3cbda51f0209 upstream. compat_sys_sigprocmask reads a smaller signal mask from userspace than sigprogmask accepts for setting. So the high word of blocked.sig[0] will be cleared, releasing any potentially blocked RT signal. This was discovered via userspace code that relies on get/setcontext. glibc's i386 versions of those functions use sigprogmask instead of rt_sigprogmask to save/restore signal mask and caused RT signal unblocking this way. As suggested by Linus, this replaces the sys_sigprocmask based compat version with one that open-codes the required logic, including the merge of the existing blocked set with the new one provided on SIG_SETMASK. Signed-off-by: Jan Kiszka Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/compat.c | 63 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index fc9eb093..3507c93 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -318,25 +318,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) #ifdef __ARCH_WANT_SYS_SIGPROCMASK -asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, - compat_old_sigset_t __user *oset) +/* + * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the + * blocked set of signals to the supplied signal set + */ +static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) { - old_sigset_t s; - long ret; - mm_segment_t old_fs; + memcpy(blocked->sig, &set, sizeof(set)); +} - if (set && get_user(s, set)) - return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_sigprocmask(how, - set ? (old_sigset_t __user *) &s : NULL, - oset ? 
(old_sigset_t __user *) &s : NULL); - set_fs(old_fs); - if (ret == 0) - if (oset) - ret = put_user(s, oset); - return ret; +asmlinkage long compat_sys_sigprocmask(int how, + compat_old_sigset_t __user *nset, + compat_old_sigset_t __user *oset) +{ + old_sigset_t old_set, new_set; + sigset_t new_blocked; + + old_set = current->blocked.sig[0]; + + if (nset) { + if (get_user(new_set, nset)) + return -EFAULT; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + + new_blocked = current->blocked; + + switch (how) { + case SIG_BLOCK: + sigaddsetmask(&new_blocked, new_set); + break; + case SIG_UNBLOCK: + sigdelsetmask(&new_blocked, new_set); + break; + case SIG_SETMASK: + compat_sig_setmask(&new_blocked, new_set); + break; + default: + return -EINVAL; + } + + set_current_blocked(&new_blocked); + } + + if (oset) { + if (put_user(old_set, oset)) + return -EFAULT; + } + + return 0; } #endif -- cgit v1.1 From 935055856458a05c43f518bf9ed406f67c090f0a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 14 May 2012 15:04:50 -0700 Subject: workqueue: skip nr_running sanity check in worker_enter_idle() if trustee is active commit 544ecf310f0e7f51fa057ac2a295fc1b3b35a9d3 upstream. worker_enter_idle() has WARN_ON_ONCE() which triggers if nr_running isn't zero when every worker is idle. This can trigger spuriously while a cpu is going down due to the way trustee sets %WORKER_ROGUE and zaps nr_running. It first sets %WORKER_ROGUE on all workers without updating nr_running, releases gcwq->lock, schedules, regrabs gcwq->lock and then zaps nr_running. If the last running worker enters idle inbetween, it would see stale nr_running which hasn't been zapped yet and trigger the WARN_ON_ONCE(). Fix it by performing the sanity check iff the trustee is idle. Signed-off-by: Tejun Heo Reported-by: "Paul E. McKenney" Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1456dab..ee1845b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1214,8 +1214,13 @@ static void worker_enter_idle(struct worker *worker) } else wake_up_all(&gcwq->trustee_wait); - /* sanity check nr_running */ - WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + /* + * Sanity check nr_running. Because trustee releases gcwq->lock + * between setting %WORKER_ROGUE and zapping nr_running, the + * warning may trigger spuriously. Check iff trustee is idle. + */ + WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + gcwq->nr_workers == gcwq->nr_idle && atomic_read(get_gcwq_nr_running(gcwq->cpu))); } -- cgit v1.1 From 34b1a9eb1db3de73460f3a21d8ffb48ea7cbd338 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 5 Jun 2012 13:44:36 -0500 Subject: sched: Fix the relax_domain_level boot parameter commit a841f8cef4bb124f0f5563314d0beaf2e1249d72 upstream. It does not get processed because sched_domain_level_max is 0 at the time that setup_relax_domain_level() is run. Simply accept the value as it is, as we don't know the value of sched_domain_level_max until sched domain construction is completed. Fix sched_relax_domain_level in cpuset. The build_sched_domain() routine calls the set_domain_attribute() routine prior to setting the sd->level, however, the set_domain_attribute() routine relies on the sd->level to decide whether idle load balancing will be off/on. 
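
For context, set_domain_attribute() in this era looks roughly like the sketch below (a paraphrase, not the verbatim kernel code; the exact SD_* flags it toggles are an approximation). The request < sd->level comparison is why the call only works once sd->level has been assigned: with sd->level still 0, a non-negative relax level never wins the comparison and the requested relaxation is silently ignored, which the reordered hunk below addresses.

	static void set_domain_attribute(struct sched_domain *sd,
					 struct sched_domain_attr *attr)
	{
		int request;

		if (!attr || attr->relax_domain_level < 0) {
			if (default_relax_domain_level < 0)
				return;
			request = default_relax_domain_level;
		} else {
			request = attr->relax_domain_level;
		}

		if (request < sd->level)
			/* relax this level: switch idle balancing off here */
			sd->flags &= ~(SD_BALANCE_NEWIDLE | SD_WAKE_AFFINE);
		else
			/* keep idle balancing on for this level */
			sd->flags |= (SD_BALANCE_NEWIDLE | SD_WAKE_AFFINE);
	}
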
Signed-off-by: Dimitri Sivanich Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120605184436.GA15668@sgi.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 03dff14..8ef48f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7220,11 +7220,8 @@ int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { - unsigned long val; - - val = simple_strtoul(str, NULL, 0); - if (val < sched_domain_level_max) - default_relax_domain_level = val; + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); return 1; } @@ -7417,7 +7414,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, if (!sd) return child; - set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); if (child) { sd->level = child->level + 1; @@ -7425,6 +7421,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, child->parent = sd; } sd->child = child; + set_domain_attribute(sd, attr); return sd; } -- cgit v1.1 From f437e75ac267fa886bcbe7ab40d8f1ba1de1c865 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 3 May 2012 18:59:52 -0700 Subject: tracing: change CPU ring buffer state from tracing_cpumask commit 71babb2705e2203a64c27ede13ae3508a0d2c16c upstream. According to Documentation/trace/ftrace.txt: tracing_cpumask: This is a mask that lets the user only trace on specified CPUS. The format is a hex string representing the CPUS. The tracing_cpumask currently doesn't affect the tracing state of per-CPU ring buffers. This patch enables/disables CPU recording as its corresponding bit in tracing_cpumask is set/unset. Link: http://lkml.kernel.org/r/1336096792-25373-3-git-send-email-vnagarnaik@google.com Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Laurent Chavey Cc: Justin Teravest Cc: David Sharp Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0731e81a..672a749 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2432,10 +2432,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&global_trace.data[cpu]->disabled); + ring_buffer_record_disable_cpu(global_trace.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&global_trace.data[cpu]->disabled); + ring_buffer_record_enable_cpu(global_trace.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); -- cgit v1.1 From 9c24771f844b6f0708a72cd116953e0a128e5d2a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 17 Jul 2012 13:33:48 -0400 Subject: ntp: Fix leap-second hrtimer livelock This is a backport of 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d This should have been backported when it was commited, but I mistook the problem as requiring the ntp_lock changes that landed in 3.4 in order for it to occur. 
Unfortunately the same issue can happen (with only one cpu) as follows: do_adjtimex() write_seqlock_irq(&xtime_lock); process_adjtimex_modes() process_adj_status() ntp_start_leap_timer() hrtimer_start() hrtimer_reprogram() tick_program_event() clockevents_program_event() ktime_get() seq = req_seqbegin(xtime_lock); [DEADLOCK] This deadlock will no always occur, as it requires the leap_timer to force a hrtimer_reprogram which only happens if its set and there's no sooner timer to expire. NOTE: This patch, being faithful to the original commit, introduces a bug (we don't update wall_to_monotonic), which will be resovled by backporting a following fix. Original commit message below: Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp subsystem has used an hrtimer for triggering the leapsecond adjustment. However, this can cause a potential livelock. Thomas diagnosed this as the following pattern: CPU 0 CPU 1 do_adjtimex() spin_lock_irq(&ntp_lock); process_adjtimex_modes(); timer_interrupt() process_adj_status(); do_timer() ntp_start_leap_timer(); write_lock(&xtime_lock); hrtimer_start(); update_wall_time(); hrtimer_reprogram(); ntp_tick_length() tick_program_event() spin_lock(&ntp_lock); clockevents_program_event() ktime_get() seq = req_seqbegin(xtime_lock); This patch tries to avoid the problem by reverting back to not using an hrtimer to inject leapseconds, and instead we handle the leapsecond processing in the second_overflow() function. The downside to this change is that on systems that support highres timers, the leap second processing will occur on a HZ tick boundary, (ie: ~1-10ms, depending on HZ) after the leap second instead of possibly sooner (~34us in my tests w/ x86_64 lapic). This patch applies on top of tip/timers/core. CC: Sasha Levin CC: Thomas Gleixner Reported-by: Sasha Levin Diagnoised-by: Thomas Gleixner Tested-by: Sasha Levin Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/ntp.c | 122 +++++++++++++++------------------------------- kernel/time/timekeeping.c | 18 +++---- 2 files changed, 47 insertions(+), 93 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4b85a7a..4508f7f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -31,8 +31,6 @@ unsigned long tick_nsec; u64 tick_length; static u64 tick_length_base; -static struct hrtimer leap_timer; - #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -350,60 +348,60 @@ void ntp_clear(void) } /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) +int second_overflow(unsigned long secs) { - enum hrtimer_restart res = HRTIMER_NORESTART; - - write_seqlock(&xtime_lock); + int leap = 0; + s64 delta; + /* + * Leap second processing. 
If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ switch (time_state) { case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; break; case TIME_INS: - timekeeping_leap_insert(-1); - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); - hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); - res = HRTIMER_RESTART; + if (secs % 86400 == 0) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } break; case TIME_DEL: - timekeeping_leap_insert(1); - time_tai--; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + if ((secs + 1) % 86400 == 0) { + leap = 1; + time_tai--; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } break; case TIME_OOP: time_tai++; time_state = TIME_WAIT; - /* fall through */ + break; + case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } - write_sequnlock(&xtime_lock); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ - s64 delta; /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -423,23 +421,25 @@ void second_overflow(void) pps_dec_valid(); if (!time_adjust) - return; + goto out; if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; tick_length += MAX_TICKADJ_SCALED; - return; + goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; - return; + goto out; } tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; +out: + return leap; } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -501,27 +501,6 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ - long now = ts->tv_sec; - - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - - return; - } - - if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } -} /* * Propagate a new txc->status value into the NTP state: @@ -546,22 +525,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; - switch (time_state) { - case TIME_OK: - ntp_start_leap_timer(ts); - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - ntp_start_leap_timer(ts); - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } } /* * Called with the xtime lock held, so we can access and modify @@ -643,9 +606,6 @@ int do_adjtimex(struct timex *txc) (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; - - if (txc->modes & ADJ_STATUS && time_state 
!= TIME_OK) - hrtimer_cancel(&leap_timer); } if (txc->modes & ADJ_SETOFFSET) { @@ -967,6 +927,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = ntp_leap_second; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5f45831..c444da0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -169,15 +169,6 @@ static struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -/* must hold xtime_lock */ -void timekeeping_leap_insert(int leapsecond) -{ - xtime.tv_sec += leapsecond; - wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); -} - /** * timekeeping_forward_now - update clock to the current time * @@ -828,9 +819,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; while (timekeeper.xtime_nsec >= nsecps) { + int leap; timekeeper.xtime_nsec -= nsecps; xtime.tv_sec++; - second_overflow(); + leap = second_overflow(xtime.tv_sec); + xtime.tv_sec += leap; } /* Accumulate raw time */ @@ -936,9 +929,12 @@ static void update_wall_time(void) * xtime.tv_nsec isn't larger then NSEC_PER_SEC */ if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { + int leap; xtime.tv_nsec -= NSEC_PER_SEC; xtime.tv_sec++; - second_overflow(); + leap = second_overflow(xtime.tv_sec); + xtime.tv_sec += leap; + } /* check to see if there is a new clocksource to use */ -- cgit v1.1 From 96bab736bad82423c2b312d602689a9078481fa9 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 17 Jul 2012 13:33:49 -0400 Subject: ntp: Correct TAI offset during leap second This is a backport of dd48d708ff3e917f6d6b6c2b696c3f18c019feed When repeating a UTC time value during a leap second (when the UTC time should be 23:59:60), the TAI timescale should not stop. The kernel NTP code increments the TAI offset one second too late. This patch fixes the issue by incrementing the offset during the leap second itself. Signed-off-by: Richard Cochran Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 4508f7f..f1eb182 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -378,6 +378,7 @@ int second_overflow(unsigned long secs) if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; + time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -392,7 +393,6 @@ int second_overflow(unsigned long secs) } break; case TIME_OOP: - time_tai++; time_state = TIME_WAIT; break; -- cgit v1.1 From c33f2424c3941986d402c81d380d4e805870a20f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 17 Jul 2012 13:33:50 -0400 Subject: timekeeping: Fix CLOCK_MONOTONIC inconsistency during leapsecond This is a backport of fad0c66c4bb836d57a5f125ecd38bed653ca863a which resolves a bug the previous commit. Commit 6b43ae8a61 (ntp: Fix leap-second hrtimer livelock) broke the leapsecond update of CLOCK_MONOTONIC. The missing leapsecond update to wall_to_monotonic causes discontinuities in CLOCK_MONOTONIC. Adjust wall_to_monotonic when NTP inserted a leapsecond. 
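
The reason the leap has to touch both variables is that CLOCK_MONOTONIC is derived from their sum. A simplified sketch of the 3.0-era composition (locking and nanosecond handling condensed; treat the exact shape as approximate):

	unsigned long seq;
	s64 secs, nsecs;

	do {
		seq   = read_seqbegin(&xtime_lock);
		/* monotonic = realtime + wall_to_monotonic, so a leap
		 * applied only to xtime.tv_sec steps CLOCK_MONOTONIC by
		 * one second unless wall_to_monotonic absorbs it with
		 * the opposite sign. */
		secs  = xtime.tv_sec  + wall_to_monotonic.tv_sec;
		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec
					+ timekeeping_get_ns();
	} while (read_seqretry(&xtime_lock, seq));
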
Reported-by: Richard Cochran Signed-off-by: John Stultz Tested-by: Richard Cochran Link: http://lkml.kernel.org/r/1338400497-12420-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c444da0..ac5b225 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -824,6 +824,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) xtime.tv_sec++; leap = second_overflow(xtime.tv_sec); xtime.tv_sec += leap; + wall_to_monotonic.tv_sec -= leap; } /* Accumulate raw time */ @@ -934,7 +935,7 @@ static void update_wall_time(void) xtime.tv_sec++; leap = second_overflow(xtime.tv_sec); xtime.tv_sec += leap; - + wall_to_monotonic.tv_sec -= leap; } /* check to see if there is a new clocksource to use */ -- cgit v1.1 From c7e2580578671c4d19a1a83e6fdb2482cc136283 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Jul 2012 13:33:51 -0400 Subject: time: Move common updates to a function This is a backport of cc06268c6a87db156af2daed6e96a936b955cc82 While not a bugfix itself, it allows following fixes to backport in a more straightforward manner. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ac5b225..0b582eb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -166,6 +166,19 @@ static struct timespec total_sleep_time; */ static struct timespec raw_time; +/* must hold write on xtime_lock */ +static void timekeeping_update(bool clearntp) +{ + if (clearntp) { + timekeeper.ntp_error = 0; + ntp_clear(); + } + update_vsyscall(&xtime, &wall_to_monotonic, + timekeeper.clock, timekeeper.mult); +} + + + /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -366,11 +379,7 @@ int do_settimeofday(const struct timespec *tv) xtime = *tv; - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(true); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -403,11 +412,7 @@ int timekeeping_inject_offset(struct timespec *ts) xtime = timespec_add(xtime, *ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(true); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -630,10 +635,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(delta); - timekeeper.ntp_error = 0; - ntp_clear(); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(true); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -938,9 +940,7 @@ static void update_wall_time(void) wall_to_monotonic.tv_sec -= leap; } - /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + timekeeping_update(false); } /** -- cgit 
v1.1 From 62b787f886e2d96cc7c5428aeee05dbe32a9531b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 17 Jul 2012 13:33:52 -0400 Subject: hrtimer: Provide clock_was_set_delayed() This is a backport of f55a6faa384304c89cfef162768e88374d3312cb clock_was_set() cannot be called from hard interrupt context because it calls on_each_cpu(). For fixing the widely reported leap seconds issue it is necessary to call it from hard interrupt context, i.e. the timer tick code, which does the timekeeping updates. Provide a new function which denotes it in the hrtimer cpu base structure of the cpu on which it is called and raise the hrtimer softirq. We then execute the clock_was_set() notificiation from softirq context in run_hrtimer_softirq(). The hrtimer softirq is rarely used, so polling the flag there is not a performance issue. [ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get rid of all this ifdeffery ASAP ] Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/hrtimer.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2043c08..a256bab 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -717,6 +717,19 @@ static int hrtimer_switch_to_hres(void) return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1395,6 +1408,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } -- cgit v1.1 From d21e4baf4523fec26e3c70cb78b013ad3b245c83 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 17 Jul 2012 13:33:53 -0400 Subject: timekeeping: Fix leapsecond triggered load spike issue This is a backport of 4873fa070ae84a4115f0b3c9dfabc224f1bc7c51 The timekeeping code misses an update of the hrtimer subsystem after a leap second happened. Due to that timers based on CLOCK_REALTIME are either expiring a second early or late depending on whether a leap second has been inserted or deleted until an operation is initiated which causes that update. Unless the update happens by some other means this discrepancy between the timekeeping and the hrtimer data stays forever and timers are expired either early or late. The reported immediate workaround - $ data -s "`date`" - is causing a call to clock_was_set() which updates the hrtimer data structures. See: http://www.sheeri.com/content/mysql-and-leap-second-high-cpu-and-fix Add the missing clock_was_set() call to update_wall_time() in case of a leap second event. The actual update is deferred to softirq context as the necessary smp function call cannot be invoked from hard interrupt context. 
Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1341960205-56738-3-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0b582eb..9201474 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -827,6 +827,8 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) leap = second_overflow(xtime.tv_sec); xtime.tv_sec += leap; wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } /* Accumulate raw time */ @@ -938,6 +940,8 @@ static void update_wall_time(void) leap = second_overflow(xtime.tv_sec); xtime.tv_sec += leap; wall_to_monotonic.tv_sec -= leap; + if (leap) + clock_was_set_delayed(); } timekeeping_update(false); -- cgit v1.1 From 03a90b9a6f7eec70edde4eb1f88fa8a5c058d85e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Jul 2012 13:33:54 -0400 Subject: timekeeping: Maintain ktime_t based offsets for hrtimers This is a backport of 5b9fe759a678e05be4937ddf03d50e950207c1c0 We need to update the hrtimer clock offsets from the hrtimer interrupt context. To avoid conversions from timespec to ktime_t maintain a ktime_t based representation of those offsets in the timekeeper. This puts the conversion overhead into the code which updates the underlying offsets and provides fast accessible values in the hrtimer interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1341960205-56738-4-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9201474..d647773 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -161,18 +161,34 @@ static struct timespec xtime __attribute__ ((aligned (16))); static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; +/* Offset clock monotonic -> clock realtime */ +static ktime_t offs_real; + +/* Offset clock monotonic -> clock boottime */ +static ktime_t offs_boot; + /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
*/ static struct timespec raw_time; /* must hold write on xtime_lock */ +static void update_rt_offset(void) +{ + struct timespec tmp, *wtm = &wall_to_monotonic; + + set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec); + offs_real = timespec_to_ktime(tmp); +} + +/* must hold write on xtime_lock */ static void timekeeping_update(bool clearntp) { if (clearntp) { timekeeper.ntp_error = 0; ntp_clear(); } + update_rt_offset(); update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -587,6 +603,7 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); + update_rt_offset(); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -595,6 +612,12 @@ void __init timekeeping_init(void) /* time in seconds when suspend began */ static struct timespec timekeeping_suspend_time; +static void update_sleep_time(struct timespec t) +{ + total_sleep_time = t; + offs_boot = timespec_to_ktime(t); +} + /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval * @delta: pointer to a timespec delta value @@ -606,7 +629,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) { xtime = timespec_add(xtime, *delta); wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); - total_sleep_time = timespec_add(total_sleep_time, *delta); + update_sleep_time(timespec_add(total_sleep_time, *delta)); } -- cgit v1.1 From 6c89f2ce05ea7e26a7580ad9eb950f2c4f10891b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Jul 2012 13:33:55 -0400 Subject: hrtimers: Move lock held region in hrtimer_interrupt() This is a backport of 196951e91262fccda81147d2bcf7fdab08668b40 We need to update the base offsets from this code and we need to do that under base->lock. Move the lock held region around the ktime_get() calls. The ktime_get() calls are going to be replaced with a function which gets the time and the offsets atomically. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-6-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/hrtimer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index a256bab..8fc6533 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1263,11 +1263,10 @@ void hrtimer_interrupt(struct clock_event_device *dev) cpu_base->nr_events++; dev->next_event.tv64 = KTIME_MAX; + raw_spin_lock(&cpu_base->lock); entry_time = now = ktime_get(); retry: expires_next.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1344,6 +1343,7 @@ retry: * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. 
*/ + raw_spin_lock(&cpu_base->lock); now = ktime_get(); cpu_base->nr_retries++; if (++retries < 3) @@ -1356,6 +1356,7 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; + raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); if (delta.tv64 > cpu_base->max_hang_time.tv64) cpu_base->max_hang_time = delta; -- cgit v1.1 From 22f4bbcfb131e2392c78ad67af35fdd436d4dd54 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Jul 2012 13:33:56 -0400 Subject: timekeeping: Provide hrtimer update function This is a backport of f6c06abfb3972ad4914cef57d8348fcb2932bc3b To finally fix the infamous leap second issue and other race windows caused by functions which change the offsets between the various time bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a function which atomically gets the current monotonic time and updates the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic overhead. The previous patch which provides ktime_t offsets allows us to make this function almost as cheap as ktime_get() which is going to be replaced in hrtimer_interrupt(). Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d647773..f0c7565 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1126,6 +1126,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&xtime_lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @real: pointer to storage for monotonic -> realtime offset + * @_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *real, ktime_t *boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + secs = xtime.tv_sec; + nsecs = xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *real = offs_real; + *boot = offs_boot; + } while (read_seqretry(&xtime_lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ -- cgit v1.1 From bb6ed34f2a6eeb40608b8ca91f3ec90ec9dca26f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 17 Jul 2012 13:33:57 -0400 Subject: hrtimer: Update hrtimer base offsets each hrtimer_interrupt This is a backport of 5baefd6d84163443215f4a99f6a20f054ef11236 The update of the hrtimer base offsets on all cpus cannot be made atomically from the timekeeper.lock held and interrupt disabled region as smp function calls are not allowed there. clock_was_set(), which enforces the update on all cpus, is called either from preemptible process context in case of do_settimeofday() or from the softirq context when the offset modification happened in the timer interrupt itself due to a leap second. 
In both cases there is a race window for an hrtimer interrupt between dropping timekeeper lock, enabling interrupts and clock_was_set() issuing the updates. Any interrupt which arrives in that window will see the new time but operate on stale offsets. So we need to make sure that an hrtimer interrupt always sees a consistent state of time and offsets. ktime_get_update_offsets() allows us to get the current monotonic time and update the per cpu hrtimer base offsets from hrtimer_interrupt() to capture a consistent state of monotonic time and the offsets. The function replaces the existing ktime_get() calls in hrtimer_interrupt(). The overhead of the new function vs. ktime_get() is minimal as it just adds two store operations. This ensures that any changes to realtime or boottime offsets are noticed and stored into the per-cpu hrtimer base structures, prior to any hrtimer expiration and guarantees that timers are not expired early. Signed-off-by: John Stultz Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Link: http://lkml.kernel.org/r/1341960205-56738-8-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/hrtimer.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 8fc6533..957869f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -657,6 +657,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + + return ktime_get_update_offsets(offs_real, offs_boot); +} + /* * Retrigger next event is called after clock was set * @@ -665,22 +673,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); - struct timespec realtime_offset, xtim, wtm, sleep; if (!hrtimer_hres_active()) return; - /* Optimized out for !HIGH_RES */ - get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - /* Adjust CLOCK_REALTIME offset */ raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - + hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } @@ -710,7 +708,6 @@ static int hrtimer_switch_to_hres(void) base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); - /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); @@ -1264,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; raw_spin_lock(&cpu_base->lock); - entry_time = now = ktime_get(); + entry_time = now = hrtimer_update_base(cpu_base); retry: expires_next.tv64 = KTIME_MAX; /* @@ -1342,9 +1339,12 @@ retry: * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. + * + * Acquire base lock for updating the offsets and retrieving + * the current time. 
*/ raw_spin_lock(&cpu_base->lock); - now = ktime_get(); + now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; -- cgit v1.1 From 0851978b661f25192ff763289698f3175b1bab42 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Jul 2012 13:33:58 -0400 Subject: timekeeping: Add missing update call in timekeeping_resume() This is a backport of 3e997130bd2e8c6f5aaa49d6e3161d4d29b43ab0 The leap second rework unearthed another issue of inconsistent data. On timekeeping_resume() the timekeeper data is updated, but nothing calls timekeeping_update(), so now the update code in the timer interrupt sees stale values. This has been the case before those changes, but then the timer interrupt was using stale data as well so this went unnoticed for quite some time. Add the missing update call, so all the data is consistent everywhere. Reported-by: Andreas Schwab Reported-and-tested-by: "Rafael J. Wysocki" Reported-and-tested-by: Martin Steigerwald Cc: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra , Cc: Prarit Bhargava Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f0c7565..678ae31 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -693,6 +693,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; + timekeeping_update(false); write_sequnlock_irqrestore(&xtime_lock, flags); touch_softlockup_watchdog(); -- cgit v1.1 From dccecc646f06f06db8c32fc6615fee847852cec6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 13 Jul 2012 01:21:50 -0400 Subject: ntp: Fix STA_INS/DEL clearing bug commit 6b1859dba01c7d512b72d77e3fd7da8354235189 upstream. In commit 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d, I introduced a bug that kept the STA_INS or STA_DEL bit from being cleared from time_status via adjtimex() without forcing STA_PLL first. Usually once the STA_INS is set, it isn't cleared until the leap second is applied, so its unlikely this affected anyone. However during testing I noticed it took some effort to cancel a leap second once STA_INS was set. 
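
A hypothetical userspace snippet (not part of the patch) showing the operation that was affected: clearing STA_INS through ADJ_STATUS. Before this fix, second_overflow() stayed in TIME_INS and the leap second was still applied unless STA_PLL was forced first; with the fix below, TIME_INS/TIME_DEL drop back to TIME_OK as soon as the corresponding status bit is gone.

	#include <sys/timex.h>

	/* Illustrative only: a real tool would read-modify-write
	 * txc.status rather than zero every STA_* bit. */
	int cancel_pending_leap(void)
	{
		struct timex txc = {
			.modes  = ADJ_STATUS,
			.status = 0,	/* clears STA_INS / STA_DEL */
		};

		return adjtimex(&txc);
	}
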
Signed-off-by: John Stultz Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Richard Cochran Cc: Prarit Bhargava Link: http://lkml.kernel.org/r/1342156917-25092-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/time/ntp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f1eb182..61fc450 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -375,7 +375,9 @@ int second_overflow(unsigned long secs) time_state = TIME_DEL; break; case TIME_INS: - if (secs % 86400 == 0) { + if (!(time_status & STA_INS)) + time_state = TIME_OK; + else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; time_tai++; @@ -384,7 +386,9 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if ((secs + 1) % 86400 == 0) { + if (!(time_status & STA_DEL)) + time_state = TIME_OK; + else if ((secs + 1) % 86400 == 0) { leap = 1; time_tai--; time_state = TIME_WAIT; -- cgit v1.1 From 6b63ea81d831b2b6f2ce6d60cfcfa05b1858ad97 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 Nov 2011 13:38:39 -0700 Subject: cpusets: avoid looping when storing to mems_allowed if one node remains set commit 89e8a244b97e48f1f30e898b6f32acca477f2a13 upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. {get,put}_mems_allowed() exist so that general kernel code may locklessly access a task's set of allowable nodes without having the chance that a concurrent write will cause the nodemask to be empty on configurations where MAX_NUMNODES > BITS_PER_LONG. This could incur a significant delay, however, especially in low memory conditions because the page allocator is blocking and reclaim requires get_mems_allowed() itself. It is not atypical to see writes to cpuset.mems take over 2 seconds to complete, for example. In low memory conditions, this is problematic because it's one of the most imporant times to change cpuset.mems in the first place! The only way a task's set of allowable nodes may change is through cpusets by writing to cpuset.mems and when attaching a task to a generic code is not reading the nodemask with get_mems_allowed() at the same time, and then clearing all the old nodes. This prevents the possibility that a reader will see an empty nodemask at the same time the writer is storing a new nodemask. If at least one node remains unchanged, though, it's possible to simply set all new nodes and then clear all the old nodes. Changing a task's nodemask is protected by cgroup_mutex so it's guaranteed that two threads are not changing the same task's nodemask at the same time, so the nodemask is guaranteed to be stored before another thread changes it and determines whether a node remains set or not. 
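
A worked example of the set-then-clear ordering, mirroring the cpuset_change_task_nodemask() sequence this commit modifies: with old = {0,1} and new = {1,2}, node 1 stays set throughout, so a lockless reader can observe {0,1}, then {0,1,2}, then {1,2}, but never an empty mask.

	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);	/* {0,1} -> {0,1,2} */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
	/* ... waiting for readers is only needed when old and new are disjoint ... */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;					/* {0,1,2} -> {1,2} */
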
Signed-off-by: David Rientjes Cc: Miao Xie Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9c9b754..a995893 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { + bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + repeat: /* * Allow tasks that have access to memory reserves because they have @@ -963,7 +965,6 @@ repeat: nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* * ensure checking ->mems_allowed_change_disable after setting all new * allowed nodes. @@ -980,9 +981,11 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. + * for the read-side. No wait is necessary, however, if at least one + * node remains unchanged. */ - while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (masks_disjoint && + ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.1 From ba204b545c29176659273c5316afb147eec332a3 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 19 Dec 2011 17:11:52 -0800 Subject: cpusets: stall when updating mems_allowed for mempolicy or disjoint nodemask commit b246272ecc5ac68c743b15c9e41a2275f7ce70e2 upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. Kernels where MAX_NUMNODES > BITS_PER_LONG may temporarily see an empty nodemask in a tsk's mempolicy if its previous nodemask is remapped onto a new set of allowed cpuset nodes where the two nodemasks, as a result of the remap, are now disjoint. c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") adds get_mems_allowed() to prevent the set of allowed nodes from changing for a thread. This causes any update to a set of allowed nodes to stall until put_mems_allowed() is called. This stall is unncessary, however, if at least one node remains unchanged in the update to the set of allowed nodes. This was addressed by 89e8a244b97e ("cpusets: avoid looping when storing to mems_allowed if one node remains set"), but it's still possible that an empty nodemask may be read from a mempolicy because the old nodemask may be remapped to the new nodemask during rebind. To prevent this, only avoid the stall if there is no mempolicy for the thread being changed. This is a temporary solution until all reads from mempolicy nodemasks can be guaranteed to not be empty without the get_mems_allowed() synchronization. Also moves the check for nodemask intersection inside task_lock() so that tsk->mems_allowed cannot change. This ensures that nothing can set this tsk's mems_allowed out from under us and also protects tsk->mempolicy. 
Reported-by: Miao Xie Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: Paul Menage Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a995893..28d0bbd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } +#ifdef CONFIG_NUMA +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return task->mempolicy; +} +#else +static inline bool task_has_mempolicy(struct task_struct *task) +{ + return false; +} +#endif + + /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, @@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed); + bool need_loop; repeat: /* @@ -962,6 +975,14 @@ repeat: return; task_lock(tsk); + /* + * Determine if a loop is necessary if another thread is doing + * get_mems_allowed(). If at least one node remains unchanged and + * tsk does not have a mempolicy, then an empty nodemask will not be + * possible when mems_allowed is larger than a word. + */ + need_loop = task_has_mempolicy(tsk) || + !nodes_intersects(*newmems, tsk->mems_allowed); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); @@ -981,11 +1002,9 @@ repeat: /* * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. No wait is necessary, however, if at least one - * node remains unchanged. + * for the read-side. */ - while (masks_disjoint && - ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { task_unlock(tsk); if (!task_curr(tsk)) yield(); -- cgit v1.1 From 627c5c60b4ac673e9f4be758858073071684dce9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 commit cc9a6c8776615f9c194ccf0b63a0aa5628235545 upstream. Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely expensive and severely impacted page allocator performance. This is part of a series of patches that reduce page allocator overhead. Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. 
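
For reference, and anticipating the retry logic discussed next, the read-side pattern this converts allocators to looks roughly like the sketch below. It corresponds to the mm/ half of the upstream commit, which this kernel/-only excerpt does not show, so the shape is approximate:

	unsigned int cpuset_mems_cookie;
	struct page *page;

	do {
		/* read_seqcount_begin() on current->mems_allowed_seq */
		cpuset_mems_cookie = get_mems_allowed();
		page = __alloc_pages_nodemask(gfp_mask, order,
					      zonelist, nodemask);
		/* retry only on a possible false failure: the mask changed
		 * underneath us and the allocation came back empty */
	} while (!put_mems_allowed(cpuset_mems_cookie) && !page);
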
The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Mel Gorman Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 43 ++++++++----------------------------------- kernel/fork.c | 3 +++ 2 files changed, 11 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28d0bbd..b2e84bd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. 
and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index 4712e3e..3d42aa3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -985,6 +985,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) #ifdef CONFIG_CGROUPS init_rwsem(&sig->threadgroup_fork_lock); #endif +#ifdef CONFIG_CPUSETS + seqcount_init(&tsk->mems_allowed_seq); +#endif sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; -- cgit v1.1 From 31b1c0850709c77ece1690a8fbc043829717d14c Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 16 Jun 2012 15:30:45 +0200 Subject: ftrace: Disable function tracing during suspend/resume and hibernation, again commit 443772d408a25af62498793f6f805ce3c559309a upstream. If function tracing is enabled for some of the low-level suspend/resume functions, it leads to triple fault during resume from suspend, ultimately ending up in a reboot instead of a resume (or a total refusal to come out of suspended state, on some machines). This issue was explained in more detail in commit f42ac38c59e0a03d (ftrace: disable tracing for suspend to ram). However, the changes made by that commit got reverted by commit cbe2f5a6e84eebb (tracing: allow tracing of suspend/resume & hibernation code again). So, unfortunately since things are not yet robust enough to allow tracing of low-level suspend/resume functions, suspend/resume is still broken when ftrace is enabled. So fix this by disabling function tracing during suspend/resume & hibernation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/power/hibernate.c | 6 ++++++ kernel/power/suspend.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8884c27..32f1590 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -344,6 +344,7 @@ int hibernation_snapshot(int platform_mode) goto Complete_devices; suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE); if (error) @@ -369,6 +370,7 @@ int hibernation_snapshot(int platform_mode) if (error || !in_suspend) pm_restore_gfp_mask(); + ftrace_start(); resume_console(); Complete_devices: @@ -471,6 +473,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); + ftrace_stop(); pm_restrict_gfp_mask(); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { @@ -478,6 +481,7 @@ int hibernation_restore(int platform_mode) dpm_resume_end(PMSG_RECOVER); } pm_restore_gfp_mask(); + ftrace_start(); resume_console(); pm_restore_console(); return error; @@ -504,6 +508,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + ftrace_stop(); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -547,6 +552,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + ftrace_start(); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 449ccc9..e40d205 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "power.h" @@ -210,6 +211,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -226,6 +228,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + ftrace_start(); resume_console(); Close: if (suspend_ops->end) -- cgit v1.1 From b1c7ba1bab7363fee6dc5d4ee5be4e916adcf691 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 17 Jul 2012 12:39:26 -0700 Subject: workqueue: perform cpu down operations from low priority cpu_notifier() commit 6575820221f7a4dd6eadecf7bf83cdd154335eda upstream. Currently, all workqueue cpu hotplug operations run off CPU_PRI_WORKQUEUE which is higher than normal notifiers. This is to ensure that workqueue is up and running while bringing up a CPU before other notifiers try to use workqueue on the CPU. Per-cpu workqueues are supposed to remain working and bound to the CPU for normal CPU_DOWN_PREPARE notifiers. This holds mostly true even with workqueue offlining running with higher priority because workqueue CPU_DOWN_PREPARE only creates a bound trustee thread which runs the per-cpu workqueue without concurrency management without explicitly detaching the existing workers. However, if the trustee needs to create new workers, it creates unbound workers which may wander off to other CPUs while CPU_DOWN_PREPARE notifiers are in progress. Furthermore, if the CPU down is cancelled, the per-CPU workqueue may end up with workers which aren't bound to the CPU. 
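The guarantee at stake here is the one normal-priority notifiers rely on. As a hedged, driver-side sketch only (the work item, callback and their names are invented for illustration and are not part of this patch), a CPU_DOWN_PREPARE notifier expects work queued for the dying CPU to still run on that CPU, which is what keeping the workqueue down operations at low priority preserves:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/workqueue.h>

static void example_drain(struct work_struct *work)
{
        /* per-cpu housekeeping that must run on the CPU going down */
}
static DECLARE_WORK(example_drain_work, example_drain);

/* Hypothetical normal-priority hotplug notifier: during CPU_DOWN_PREPARE it
 * assumes the dying CPU's workqueue is still up and its workers still bound. */
static int example_cpu_callback(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;

        if ((action & ~CPU_TASKS_FROZEN) == CPU_DOWN_PREPARE) {
                schedule_work_on(cpu, &example_drain_work);
                flush_work(&example_drain_work); /* expected to have run on @cpu */
        }
        return NOTIFY_OK;
}
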
While reliably reproducible with a convoluted artificial test-case involving scheduling and flushing CPU burning work items from CPU down notifiers, this isn't very likely to happen in the wild, and, even when it happens, the effects are likely to be hidden by the following successful CPU down. Fix it by using different priorities for up and down notifiers - high priority for up operations and low priority for down operations. Workqueue cpu hotplug operations will soon go through further cleanup. Signed-off-by: Tejun Heo Acked-by: "Rafael J. Wysocki" Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ee1845b..e88c924 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3561,6 +3561,41 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, return notifier_from_errno(0); } +/* + * Workqueues should be brought up before normal priority CPU notifiers. + * This will be registered high priority CPU notifier. + */ +static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DOWN_FAILED: + case CPU_ONLINE: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_POST_DEAD: + return workqueue_cpu_callback(nfb, action, hcpu); + } + return NOTIFY_OK; +} + #ifdef CONFIG_SMP struct work_for_cpu { @@ -3754,7 +3789,8 @@ static int __init init_workqueues(void) unsigned int cpu; int i; - cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); + cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize gcwqs */ for_each_gcwq_cpu(cpu) { -- cgit v1.1 From bc16cc3950aed91b84c9a7071af2c72abee91660 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:29 -0700 Subject: futex: Test for pi_mutex on fault in futex_wait_requeue_pi() commit b6070a8d9853eda010a549fa9a09eb8d7269b929 upstream. If fixup_pi_state_owner() faults, pi_mutex may be NULL. Test for pi_mutex != NULL before testing the owner against current and possibly unlocking it. Signed-off-by: Darren Hart Cc: Dave Jones Cc: Dan Carpenter Link: http://lkml.kernel.org/r/dc59890338fc413606f04e5c5b131530734dae3d.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 11e8924..260e1d6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2370,7 +2370,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * fault, unlock the rt_mutex and return the fault to userspace. 
*/ if (ret == -EFAULT) { - if (rt_mutex_owner(pi_mutex) == current) + if (pi_mutex && rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { /* -- cgit v1.1 From 7367fdb4987147438ca8cd69d6ae021ffc646c57 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:30 -0700 Subject: futex: Fix bug in WARN_ON for NULL q.pi_state commit f27071cb7fe3e1d37a9dbe6c0dfc5395cd40fa43 upstream. The WARN_ON in futex_wait_requeue_pi() for a NULL q.pi_state was testing the address (&q.pi_state) of the pointer instead of the value (q.pi_state) of the pointer. Correct it accordingly. Signed-off-by: Darren Hart Cc: Dave Jones Link: http://lkml.kernel.org/r/1c85d97f6e5f79ec389a4ead3e367363c74bd09a.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 260e1d6..ca30821 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2343,7 +2343,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * signal. futex_unlock_pi() will not destroy the lock_ptr nor * the pi_state. */ - WARN_ON(!&q.pi_state); + WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); debug_rt_mutex_free_waiter(&rt_waiter); -- cgit v1.1 From b7a06be61b78c66860897ee2a0f9e23845f1f438 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 20 Jul 2012 11:53:31 -0700 Subject: futex: Forbid uaddr == uaddr2 in futex_wait_requeue_pi() commit 6f7b0a2a5c0fb03be7c25bd1745baa50582348ef upstream. If uaddr == uaddr2, then we have broken the rule of only requeueing from a non-pi futex to a pi futex with this call. If we attempt this, as the trinity test suite manages to do, we miss early wakeups as q.key is equal to key2 (because they are the same uaddr). We will then attempt to dereference the pi_mutex (which would exist had the futex_q been properly requeued to a pi futex) and trigger a NULL pointer dereference. Signed-off-by: Darren Hart Cc: Dave Jones Link: http://lkml.kernel.org/r/ad82bfe7f7d130247fbe2b5b4275654807774227.1342809673.git.dvhart@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ca30821..24bc59c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to - * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and - * complete the acquisition of the rt_mutex prior to returning to userspace. - * This ensures the rt_mutex maintains an owner when it has waiters; without - * one, the pi logic wouldn't know which task to boost/deboost, if there was a - * need to. + * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake + * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to + * userspace. This ensures the rt_mutex maintains an owner when it has waiters; + * without one, the pi logic would not know which task to boost/deboost, if + * there was a need to. 
* * We call schedule in futex_wait_queue_me() when we enqueue and return there * via the following: @@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct futex_q q = futex_q_init; int res, ret; + if (uaddr == uaddr2) + return -EINVAL; + if (!bitset) return -EINVAL; -- cgit v1.1 From aa88dea2270f685349ab7b92169600452fe73b62 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Jul 2012 07:52:16 -0400 Subject: random: make 'add_interrupt_randomness()' do something sane commit 775f4b297b780601e61787b766f306ed3e1d23eb upstream. We've been moving away from add_interrupt_randomness() for various reasons: it's too expensive to do on every interrupt, and flooding the CPU with interrupts could theoretically cause bogus floods of entropy from a somewhat externally controllable source. This solves both problems by limiting the actual randomness addition to just once a second or after 64 interrupts, whichever comes first. During that time, the interrupt cycle data is buffered up in a per-cpu pool. Also, we make sure that the nonblocking pool used by urandom is initialized before we start feeding the normal input pool. This assures that /dev/urandom is returning unpredictable data as soon as possible. (Based on an original patch by Linus, but significantly modified by tytso.) Tested-by: Eric Wustrow Reported-by: Eric Wustrow Reported-by: Nadia Heninger Reported-by: Zakir Durumeric Reported-by: J. Alex Halderman Signed-off-by: Linus Torvalds Signed-off-by: "Theodore Ts'o" Signed-off-by: Greg Kroah-Hartman --- kernel/irq/handle.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c..10e0772 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -117,7 +117,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) { irqreturn_t retval = IRQ_NONE; - unsigned int random = 0, irq = desc->irq_data.irq; + unsigned int flags = 0, irq = desc->irq_data.irq; do { irqreturn_t res; @@ -145,7 +145,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) /* Fall through to add to randomness */ case IRQ_HANDLED: - random |= action->flags; + flags |= action->flags; break; default: @@ -156,8 +156,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) action = action->next; } while (action); - if (random & IRQF_SAMPLE_RANDOM) - add_interrupt_randomness(irq); + add_interrupt_randomness(irq, flags); if (!noirqdebug) note_interrupt(irq, desc, retval); -- cgit v1.1 From b6b847a93be87fc9974d8232984668a5a59754df Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 14 Jul 2012 20:27:52 -0400 Subject: random: remove rand_initialize_irq() commit c5857ccf293968348e5eb4ebedc68074de3dcda6 upstream. With the new interrupt sampling system, we are no longer using the timer_rand_state structure in the irq descriptor, so we can stop initializing it now.
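The new interrupt sampling system referred to here is the once-a-second / 64-interrupt batching introduced by the previous patch. For orientation only, the idea looks roughly like the sketch below; the structure and function names are invented for illustration and this is not the actual drivers/char/random.c code.

#include <linux/jiffies.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* Per-cpu buffer for interrupt cycle data (illustrative sketch). */
struct irq_fast_pool_sketch {
        u32             pool[4];
        unsigned long   last;   /* jiffies of the last flush to the input pool */
        unsigned short  count;  /* interrupts buffered since that flush */
};

static DEFINE_PER_CPU(struct irq_fast_pool_sketch, irq_pool_sketch);

/* Called from the interrupt path; cheap in the common case. */
static void add_interrupt_randomness_sketch(int irq, u32 cycles)
{
        struct irq_fast_pool_sketch *fp = &__get_cpu_var(irq_pool_sketch);

        fp->pool[fp->count & 3] ^= cycles ^ irq;        /* just buffer the sample */
        fp->count++;

        /* Flush only once a second or after 64 interrupts, whichever comes first. */
        if (fp->count < 64 && !time_after(jiffies, fp->last + HZ))
                return;

        fp->last = jiffies;
        fp->count = 0;
        /* ...here the real code mixes the buffered data into the input pool with
         * a small entropy credit, once the nonblocking pool has been initialized. */
}
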
[ Merged in fixes from Sedat to find some last missing references to rand_initialize_irq() ] Signed-off-by: "Theodore Ts'o" Signed-off-by: Sedat Dilek Signed-off-by: Greg Kroah-Hartman --- kernel/irq/manage.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index df8136f..fa4a70e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -886,22 +886,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; - /* - * Some drivers like serial.c use request_irq() heavily, - * so we have to be careful not to interfere with a - * running system. - */ - if (new->flags & IRQF_SAMPLE_RANDOM) { - /* - * This function might sleep, we want to call it first, - * outside of the atomic block. - * Yes, this might clear the entropy pool if the wrong - * driver is attempted to be loaded, without actually - * installing a new handler, but is this really a problem, - * only the sysadmin is able to do this. - */ - rand_initialize_irq(irq); - } /* * Check whether the interrupt nests into another interrupt @@ -1325,7 +1309,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * */ -- cgit v1.1 From 56e4562bb32d8a47485a9b8b868cb28fda595170 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: don't free_chunk() after fsnotify_add_mark() commit 0fe33aae0e94b4097dd433c9399e16e17d638cd8 upstream. Don't do free_chunk() after fsnotify_add_mark(). That one does a delayed unref via the destroy list and this results in use-after-free. Signed-off-by: Miklos Szeredi Acked-by: Eric Paris Signed-off-by: Greg Kroah-Hartman --- kernel/audit_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e99dda0..0cf6a0a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -265,7 +265,7 @@ static void untag_chunk(struct node *p) fsnotify_duplicate_mark(&new->mark, entry); if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { - free_chunk(new); + fsnotify_put_mark(&new->mark); goto Fallback; } @@ -328,7 +328,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) entry = &chunk->mark; if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { - free_chunk(chunk); + fsnotify_put_mark(entry); return -ENOSPC; } @@ -402,7 +402,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) fsnotify_duplicate_mark(chunk_entry, old_entry); if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { spin_unlock(&old_entry->lock); - free_chunk(chunk); + fsnotify_put_mark(chunk_entry); fsnotify_put_mark(old_entry); return -ENOSPC; } -- cgit v1.1 From 830cd761e4a2d5b31326e5669f59170d8b25e5ed Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 15 Aug 2012 12:55:22 +0200 Subject: audit: fix refcounting in audit-tree commit a2140fc0cb0325bb6384e788edd27b9a568714e2 upstream. Refcounting of fsnotify_mark in audit tree is broken. 
E.g:
                              refcount
create_chunk
  alloc_chunk                 1
  fsnotify_add_mark           2

untag_chunk
  fsnotify_get_mark           3
  fsnotify_destroy_mark
    audit_tree_freeing_mark   2
  fsnotify_put_mark           1
  fsnotify_put_mark           0
  via destroy_list
    fsnotify_mark_destroy    -1

This was reported by various people as triggering Oops when stopping auditd. We could just remove the put_mark from audit_tree_freeing_mark() but that would break freeing via inode destruction. So this patch simply omits a put_mark after calling destroy_mark or adds a get_mark before. The additional get_mark is necessary where there's no other put_mark after fsnotify_destroy_mark() since it assumes that the caller is holding a reference (or the inode is keeping the mark pinned, not the case here AFAICS). Signed-off-by: Miklos Szeredi Reported-by: Valentin Avram Reported-by: Peter Moody Acked-by: Eric Paris Signed-off-by: Greg Kroah-Hartman --- kernel/audit_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 0cf6a0a..f6b4ac7 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -256,7 +256,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; } @@ -299,7 +298,6 @@ static void untag_chunk(struct node *p) spin_unlock(&hash_lock); spin_unlock(&entry->lock); fsnotify_destroy_mark(entry); - fsnotify_put_mark(entry); goto out; Fallback: @@ -338,6 +336,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&hash_lock); chunk->dead = 1; spin_unlock(&entry->lock); + fsnotify_get_mark(entry); fsnotify_destroy_mark(entry); fsnotify_put_mark(entry); return 0; @@ -418,6 +417,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&chunk_entry->lock); spin_unlock(&old_entry->lock); + fsnotify_get_mark(chunk_entry); fsnotify_destroy_mark(chunk_entry); fsnotify_put_mark(chunk_entry); @@ -451,7 +451,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) spin_unlock(&old_entry->lock); fsnotify_destroy_mark(old_entry); fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ - fsnotify_put_mark(old_entry); /* and kill it */ return 0; } -- cgit v1.1 From 3d45db6b5158a7f09f8be651577416ef6a4dcbd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Sep 2012 12:48:43 -0700 Subject: workqueue: reimplement work_on_cpu() using system_wq commit ed48ece27cd3d5ee0354c32bbaec0f3e1d4715c3 upstream. The existing work_on_cpu() implementation is hugely inefficient. It creates a new kthread, executes that single function and then lets the kthread die on each invocation. Now that system_wq can handle concurrent executions, there's no advantage of doing this. Reimplement work_on_cpu() using system_wq which makes it simpler and way more efficient. stable: While this isn't a fix in itself, it's needed to fix a workqueue related bug in cpufreq/powernow-k8. AFAICS, this shouldn't break other existing users. Signed-off-by: Tejun Heo Acked-by: Jiri Kosina Cc: Linus Torvalds Cc: Bjorn Helgaas Cc: Len Brown Cc: Rafael J.
Wysocki Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e88c924..ebd9639 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3599,18 +3599,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, #ifdef CONFIG_SMP struct work_for_cpu { - struct completion completion; + struct work_struct work; long (*fn)(void *); void *arg; long ret; }; -static int do_work_for_cpu(void *_wfc) +static void work_for_cpu_fn(struct work_struct *work) { - struct work_for_cpu *wfc = _wfc; + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + wfc->ret = wfc->fn(wfc->arg); - complete(&wfc->completion); - return 0; } /** @@ -3625,19 +3624,11 @@ static int do_work_for_cpu(void *_wfc) */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct task_struct *sub_thread; - struct work_for_cpu wfc = { - .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), - .fn = fn, - .arg = arg, - }; + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); - if (IS_ERR(sub_thread)) - return PTR_ERR(sub_thread); - kthread_bind(sub_thread, cpu); - wake_up_process(sub_thread); - wait_for_completion(&wfc.completion); + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); -- cgit v1.1 From 71f08eb07187bb598e7da95d46458478846efc72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 20 Aug 2012 14:59:25 +0100 Subject: perf_event: Switch to internal refcount, fix race with close() commit a6fa941d94b411bbd2b6421ffbde6db3c93e65ab upstream. Don't mess with file refcounts (or keep a reference to file, for that matter) in perf_event. Use explicit refcount of its own instead. Deal with the race between the final reference to event going away and new children getting created for it by use of atomic_long_inc_not_zero() in inherit_event(); just have the latter free what it had allocated and return NULL, that works out just fine (children of siblings of something doomed are created as singletons, same as if the child of leader had been created and immediately killed). Signed-off-by: Al Viro Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120820135925.GG23464@ZenIV.linux.org.uk Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 62 ++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 32a6151..7b344be 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2969,12 +2969,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. 
*/ -static int perf_release(struct inode *inode, struct file *file) +static void put_event(struct perf_event *event) { - struct perf_event *event = file->private_data; struct task_struct *owner; - file->private_data = NULL; + if (!atomic_long_dec_and_test(&event->refcount)) + return; rcu_read_lock(); owner = ACCESS_ONCE(event->owner); @@ -3009,7 +3009,13 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(owner); } - return perf_event_release_kernel(event); + perf_event_release_kernel(event); +} + +static int perf_release(struct inode *inode, struct file *file) +{ + put_event(file->private_data); + return 0; } u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) @@ -3241,7 +3247,7 @@ unlock: static const struct file_operations perf_fops; -static struct perf_event *perf_fget_light(int fd, int *fput_needed) +static struct file *perf_fget_light(int fd, int *fput_needed) { struct file *file; @@ -3255,7 +3261,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed) return ERR_PTR(-EBADF); } - return file->private_data; + return file; } static int perf_event_set_output(struct perf_event *event, @@ -3287,19 +3293,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_SET_OUTPUT: { + struct file *output_file = NULL; struct perf_event *output_event = NULL; int fput_needed = 0; int ret; if (arg != -1) { - output_event = perf_fget_light(arg, &fput_needed); - if (IS_ERR(output_event)) - return PTR_ERR(output_event); + output_file = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_file)) + return PTR_ERR(output_file); + output_event = output_file->private_data; } ret = perf_event_set_output(event, output_event); if (output_event) - fput_light(output_event->filp, fput_needed); + fput_light(output_file, fput_needed); return ret; } @@ -6181,6 +6189,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, mutex_init(&event->mmap_mutex); + atomic_long_set(&event->refcount, 1); event->cpu = cpu; event->attr = *attr; event->group_leader = group_leader; @@ -6455,12 +6464,12 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; if (group_fd != -1) { - group_leader = perf_fget_light(group_fd, &fput_needed); - if (IS_ERR(group_leader)) { - err = PTR_ERR(group_leader); + group_file = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_file)) { + err = PTR_ERR(group_file); goto err_fd; } - group_file = group_leader->filp; + group_leader = group_file->private_data; if (flags & PERF_FLAG_FD_OUTPUT) output_event = group_leader; if (flags & PERF_FLAG_FD_NO_GROUP) @@ -6594,7 +6603,6 @@ SYSCALL_DEFINE5(perf_event_open, put_ctx(gctx); } - event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); @@ -6682,7 +6690,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_free; } - event->filp = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); perf_install_in_context(ctx, event, cpu); @@ -6731,7 +6738,7 @@ static void sync_child_event(struct perf_event *child_event, * Release the parent event, if this was the last * reference to it. 
*/ - fput(parent_event->filp); + put_event(parent_event); } static void @@ -6807,9 +6814,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * * __perf_event_exit_task() * sync_child_event() - * fput(parent_event->filp) - * perf_release() - * mutex_lock(&ctx->mutex) + * put_event() + * mutex_lock(&ctx->mutex) * * But since its the parent context it won't be the same instance. */ @@ -6877,7 +6883,7 @@ static void perf_free_event(struct perf_event *event, list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); - fput(parent->filp); + put_event(parent); perf_group_detach(event); list_del_event(event, ctx); @@ -6957,6 +6963,12 @@ inherit_event(struct perf_event *parent_event, NULL); if (IS_ERR(child_event)) return child_event; + + if (!atomic_long_inc_not_zero(&parent_event->refcount)) { + free_event(child_event); + return NULL; + } + get_ctx(child_ctx); /* @@ -6996,14 +7008,6 @@ inherit_event(struct perf_event *parent_event, raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* * Link this into the parent event's child list */ WARN_ON_ONCE(parent_event->ctx->parent_ctx); -- cgit v1.1 From bc713e264ea9badd399aa34f208154a64e4e1b3b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 2 Sep 2012 00:28:19 +0800 Subject: workqueue: UNBOUND -> REBIND morphing in rebind_workers() should be atomic commit 96e65306b81351b656835c15931d1d237b252f27 upstream. The compiler may compile the following code into TWO write/modify instructions. worker->flags &= ~WORKER_UNBOUND; worker->flags |= WORKER_REBIND; so the other CPU may temporarily see worker->flags which doesn't have either WORKER_UNBOUND or WORKER_REBIND set and perform local wakeup prematurely. Fix it by using single explicit assignment via ACCESS_ONCE(). Because idle workers have another WORKER_NOT_RUNNING flag, this bug doesn't exist for them; however, update it to use the same pattern for consistency. tj: Applied the change to idle workers too and updated comments and patch description a bit. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ebd9639..00c0bad 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3412,14 +3412,17 @@ static int __cpuinit trustee_thread(void *__gcwq) for_each_busy_worker(worker, i, pos, gcwq) { struct work_struct *rebind_work = &worker->rebind_work; + unsigned long worker_flags = worker->flags; /* * Rebind_work may race with future cpu hotplug * operations. Use a separate flag to mark that - * rebinding is scheduled. + * rebinding is scheduled. The morphing should + * be atomic. 
*/ - worker->flags |= WORKER_REBIND; - worker->flags &= ~WORKER_ROGUE; + worker_flags |= WORKER_REBIND; + worker_flags &= ~WORKER_ROGUE; + ACCESS_ONCE(worker->flags) = worker_flags; /* queue rebind_work, wq doesn't matter, use the default one */ if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, -- cgit v1.1 From 45516ddc16abc923104d78bb3eb772ac0a09e33e Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Tue, 24 Jul 2012 15:02:49 -0700 Subject: Fix a dead loop in async_synchronize_full() [Fixed upstream by commits 2955b47d2c1983998a8c5915cb96884e67f7cb53 and a4683487f90bfe3049686fc5c566bdc1ad03ace6 from Dan Williams, but they are much more intrusive than this tiny fix, according to Andrew - gregkh] This patch tries to fix a dead loop in async_synchronize_full(), which could be seen when preemption is disabled on a single cpu machine. void async_synchronize_full(void) { do { async_synchronize_cookie(next_cookie); } while (!list_empty(&async_running) || ! list_empty(&async_pending)); } async_synchronize_cookie() calls async_synchronize_cookie_domain() with &async_running as the default domain to synchronize. However, there might be some works in the async_pending list from other domains. On a single cpu system, without preemption, there is no chance for the other works to finish, so async_synchronize_full() enters a dead loop. It seems async_synchronize_full() wants to synchronize all entries in all running lists(domains), so maybe we could just check the entry_count to know whether all works are finished. Currently, async_synchronize_cookie_domain() expects a non-NULL running list ( if NULL, there would be NULL pointer dereference ), so maybe a NULL pointer could be used as an indication for the functions to synchronize all works in all domains. Reported-by: Paul E. McKenney Signed-off-by: Li Zhong Tested-by: Paul E. 
McKenney Tested-by: Christian Kujau Cc: Andrew Morton Cc: Dan Williams Cc: Christian Kujau Cc: Andrew Morton Cc: Cong Wang Signed-off-by: Greg Kroah-Hartman --- kernel/async.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index cd9dbb9..04f66e3 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -87,6 +87,13 @@ static async_cookie_t __lowest_in_progress(struct list_head *running) { struct async_entry *entry; + if (!running) { /* just check the entry count */ + if (atomic_read(&entry_count)) + return 0; /* smaller than any cookie */ + else + return next_cookie; + } + if (!list_empty(running)) { entry = list_first_entry(running, struct async_entry, list); @@ -236,9 +243,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain); */ void async_synchronize_full(void) { - do { - async_synchronize_cookie(next_cookie); - } while (!list_empty(&async_running) || !list_empty(&async_pending)); + async_synchronize_cookie_domain(next_cookie, NULL); } EXPORT_SYMBOL_GPL(async_synchronize_full); @@ -258,7 +263,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); /** * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing * @cookie: async_cookie_t to use as checkpoint - * @running: running list to synchronize on + * @running: running list to synchronize on, NULL indicates all lists * * This function waits until all asynchronous function calls for the * synchronization domain specified by the running list @list submitted -- cgit v1.1 From 64ac72f81b1b41819dab596d1524bd5cae4813fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 13:36:05 +0200 Subject: sched: Fix race in task_group() commit 8323f26ce3425460769605a6aece7a174edaa7d1 upstream. Stefan reported a crash on a kernel before a3e5d1091c1 ("sched: Don't call task_group() too many times in set_task_rq()"), he found the reason to be that the multiple task_group() invocations in set_task_rq() returned different values. Looking at all that I found a lack of serialization and plain wrong comments. The below tries to fix it using an extra pointer which is updated under the appropriate scheduler locks. Its not pretty, but I can't really see another way given how all the cgroup stuff works. Reported-and-tested-by: Stefan Bader Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1340364965.18025.71.camel@twins Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8ef48f0..7484c92 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -605,22 +605,19 @@ static inline int cpu_of(struct rq *rq) /* * Return the group to which this tasks belongs. * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. + * We cannot use task_subsys_state() and friends because the cgroup + * subsystem changes that value before the cgroup_subsys::attach() method + * is called, therefore we cannot pin it and might observe the wrong value. + * + * The same is true for autogroup's p->signal->autogroup->tg, the autogroup + * core changes this before calling sched_move_task(). 
+ * + * Instead we use a 'copy' which is updated from sched_move_task() while + * holding both task_struct::pi_lock and rq::lock. */ static inline struct task_group *task_group(struct task_struct *p) { - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); + return p->sched_task_group; } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ @@ -2206,7 +2203,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * sched_move_task() holds both and thus holding either pins the cgroup, - * see set_task_rq(). + * see task_group(). * * Furthermore, all task_rq users should acquire both locks, see * task_rq_lock(). @@ -8545,6 +8542,7 @@ void sched_destroy_group(struct task_group *tg) */ void sched_move_task(struct task_struct *tsk) { + struct task_group *tg; int on_rq, running; unsigned long flags; struct rq *rq; @@ -8559,6 +8557,12 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); + tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + lockdep_is_held(&tsk->sighand->siglock)), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_move_group) tsk->sched_class->task_move_group(tsk, on_rq); -- cgit v1.1 From 44fa9a0111168832510f8add6d589e73eac6793d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 20:49:51 -0400 Subject: time: Improve sanity checking of timekeeping inputs commit 4e8b14526ca7fb046a81c94002c1c43b6fdf0e9b upstream Unexpected behavior could occur if the time is set to a value large enough to overflow a 64bit ktime_t (which is something larger then the year 2262). Also unexpected behavior could occur if large negative offsets are injected via adjtimex. So this patch improves the sanity check timekeeping inputs by improving the timespec_valid() check, and then makes better use of timespec_valid() to make sure we don't set the time to an invalid negative value or one that overflows ktime_t. Note: This does not protect from setting the time close to overflowing ktime_t and then letting natural accumulation cause the overflow. 
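For orientation, the strengthened sanity check described above amounts to something like the sketch below. This is an approximation based on the commit description rather than the exact patch; KTIME_SEC_MAX is assumed here to denote the largest number of seconds representable in a 64bit ktime_t.

#include <linux/ktime.h>
#include <linux/time.h>

/* Sketch: reject negative times, out-of-range nanoseconds, and values
 * large enough to overflow a 64bit ktime_t (roughly the year 2262). */
static bool timespec_valid_sketch(const struct timespec *ts)
{
        if (ts->tv_sec < 0)                     /* dates before 1970 are bogus */
                return false;
        if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
                return false;
        if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX)
                return false;                   /* would overflow ktime_t */
        return true;
}

The timekeeping.c hunks below then apply this kind of check in do_settimeofday(), timekeeping_inject_offset() and timekeeping_init(), returning -EINVAL or falling back to sane defaults instead of accepting a bogus value.
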
Reported-by: CAI Qian Reported-by: Sasha Levin Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Zhouping Liu Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1344454580-17031-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 678ae31..bfa6be73 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -382,7 +382,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta; unsigned long flags; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if (!timespec_valid(tv)) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); @@ -417,6 +417,8 @@ EXPORT_SYMBOL(do_settimeofday); int timekeeping_inject_offset(struct timespec *ts) { unsigned long flags; + struct timespec tmp; + int ret = 0; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; @@ -425,9 +427,16 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(); + tmp = timespec_add(xtime, *ts); + if (!timespec_valid(&tmp)) { + ret = -EINVAL; + goto error; + } + xtime = timespec_add(xtime, *ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); +error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(true); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -435,7 +444,7 @@ int timekeeping_inject_offset(struct timespec *ts) /* signal hrtimers about time change */ clock_was_set(); - return 0; + return ret; } EXPORT_SYMBOL(timekeeping_inject_offset); @@ -582,7 +591,20 @@ void __init timekeeping_init(void) struct timespec now, boot; read_persistent_clock(&now); + if (!timespec_valid(&now)) { + pr_warn("WARNING: Persistent clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + now.tv_sec = 0; + now.tv_nsec = 0; + } + read_boot_clock(&boot); + if (!timespec_valid(&boot)) { + pr_warn("WARNING: Boot clock returned invalid value!\n" + " Check your CMOS/BIOS settings.\n"); + boot.tv_sec = 0; + boot.tv_nsec = 0; + } write_seqlock_irqsave(&xtime_lock, flags); -- cgit v1.1 From 4ffa9a8069801e36e2aceed5a77482b8b0841757 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 20:49:52 -0400 Subject: time: Avoid making adjustments if we haven't accumulated anything commit bf2ac312195155511a0f79325515cbb61929898a upstream If update_wall_time() is called and the current offset isn't large enough to accumulate, avoid re-calling timekeeping_adjust which may change the clock freq and can cause 1ns inconsistencies with CLOCK_REALTIME_COARSE/CLOCK_MONOTONIC_COARSE. 
Signed-off-by: John Stultz Cc: Prarit Bhargava Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1345595449-34965-5-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index bfa6be73..3bbaf2d0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -919,6 +919,10 @@ static void update_wall_time(void) #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif + /* Check if there's really nothing to do */ + if (offset < timekeeper.cycle_interval) + return; + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; /* -- cgit v1.1 From fe979e2c0aa6e5b9157c3b381b43de2ca6965d7e Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 11 Sep 2012 20:49:53 -0400 Subject: time: Move ktime_t overflow checking into timespec_valid_strict commit cee58483cf56e0ba355fdd97ff5e8925329aa936 upstream Andreas Bombe reported that the added ktime_t overflow checking added to timespec_valid in commit 4e8b14526ca7 ("time: Improve sanity checking of timekeeping inputs") was causing problems with X.org because it caused timeouts larger then KTIME_T to be invalid. Previously, these large timeouts would be clamped to KTIME_MAX and would never expire, which is valid. This patch splits the ktime_t overflow checking into a new timespec_valid_strict function, and converts the timekeeping codes internal checking to use this more strict function. Reported-and-tested-by: Andreas Bombe Cc: Zhouping Liu Cc: Ingo Molnar Cc: Prarit Bhargava Cc: Thomas Gleixner Signed-off-by: John Stultz Signed-off-by: Linus Torvalds Signed-off-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- kernel/time/timekeeping.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3bbaf2d0..c3cbd8c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -382,7 +382,7 @@ int do_settimeofday(const struct timespec *tv) struct timespec ts_delta; unsigned long flags; - if (!timespec_valid(tv)) + if (!timespec_valid_strict(tv)) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); @@ -428,7 +428,7 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(); tmp = timespec_add(xtime, *ts); - if (!timespec_valid(&tmp)) { + if (!timespec_valid_strict(&tmp)) { ret = -EINVAL; goto error; } @@ -591,7 +591,7 @@ void __init timekeeping_init(void) struct timespec now, boot; read_persistent_clock(&now); - if (!timespec_valid(&now)) { + if (!timespec_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); now.tv_sec = 0; @@ -599,7 +599,7 @@ void __init timekeeping_init(void) } read_boot_clock(&boot); - if (!timespec_valid(&boot)) { + if (!timespec_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); boot.tv_sec = 0; @@ -649,6 +649,12 @@ static void update_sleep_time(struct timespec t) */ static void __timekeeping_inject_sleeptime(struct timespec *delta) { + if (!timespec_valid_strict(delta)) { + printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); + return; + } + xtime = timespec_add(xtime, *delta); wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 
update_sleep_time(timespec_add(total_sleep_time, *delta)); -- cgit v1.1 From ca465bac8c69a79377547f7563c671a530eb977c Mon Sep 17 00:00:00 2001 From: Yasunori Goto Date: Tue, 17 Jan 2012 17:40:31 +0900 Subject: sched: Fix ancient race in do_exit() commit b5740f4b2cb3503b436925eb2242bc3d75cd3dfe upstream. try_to_wake_up() has a problem which may change status from TASK_DEAD to TASK_RUNNING in race condition with SMI or guest environment of virtual machine. As a result, exited task is scheduled() again and panic occurs. Here is the sequence how it occurs: ----------------------------------+----------------------------- | CPU A | CPU B ----------------------------------+----------------------------- TASK A calls exit().... do_exit() exit_mm() down_read(mm->mmap_sem); rwsem_down_failed_common() set TASK_UNINTERRUPTIBLE set waiter.task <= task A list_add to sem->wait_list : raw_spin_unlock_irq() (I/O interruption occured) __rwsem_do_wake(mmap_sem) list_del(&waiter->list); waiter->task = NULL wake_up_process(task A) try_to_wake_up() (task is still TASK_UNINTERRUPTIBLE) p->on_rq is still 1.) ttwu_do_wakeup() (*A) : (I/O interruption handler finished) if (!waiter.task) schedule() is not called due to waiter.task is NULL. tsk->state = TASK_RUNNING : check_preempt_curr(); : task->state = TASK_DEAD (*B) <--- set TASK_RUNNING (*C) schedule() (exit task is running again) BUG_ON() is called! -------------------------------------------------------- The execution time between (*A) and (*B) is usually very short, because the interruption is disabled, and setting TASK_RUNNING at (*C) must be executed before setting TASK_DEAD. HOWEVER, if SMI is interrupted between (*A) and (*B), (*C) is able to execute AFTER setting TASK_DEAD! Then, exited task is scheduled again, and BUG_ON() is called.... If the system works on guest system of virtual machine, the time between (*A) and (*B) may be also long due to scheduling of hypervisor, and same phenomenon can occur. By this patch, do_exit() waits for releasing task->pi_lock which is used in try_to_wake_up(). It guarantees the task becomes TASK_DEAD after waking up. Signed-off-by: Yasunori Goto Acked-by: Oleg Nesterov Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20120117174031.3118.E1E9C6FF@jp.fujitsu.com Signed-off-by: Ingo Molnar Cc: Michal Hocko Signed-off-by: Greg Kroah-Hartman --- kernel/exit.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 303bed2..97dd317 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1049,6 +1049,22 @@ NORET_TYPE void do_exit(long code) preempt_disable(); exit_rcu(); + + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(&tsk->pi_lock); + /* causes final put_task_struct in finish_task_switch(). 
*/ tsk->state = TASK_DEAD; schedule(); -- cgit v1.1 From faaeea39363ad54b3dfe23cc982e484f6e54aa5a Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Thu, 4 Oct 2012 17:12:23 -0700 Subject: kernel/sys.c: call disable_nonboot_cpus() in kernel_restart() commit f96972f2dc6365421cf2366ebd61ee4cf060c8d5 upstream. As kernel_power_off() calls disable_nonboot_cpus(), we may also want to have kernel_restart() call disable_nonboot_cpus(). Doing so can help machines that require boot cpu be the last alive cpu during reboot to survive with kernel restart. This fixes one reboot issue seen on imx6q (Cortex-A9 Quad). The machine requires that the restart routine be run on the primary cpu rather than secondary ones. Otherwise, the secondary core running the restart routine will fail to come to online after reboot. Signed-off-by: Shawn Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sys.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f88dadc..dd29555 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -334,6 +334,7 @@ void kernel_restart_prepare(char *cmd) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + disable_nonboot_cpus(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else -- cgit v1.1 From 21de4eb26ec0b1b9c484da823fbcd1d3a48afec9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 3 Aug 2012 10:30:45 -0700 Subject: workqueue: add missing smp_wmb() in process_one_work() commit 959d1af8cffc8fd38ed53e8be1cf4ab8782f9c00 upstream. WORK_STRUCT_PENDING is used to claim ownership of a work item and process_one_work() releases it before starting execution. When someone else grabs PENDING, all pre-release updates to the work item should be visible and all updates made by the new owner should happen afterwards. Grabbing PENDING uses test_and_set_bit() and thus has a full barrier; however, clearing doesn't have a matching wmb. Given the preceding spin_unlock and use of clear_bit, I don't believe this can be a problem on an actual machine and there hasn't been any related report but it still is theretically possible for clear_pending to permeate upwards and happen before work->entry update. Add an explicit smp_wmb() before work_clear_pending(). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 00c0bad..aef9452 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1868,7 +1868,9 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); + smp_wmb(); /* paired with test_and_set_bit(PENDING) */ work_clear_pending(work); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); -- cgit v1.1 From 3f6ea7b4b5adbb6ee9271d48dd63dd98645e505b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 22 Sep 2012 13:55:30 -0700 Subject: rcu: Fix day-one dyntick-idle stall-warning bug commit a10d206ef1a83121ab7430cb196e0376a7145b22 upstream. Each grace period is supposed to have at least one callback waiting for that grace period to complete. However, if CONFIG_NO_HZ=n, an extra callback-free grace period is no big problem -- it will chew up a tiny bit of CPU time, but it will complete normally. 
In contrast, CONFIG_NO_HZ=y kernels have the potential for all the CPUs to go to sleep indefinitely, in turn indefinitely delaying completion of the callback-free grace period. Given that nothing is waiting on this grace period, this is also not a problem. That is, unless RCU CPU stall warnings are also enabled, as they are in recent kernels. In this case, if a CPU wakes up after at least one minute of inactivity, an RCU CPU stall warning will result. The reason that no one noticed until quite recently is that most systems have enough OS noise that they will never remain absolutely idle for a full minute. But there are some embedded systems with cut-down userspace configurations that consistently get into this situation. All this begs the question of exactly how a callback-free grace period gets started in the first place. This can happen due to the fact that CPUs do not necessarily agree on which grace period is in progress. If a CPU still believes that the grace period that just completed is still ongoing, it will believe that it has callbacks that need to wait for another grace period, never mind the fact that the grace period that they were waiting for just completed. This CPU can therefore erroneously decide to start a new grace period. Note that this can happen in TREE_RCU and TREE_PREEMPT_RCU even on a single-CPU system: Deadlock considerations mean that the CPU that detected the end of the grace period is not necessarily officially informed of this fact for some time. Once this CPU notices that the earlier grace period completed, it will invoke its callbacks. It then won't have any callbacks left. If no other CPU has any callbacks, we now have a callback-free grace period. This commit therefore makes CPUs check more carefully before starting a new grace period. This new check relies on an array of tail pointers into each CPU's list of callbacks. If the CPU is up to date on which grace periods have completed, it checks to see if any callbacks follow the RCU_DONE_TAIL segment, otherwise it checks to see if any callbacks follow the RCU_WAIT_TAIL segment. The reason that this works is that the RCU_WAIT_TAIL segment will be promoted to the RCU_DONE_TAIL segment as soon as the CPU is officially notified that the old grace period has ended. This change is to cpu_needs_another_gp(), which is called in a number of places. The only one that really matters is in rcu_start_gp(), where the root rcu_node structure's ->lock is held, which prevents any other CPU from starting or completing a grace period, so that the comparison that determines whether the CPU is missing the completion of a grace period is stable. Reported-by: Becky Bruce Reported-by: Subodh Nijsure Reported-by: Paul Walmsley Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Paul Walmsley Signed-off-by: Greg Kroah-Hartman --- kernel/rcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba06207..fe7a9b0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -283,7 +283,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) static int cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); + return *rdp->nxttail[RCU_DONE_TAIL + + ACCESS_ONCE(rsp->completed) != rdp->completed] && + !rcu_gp_in_progress(rsp); } /* -- cgit v1.1 From 8f48f1a28ee27909afbba8a3c2c653a15f810c3e Mon Sep 17 00:00:00 2001 From: "Srivatsa S. 
Bhat" Date: Thu, 24 May 2012 19:46:26 +0530 Subject: CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume commit d35be8bab9b0ce44bed4b9453f86ebf64062721e upstream. In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed masks as and when necessary to ensure that the tasks belonging to the cpusets have some place (online CPUs) to run on. And regular CPU hotplug is destructive in the sense that the kernel doesn't remember the original cpuset configurations set by the user, across hotplug operations. However, suspend/resume (which uses CPU hotplug) is a special case in which the kernel has the responsibility to restore the system (during resume), to exactly the same state it was in before suspend. In order to achieve that, do the following: 1. Don't modify cpusets during suspend/resume. At all. In particular, don't move the tasks from one cpuset to another, and don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets during the CPU hotplug operations that are carried out in the suspend/resume path. 2. However, cpusets and sched domains are related. We just want to avoid altering cpusets alone. So, to keep the sched domains updated, build a single sched domain (containing all active cpus) during each of the CPU hotplug operations carried out in s/r path, effectively ignoring the cpusets' cpus_allowed masks. (Since userspace is frozen while doing all this, it will go unnoticed.) 3. During the last CPU online operation during resume, build the sched domains by looking up the (unaltered) cpusets' cpus_allowed masks. That will bring back the system to the same original state as it was in before suspend. Ultimately, this will not only solve the cpuset problem related to suspend resume (ie., restores the cpusets to exactly what it was before suspend, by not touching it at all) but also speeds up suspend/resume because we avoid running cpuset update code for every CPU being offlined/onlined. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com Signed-off-by: Ingo Molnar Signed-off-by: Preeti U Murthy Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 3 +++ kernel/sched.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b2e84bd..6cbe033 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2080,6 +2080,9 @@ static void scan_for_empty_cpusets(struct cpuset *root) * (of no affect) on systems that are actively using CPU hotplug * but making no active use of cpusets. * + * The only exception to this is suspend/resume, where we don't + * modify cpusets at all. + * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_active_mask on each CPU hotplug (cpuhp) event. * diff --git a/kernel/sched.c b/kernel/sched.c index 7484c92..aacd55f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7777,34 +7777,66 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ + /* * Update cpusets according to cpu_active mask. If cpusets are * disabled, cpuset_update_active_cpus() becomes a simple wrapper * around partition_sched_domains(). 
+ * + * If we come here as part of a suspend/resume, don't touch cpusets because we + * want to restore it back to its original state upon resume anyway. */ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { + case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED_FROZEN: + + /* + * num_cpus_frozen tracks how many CPUs are involved in suspend + * resume sequence. As long as this is not the last online + * operation in the resume sequence, just build a single sched + * domain, ignoring cpusets. + */ + num_cpus_frozen--; + if (likely(num_cpus_frozen)) { + partition_sched_domains(1, NULL, NULL); + break; + } + + /* + * This is the last CPU online operation. So fall through and + * restore the original sched domains by considering the + * cpuset configurations. + */ + case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); - return NOTIFY_OK; + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); - return NOTIFY_OK; + break; + case CPU_DOWN_PREPARE_FROZEN: + num_cpus_frozen++; + partition_sched_domains(1, NULL, NULL); + break; default: return NOTIFY_DONE; } + return NOTIFY_OK; } static int update_runtime(struct notifier_block *nfb, -- cgit v1.1 From 17313c04d71395a59d2797f9fa846c94aebcb73c Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 22 Jun 2012 13:49:31 -0400 Subject: module: taint kernel when lve module is loaded commit c99af3752bb52ba3aece5315279a57a477edfaf1 upstream. Cloudlinux have a product called lve that includes a kernel module. This was previously GPLed but is now under a proprietary license, but the module continues to declare MODULE_LICENSE("GPL") and makes use of some EXPORT_SYMBOL_GPL symbols. Forcibly taint it in order to avoid this. Signed-off-by: Matthew Garrett Cc: Alex Lyashkov Signed-off-by: Rusty Russell Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b9d0667..a8bd215 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2605,6 +2605,10 @@ static int check_module_license_and_versions(struct module *mod) if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + /* lve claims to be GPL but upstream won't provide source */ + if (strcmp(mod->name, "lve") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) -- cgit v1.1 From a6a1e89eda1562696a35465a3663b6fcf6ec48da Mon Sep 17 00:00:00 2001 From: "Hildner, Christian" Date: Mon, 8 Oct 2012 15:49:03 +0200 Subject: timers: Fix endless looping between cascade() and internal_add_timer() commit 26cff4e2aa4d666dc6a120ea34336b5057e3e187 upstream. Adding two (or more) timers with large values for "expires" (they have to reside within tv5 in the same list) leads to endless looping between cascade() and internal_add_timer() in case CONFIG_BASE_SMALL is one and jiffies are crossing the value 1 << 18. The bug was introduced between 2.6.11 and 2.6.12 (and survived for quite some time). 
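To see the arithmetic behind this, the sketch below uses the timer wheel constants as kernel/timer.c of this era defines them (TVR_BITS = 6 and TVN_BITS = 4 with CONFIG_BASE_SMALL=1); treat the exact values as illustrative rather than authoritative.

/* Timer wheel index range with CONFIG_BASE_SMALL=1 (illustrative). */
#define SKETCH_TVN_BITS 4
#define SKETCH_TVR_BITS 6
#define SKETCH_MAX_TVAL \
        ((unsigned long)((1ULL << (SKETCH_TVR_BITS + 4 * SKETCH_TVN_BITS)) - 1))
/*
 * SKETCH_MAX_TVAL is (1 << 22) - 1, about 4 million jiffies, far below the
 * old clamp of 0xffffffffUL.  A timeout clamped to 0xffffffffUL therefore
 * indexes back into tv5 itself, so cascade() keeps handing such timers
 * straight back to internal_add_timer() and they never leave tv5.
 */

The fix below clamps to MAX_TVAL instead, which is computed from the same wheel constants and so always fits the wheel.
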
This patch ensures that when cascade() is called timers within tv5 are not added endlessly to their own list again, instead they are added to the next lower tv level tv4 (as expected). Signed-off-by: Christian Hildner Reviewed-by: Jan Kiszka Link: http://lkml.kernel.org/r/98673C87CB31274881CFFE0B65ECC87B0F5FC1963E@DEFTHW99EA4MSX.ww902.siemens.net Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/timer.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 8cff361..27982d9 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64); #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) +#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { struct list_head vec[TVN_SIZE]; @@ -356,11 +357,12 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); } else { int i; - /* If the timeout is larger than 0xffffffff on 64-bit - * architectures then we use the maximum timeout: + /* If the timeout is larger than MAX_TVAL (on 64-bit + * architectures or with CONFIG_BASE_SMALL=1) then we + * use the maximum timeout. */ - if (idx > 0xffffffffUL) { - idx = 0xffffffffUL; + if (idx > MAX_TVAL) { + idx = MAX_TVAL; expires = idx + base->timer_jiffies; } i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; -- cgit v1.1 From 58793e9b6ba06d22ce9cf444d190568157c858c9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Oct 2012 13:56:51 -0700 Subject: kernel/sys.c: fix stack memory content leak via UNAME26 commit 2702b1526c7278c4d65d78de209a465d4de2885e upstream. Calling uname() with the UNAME26 personality set allows a leak of kernel stack contents. This fixes it by defensively calculating the length of copy_to_user() call, making the len argument unsigned, and initializing the stack buffer to zero (now technically unneeded, but hey, overkill). CVE-2012-0957 Reported-by: PaX Team Signed-off-by: Kees Cook Cc: Andi Kleen Cc: PaX Team Cc: Brad Spengler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sys.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index dd29555..46ca435 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1133,15 +1133,16 @@ DECLARE_RWSEM(uts_sem); * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 */ -static int override_release(char __user *release, int len) +static int override_release(char __user *release, size_t len) { int ret = 0; - char buf[65]; if (current->personality & UNAME26) { - char *rest = UTS_RELEASE; + const char *rest = UTS_RELEASE; + char buf[65] = { 0 }; int ndots = 0; unsigned v; + size_t copy; while (*rest) { if (*rest == '.' 
&& ++ndots >= 3) @@ -1151,8 +1152,9 @@ static int override_release(char __user *release, int len) rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - snprintf(buf, len, "2.6.%u%s", v, rest); - ret = copy_to_user(release, buf, len); + copy = min(sizeof(buf), max_t(size_t, 1, len)); + copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, copy + 1); } return ret; } -- cgit v1.1 From 2f3dc85d233fce3345eb1880ec44349d7b290211 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Oct 2012 18:45:53 -0700 Subject: use clamp_t in UNAME26 fix commit 31fd84b95eb211d5db460a1dda85e004800a7b52 upstream. The min/max call needed to have explicit types on some architectures (e.g. mn10300). Use clamp_t instead to avoid the warning: kernel/sys.c: In function 'override_release': kernel/sys.c:1287:10: warning: comparison of distinct pointer types lacks a cast [enabled by default] Reported-by: Fengguang Wu Signed-off-by: Kees Cook Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 46ca435..84e353b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1152,7 +1152,7 @@ static int override_release(char __user *release, size_t len) rest++; } v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; - copy = min(sizeof(buf), max_t(size_t, 1, len)); + copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); } -- cgit v1.1 From f0c76f5fa97e436cf24cb57a4aeceb7c83835704 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 4 Oct 2012 16:37:16 +0900 Subject: cgroup: notify_on_release may not be triggered in some cases commit 1f5320d5972aa50d3e8d2b227b636b370e608359 upstream. notify_on_release must be triggered when the last process in a cgroup is move to another. But if the first(and only) process in a cgroup is moved to another, notify_on_release is not triggered. # mkdir /cgroup/cpu/SRC # mkdir /cgroup/cpu/DST # # echo 1 >/cgroup/cpu/SRC/notify_on_release # echo 1 >/cgroup/cpu/DST/notify_on_release # # sleep 300 & [1] 8629 # # echo 8629 >/cgroup/cpu/SRC/tasks # echo 8629 >/cgroup/cpu/DST/tasks -> notify_on_release for /SRC must be triggered at this point, but it isn't. This is because put_css_set() is called before setting CGRP_RELEASABLE in cgroup_task_migrate(), and is a regression introduce by the commit:74a1166d(cgroups: make procs file writable), which was merged into v3.0. Acked-by: Li Zefan Cc: Ben Blum Signed-off-by: Daisuke Nishimura Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2efce77..69158d5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1800,9 +1800,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * trading it for newcg is protected by cgroup_mutex, we're safe to drop * it here; it will be freed under RCU. */ - put_css_set(oldcg); - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + put_css_set(oldcg); return 0; } -- cgit v1.1 From 2f56580d924c9283292d10567fb7744d9c0fd2b2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Oct 2012 22:29:38 +0200 Subject: futex: Handle futex_pi OWNER_DIED take over correctly commit 59fa6245192159ab5e1e17b8e31f15afa9cff4bf upstream. 
Siddhesh analyzed a failure in the take over of pi futexes in case the owner died and provided a workaround. See: http://sourceware.org/bugzilla/show_bug.cgi?id=14076 The detailed problem analysis shows: Futex F is initialized with PTHREAD_PRIO_INHERIT and PTHREAD_MUTEX_ROBUST_NP attributes. T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. T3 lock_futex_pi(F); --> Succeeds due to the check for F's userspace TID field == 0 --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes T2 via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains real ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space T3 --> observes inconsistent state This problem is independent of UP/SMP, preemptible/non preemptible kernels, or process shared vs. private. The only difference is that certain configurations are more likely to expose it. So as Siddhesh correctly analyzed the following check in futex_lock_pi_atomic() is the culprit: if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { We check the userspace value for a TID value of 0 and take over the futex unconditionally if that's true. AFAICT this check is there as it is correct for a different corner case of futexes: the WAITERS bit became stale. Now the proposed change - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + if (unlikely(ownerdied || + !(curval & (FUTEX_TID_MASK | FUTEX_WAITERS)))) { solves the problem, but it's not obvious why and it wreckages the "stale WAITERS bit" case. What happens is, that due to the WAITERS bit being set (T2 is blocked on that futex) it enforces T3 to go through lookup_pi_state(), which in the above case returns an existing pi_state and therefor forces T3 to legitimately fight with T2 over the ownership of the pi_state (via pi_state->mutex). Probelm solved! Though that does not work for the "WAITERS bit is stale" problem because if lookup_pi_state() does not find existing pi_state it returns -ERSCH (due to TID == 0) which causes futex_lock_pi() to return -ESRCH to user space because the OWNER_DIED bit is not set. Now there is a different solution to that problem. Do not look at the user space value at all and enforce a lookup of possibly available pi_state. If pi_state can be found, then the new incoming locker T3 blocks on that pi_state and legitimately races with T2 to acquire the rt_mutex and the pi_state and therefor the proper ownership of the user space futex. lookup_pi_state() has the correct order of checks. It first tries to find a pi_state associated with the user space futex and only if that fails it checks for futex TID value = 0. If no pi_state is available nothing can create new state at that point because this happens with the hash bucket lock held. So the above scenario changes to: T1 lock_futex_pi(F); T2 lock_futex_pi(F); --> T2 blocks on the futex and creates pi_state which is associated to T1. T1 exits --> exit_robust_list() runs --> Futex F userspace value TID field is set to 0 and FUTEX_OWNER_DIED bit is set. 
T3 lock_futex_pi(F); --> Finds pi_state and blocks on pi_state->rt_mutex T1 --> exit_pi_state_list() --> Transfers pi_state to waiter T2 and wakes it via rt_mutex_unlock(&pi_state->mutex) T2 --> acquires pi_state->mutex and gains ownership of the pi_state --> Claims ownership of the futex and sets its own TID into the userspace TID field of futex F --> returns to user space This covers all gazillion points on which T3 might come in between T1's exit_robust_list() clearing the TID field and T2 fixing it up. It also solves the "WAITERS bit stale" problem by forcing the take over. Another benefit of changing the code this way is that it makes it less dependent on untrusted user space values and therefor minimizes the possible wreckage which might be inflicted. As usual after staring for too long at the futex code my brain hurts so much that I really want to ditch that whole optimization of avoiding the syscall for the non contended case for PI futexes and rip out the maze of corner case handling code. Unfortunately we can't as user space relies on that existing behaviour, but at least thinking about it helps me to preserve my mental sanity. Maybe we should nevertheless :) Reported-and-tested-by: Siddhesh Poyarekar Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1210232138540.2756@ionos Acked-by: Darren Hart Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 24bc59c..01b3c55 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -716,7 +716,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct futex_pi_state **ps, struct task_struct *task, int set_waiters) { - int lock_taken, ret, ownerdied = 0; + int lock_taken, ret, force_take = 0; u32 uval, newval, curval, vpid = task_pid_vnr(task); retry: @@ -755,17 +755,15 @@ retry: newval = curval | FUTEX_WAITERS; /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! + * Should we force take the futex? See below. */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ + if (unlikely(force_take)) { + /* + * Keep the OWNER_DIED and the WAITERS bit and set the + * new TID value. + */ newval = (curval & ~FUTEX_TID_MASK) | vpid; - ownerdied = 0; + force_take = 0; lock_taken = 1; } @@ -775,7 +773,7 @@ retry: goto retry; /* - * We took the lock due to owner died take over. + * We took the lock due to forced take over. */ if (unlikely(lock_taken)) return 1; @@ -790,20 +788,25 @@ retry: switch (ret) { case -ESRCH: /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. + * We failed to find an owner for this + * futex. So we have no pi_state to block + * on. This can happen in two cases: + * + * 1) The owner died + * 2) A stale FUTEX_WAITERS bit + * + * Re-read the futex value. */ if (get_futex_value_locked(&curval, uaddr)) return -EFAULT; /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. + * If the owner died or we have a stale + * WAITERS bit the owner TID in the user space + * futex is 0. 
*/ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; + if (!(curval & FUTEX_TID_MASK)) { + force_take = 1; goto retry; } default: -- cgit v1.1 From b87c48d774bd12f6cb08ad8e5d2671d72bcfd4b4 Mon Sep 17 00:00:00 2001 From: Ram Pai Date: Thu, 22 Sep 2011 15:48:58 +0800 Subject: Resource: fix wrong resource window calculation commit 47ea91b4052d9e94b9dca5d7a3d947fbebd07ba9 upstream. __find_resource() incorrectly returns a resource window which overlaps an existing allocated window. This happens when the parent's resource-window spans 0x00000000 to 0xffffffff and is entirely allocated to all its children resource-windows. __find_resource() looks for gaps in resource allocation among the children resource windows. When it encounters the last child window it blindly tries the range next to one allocated to the last child. Since the last child's window ends at 0xffffffff the calculation overflows, leading the algorithm to believe that any window in the range 0x0000000 to 0xfffffff is available for allocation. This leads to a conflicting window allocation. Michal Ludvig reported this issue seen on his platform. The following patch fixes the problem and has been verified by Michal. I believe this bug has been there for ages. It got exposed by git commit 2bbc6942273b ("PCI : ability to relocate assigned pci-resources") Signed-off-by: Ram Pai Tested-by: Michal Ludvig Signed-off-by: Linus Torvalds Cc: Herton Ronaldo Krzesinski Signed-off-by: Greg Kroah-Hartman --- kernel/resource.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3ff4017..b29b83d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old, else tmp.end = root->end; + if (tmp.end < tmp.start) + goto next; + resource_clip(&tmp, constraint->min, constraint->max); arch_remove_reservations(&tmp); @@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old, return 0; } } - if (!this) + +next: if (!this || this->end == root->end) break; + if (this != old) tmp.start = this->end + 1; this = this->sibling; -- cgit v1.1 From beb93b17f27dee8a85db7bf0d882b59cdff4b2a0 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 26 Nov 2012 16:29:56 -0800 Subject: futex: avoid wake_futex() for a PI futex_q commit aa10990e028cac3d5e255711fb9fb47e00700e35 upstream. Dave Jones reported a bug with futex_lock_pi() that his trinity test exposed. Sometime between queue_me() and taking the q.lock_ptr, the lock_ptr became NULL, resulting in a crash. While futex_wake() is careful to not call wake_futex() on futex_q's with a pi_state or an rt_waiter (which are either waiting for a futex_unlock_pi() or a PI futex_requeue()), futex_wake_op() and futex_requeue() do not perform the same test. Update futex_wake_op() and futex_requeue() to test for q.pi_state and q.rt_waiter and abort with -EINVAL if detected. To ensure any future breakage is caught, add a WARN() to wake_futex() if the same condition is true. This fix has seen 3 hours of testing with "trinity -c futex" on an x86_64 VM with 4 CPUS. 
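For readers following the two futex changes in this series, a short reminder of the raw interface they guard: glibc exposes no futex() function, so callers go through syscall(2). The wrapper below is an illustrative sketch, not code from either patch.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <time.h>

/* Thin wrapper around the six-argument futex syscall. */
static long futex(uint32_t *uaddr, int op, uint32_t val,
		  const struct timespec *timeout, uint32_t *uaddr2, uint32_t val3)
{
	return syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3);
}

int main(void)
{
	uint32_t word = 0;

	/*
	 * A waiter blocked via FUTEX_LOCK_PI carries kernel-side pi_state and
	 * one blocked via FUTEX_WAIT_REQUEUE_PI carries an rt_waiter; after
	 * this patch, pointing FUTEX_WAKE_OP or FUTEX_CMP_REQUEUE at such a
	 * waiter fails with -EINVAL instead of reaching wake_futex().
	 *
	 * Plain FUTEX_WAKE on an uncontended word is harmless and returns 0.
	 */
	return (int)futex(&word, FUTEX_WAKE, 1, NULL, NULL, 0);
}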
[akpm@linux-foundation.org: tidy up the WARN()] Signed-off-by: Darren Hart Reported-by: Dave Jones Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: John Kacur Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 01b3c55..91691e9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -843,6 +843,9 @@ static void wake_futex(struct futex_q *q) { struct task_struct *p = q->task; + if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) + return; + /* * We set q->lock_ptr = NULL _before_ we wake up the task. If * a non-futex wake up happens on another CPU then the task @@ -1078,6 +1081,10 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++ret >= nr_wake) break; @@ -1090,6 +1097,10 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { + if (this->pi_state || this->rt_waiter) { + ret = -EINVAL; + goto out_unlock; + } wake_futex(this); if (++op_ret >= nr_wake2) break; @@ -1098,6 +1109,7 @@ retry_private: ret += op_ret; } +out_unlock: double_unlock_hb(hb1, hb2); out_put_keys: put_futex_key(&key2); @@ -1387,9 +1399,13 @@ retry_private: /* * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always * be paired with each other and no other futex ops. + * + * We should never be requeueing a futex_q with a pi_state, + * which is awaiting a futex_unlock_pi(). */ if ((requeue_pi && !this->rt_waiter) || - (!requeue_pi && this->rt_waiter)) { + (!requeue_pi && this->rt_waiter) || + this->pi_state) { ret = -EINVAL; break; } -- cgit v1.1 From 872edb5fe78156f8a589e544bbe54eae07226356 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 26 Nov 2012 16:29:54 -0800 Subject: watchdog: using u64 in get_sample_period() commit 8ffeb9b0e6369135bf03a073514f571ef10606b9 upstream. In get_sample_period(), unsigned long is not enough: watchdog_thresh * 2 * (NSEC_PER_SEC / 5) case1: watchdog_thresh is 10 by default, the sample value will be: 0xEE6B2800 case2: set watchdog_thresh is 20, the sample value will be: 0x1 DCD6 5000 In case2, we need use u64 to express the sample period. Otherwise, changing the threshold thru proc often can not be successful. 
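The truncation is easy to reproduce outside the kernel (a userspace sketch of the same arithmetic, not the watchdog code): with watchdog_thresh raised to 20 the product needs 33 bits, so a 32-bit unsigned long drops the top bit and the sample period silently becomes roughly 3.7 s instead of the requested 8 s.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t NSEC_PER_SEC = 1000000000ULL;
	int watchdog_thresh = 20;			/* value written via proc */
	int softlockup_thresh = watchdog_thresh * 2;	/* the "* 2" factor above */

	/* What a 32-bit unsigned long makes of the product. */
	uint32_t wrapped = (uint32_t)(softlockup_thresh * (uint32_t)(NSEC_PER_SEC / 5));
	/* With the (u64) cast from the patch the full value survives. */
	uint64_t full = softlockup_thresh * (NSEC_PER_SEC / 5);

	printf("wrapped: 0x%x ns (%.2f s)\n", (unsigned int)wrapped, wrapped / 1e9);
	printf("full   : 0x%llx ns (%.2f s)\n", (unsigned long long)full, full / 1e9);
	return 0;
}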
Signed-off-by: liu chuansheng Acked-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3d0c56a..53e4432 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -113,7 +113,7 @@ static unsigned long get_timestamp(int this_cpu) return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ } -static unsigned long get_sample_period(void) +static u64 get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns @@ -121,7 +121,7 @@ static unsigned long get_sample_period(void) * increment before the hardlockup detector generates * a warning */ - return get_softlockup_thresh() * (NSEC_PER_SEC / 5); + return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ -- cgit v1.1 From cc3c85dfa1a7e8d32c6e625176269f9dff88447d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 28 Nov 2012 07:17:18 +0100 Subject: workqueue: exit rescuer_thread() as TASK_RUNNING commit 412d32e6c98527078779e5b515823b2810e40324 upstream. A rescue thread exiting TASK_INTERRUPTIBLE can lead to a task scheduling off, never to be seen again. In the case where this occurred, an exiting thread hit reiserfs homebrew conditional resched while holding a mutex, bringing the box to its knees. PID: 18105 TASK: ffff8807fd412180 CPU: 5 COMMAND: "kdmflush" #0 [ffff8808157e7670] schedule at ffffffff8143f489 #1 [ffff8808157e77b8] reiserfs_get_block at ffffffffa038ab2d [reiserfs] #2 [ffff8808157e79a8] __block_write_begin at ffffffff8117fb14 #3 [ffff8808157e7a98] reiserfs_write_begin at ffffffffa0388695 [reiserfs] #4 [ffff8808157e7ad8] generic_perform_write at ffffffff810ee9e2 #5 [ffff8808157e7b58] generic_file_buffered_write at ffffffff810eeb41 #6 [ffff8808157e7ba8] __generic_file_aio_write at ffffffff810f1a3a #7 [ffff8808157e7c58] generic_file_aio_write at ffffffff810f1c88 #8 [ffff8808157e7cc8] do_sync_write at ffffffff8114f850 #9 [ffff8808157e7dd8] do_acct_process at ffffffff810a268f [exception RIP: kernel_thread_helper] RIP: ffffffff8144a5c0 RSP: ffff8808157e7f58 RFLAGS: 00000202 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8107af60 RDI: ffff8803ee491d18 RBP: 0000000000000000 R8: 0000000000000000 R9: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 Signed-off-by: Mike Galbraith Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index aef9452..472cfe3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2044,8 +2044,10 @@ static int rescuer_thread(void *__wq) repeat: set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); return 0; + } /* * See whether any cpu is asking for help. Unbounded -- cgit v1.1 From e86b690c9659a03d8a6bb7ded18894718ab64360 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 3 Dec 2012 06:25:25 +0100 Subject: Revert "sched, autogroup: Stop going ahead if autogroup is disabled" commit fd8ef11730f1d03d5d6555aa53126e9e34f52f12 upstream. 
This reverts commit 800d4d30c8f20bd728e5741a3b77c4859a613f7c. Between commits 8323f26ce342 ("sched: Fix race in task_group()") and 800d4d30c8f2 ("sched, autogroup: Stop going ahead if autogroup is disabled"), autogroup is a wreck. With both applied, all you have to do to crash a box is disable autogroup during boot up, then reboot.. boom, NULL pointer dereference due to commit 800d4d30c8f2 not allowing autogroup to move things, and commit 8323f26ce342 making that the only way to switch runqueues: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] effective_load.isra.43+0x50/0x90 Pid: 7047, comm: systemd-user-se Not tainted 3.6.8-smp #7 MEDIONPC MS-7502/MS-7502 RIP: effective_load.isra.43+0x50/0x90 Process systemd-user-se (pid: 7047, threadinfo ffff880221dde000, task ffff88022618b3a0) Call Trace: select_task_rq_fair+0x255/0x780 try_to_wake_up+0x156/0x2c0 wake_up_state+0xb/0x10 signal_wake_up+0x28/0x40 complete_signal+0x1d6/0x250 __send_signal+0x170/0x310 send_signal+0x40/0x80 do_send_sig_info+0x47/0x90 group_send_sig_info+0x4a/0x70 kill_pid_info+0x3a/0x60 sys_kill+0x97/0x1a0 ? vfs_read+0x120/0x160 ? sys_read+0x45/0x90 system_call_fastpath+0x16/0x1b Code: 49 0f af 41 50 31 d2 49 f7 f0 48 83 f8 01 48 0f 46 c6 48 2b 07 48 8b bf 40 01 00 00 48 85 ff 74 3a 45 31 c0 48 8b 8f 50 01 00 00 <48> 8b 11 4c 8b 89 80 00 00 00 49 89 d2 48 01 d0 45 8b 59 58 4c RIP [] effective_load.isra.43+0x50/0x90 RSP CR2: 0000000000000000 Signed-off-by: Mike Galbraith Acked-by: Ingo Molnar Cc: Yong Zhang Cc: Peter Zijlstra Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sched_autogroup.c | 4 ---- kernel/sched_autogroup.h | 5 ----- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 429242f..f280df1 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -160,15 +160,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) p->signal->autogroup = autogroup_kref_get(ag); - if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) - goto out; - t = p; do { sched_move_task(t); } while_each_thread(p, t); -out: unlock_task_sighand(p, &flags); autogroup_kref_put(prev); } diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 0557705..7b859ff 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -1,11 +1,6 @@ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { - /* - * reference doesn't mean how many thread attach to this - * autogroup now. It just stands for the number of task - * could use this autogroup. - */ struct kref kref; struct task_group *tg; struct rw_semaphore lock; -- cgit v1.1 From 3ff0aeceb49aca3f99c37189e949bf43bfd393a0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 4 Dec 2012 07:40:39 -0800 Subject: workqueue: convert BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s commit fc4b514f2727f74a4587c31db87e0e93465518c3 upstream. 8852aac25e ("workqueue: mod_delayed_work_on() shouldn't queue timer on 0 delay") unexpectedly uncovered a very nasty abuse of delayed_work in megaraid - it allocated work_struct, casted it to delayed_work and then pass that into queue_delayed_work(). Previously, this was okay because 0 @delay short-circuited to queue_work() before doing anything with delayed_work. 8852aac25e moved 0 @delay test into __queue_delayed_work() after sanity check on delayed_work making megaraid trigger BUG_ON(). 
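The abuse is easier to picture with a userspace analogue (invented structs, not the kernel definitions): the caller allocates only the base object but hands it off as the container that embeds a timer, so any code that inspects the timer field reads past the end of the allocation. The old ordering never reached that field for a 0 delay.

#include <stdio.h>
#include <stdlib.h>

struct fake_work {
	void (*func)(struct fake_work *);
};

struct fake_delayed_work {
	struct fake_work work;
	long timer;			/* exists only in the container */
};

static void handler(struct fake_work *w)
{
	(void)w;
	puts("handler ran");
}

/* New ordering: the sanity check on the embedded timer runs before the
 * 0-delay short-circuit, so it touches storage the caller never allocated. */
static void fake_queue_delayed(struct fake_delayed_work *dw, long delay)
{
	if (dw->timer)			/* stand-in for BUG_ON(timer_pending(timer)) */
		fprintf(stderr, "bogus pending timer\n");
	if (!delay) {			/* the old code short-circuited here first */
		dw->work.func(&dw->work);
		return;
	}
	dw->timer = delay;
}

int main(void)
{
	struct fake_work *w = malloc(sizeof(*w));	/* deliberately too small */

	w->func = handler;
	/* Out-of-bounds read of dw->timer; AddressSanitizer reports it. */
	fake_queue_delayed((struct fake_delayed_work *)w, 0);
	free(w);
	return 0;
}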
Although megaraid is already fixed by c1d390d8e6 ("megaraid: fix BUG_ON() from incorrect use of delayed work"), this patch converts BUG_ON()s in __queue_delayed_work() to WARN_ON_ONCE()s so that such abusers, if there are more, trigger warning but don't crash the machine. Signed-off-by: Tejun Heo Cc: Xiaotian Feng Signed-off-by: Shuah Khan Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 472cfe3..dc8438d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1145,8 +1145,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { unsigned int lcpu; - BUG_ON(timer_pending(timer)); - BUG_ON(!list_empty(&work->entry)); + WARN_ON_ONCE(timer_pending(timer)); + WARN_ON_ONCE(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); -- cgit v1.1 From 7a09a2fff84eda4ac1bba98a61a61bd58f369bed Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Jun 2012 19:10:27 +0300 Subject: ftrace: Clear bits properly in reset_iter_read() commit 70f77b3f7ec010ff9624c1f2e39a81babc9e2429 upstream. There is a typo here where '&' is used instead of '|' and it turns the statement into a noop. The original code is equivalent to: iter->flags &= ~((1 << 2) & (1 << 4)); Link: http://lkml.kernel.org/r/20120609161027.GD6488@elgon.mountain Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9f8e2e1..f88ea18 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2058,7 +2058,7 @@ static void reset_iter_read(struct ftrace_iterator *iter) { iter->pos = 0; iter->func_pos = 0; - iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) -- cgit v1.1 From d0389110863a993f19ba11bdf9ac639df713a1f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 3 Nov 2012 11:52:09 +0100 Subject: genirq: Always force thread affinity commit 04aa530ec04f61875b99c12721162e2964e3318c upstream. Sankara reported that the genirq core code fails to adjust the affinity of an interrupt thread in several cases: 1) On request/setup_irq() the call to setup_affinity() happens before the new action is registered, so the new thread is not notified. 2) For secondary shared interrupts nothing notifies the new thread to change its affinity. 3) Interrupts which have the IRQ_NO_BALANCE flag set are not moving the thread either. Fix this by setting the thread affinity flag right on thread creation time. This ensures that under all circumstances the thread moves to the right place. 
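A rough userspace analogue of the pattern (pthreads, invented names; this is not the kernel's irq thread code): the creator raises an "affinity pending" flag before the thread has ever run, and the thread applies the mask itself on its first pass, so nothing has to notify it later.

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int affinity_pending;	/* analogue of IRQTF_AFFINITY */
static cpu_set_t wanted;

static void *worker(void *arg)
{
	(void)arg;

	/* First action: test-and-clear the flag and apply the mask. */
	if (atomic_exchange(&affinity_pending, 0))
		pthread_setaffinity_np(pthread_self(), sizeof(wanted), &wanted);

	/* ... handle the actual work here ... */
	puts("worker bound itself before doing any work");
	return NULL;
}

int main(void)
{
	pthread_t t;

	CPU_ZERO(&wanted);
	CPU_SET(0, &wanted);
	atomic_store(&affinity_pending, 1);	/* set at creation time, as the patch does */
	pthread_create(&t, NULL, worker, NULL);
	pthread_join(t, NULL);
	return 0;
}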
Requires a check in irq_thread_check_affinity for an existing affinity mask (CONFIG_CPU_MASK_OFFSTACK=y) Reported-and-tested-by: Sankara Muthukrishnan Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1209041738200.2754@ionos Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/manage.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index fa4a70e..3e1bdf9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -698,6 +698,7 @@ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { cpumask_var_t mask; + bool valid = true; if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) return; @@ -712,10 +713,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->irq_data.affinity); + /* + * This code is triggered unconditionally. Check the affinity + * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. + */ + if (desc->irq_data.affinity) + cpumask_copy(mask, desc->irq_data.affinity); + else + valid = false; raw_spin_unlock_irq(&desc->lock); - set_cpus_allowed_ptr(current, mask); + if (valid) + set_cpus_allowed_ptr(current, mask); free_cpumask_var(mask); } #else @@ -925,6 +934,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) */ get_task_struct(t); new->thread = t; + /* + * Tell the thread to set its affinity. This is + * important for shared interrupt handlers as we do + * not invoke setup_affinity() for the secondary + * handlers as everything is already set up. Even for + * interrupts marked with IRQF_NO_BALANCE this is + * correct as we want the thread to move to the cpu(s) + * on which the requesting code placed the interrupt. + */ + set_bit(IRQTF_AFFINITY, &new->thread_flags); } if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { -- cgit v1.1 From 01fdcf4e373725f4a1238d9876063fd0b51bed15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:35 -0800 Subject: cgroup: remove incorrect dget/dput() pair in cgroup_create_dir() commit 175431635ec09b1d1bba04979b006b99e8305a83 upstream. cgroup_create_dir() does weird dancing with dentry refcnt. On success, it gets and then puts it achieving nothing. On failure, it puts but there isn't no matching get anywhere leading to the following oops if cgroup_create_file() fails for whatever reason. ------------[ cut here ]------------ kernel BUG at /work/os/work/fs/dcache.c:552! 
invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU 2 Pid: 697, comm: mkdir Not tainted 3.7.0-rc4-work+ #3 Bochs Bochs RIP: 0010:[] [] dput+0x1dc/0x1e0 RSP: 0018:ffff88001a3ebef8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88000e5b1ef8 RCX: 0000000000000403 RDX: 0000000000000303 RSI: 2000000000000000 RDI: ffff88000e5b1f58 RBP: ffff88001a3ebf18 R08: ffffffff82c76960 R09: 0000000000000001 R10: ffff880015022080 R11: ffd9bed70f48a041 R12: 00000000ffffffea R13: 0000000000000001 R14: ffff88000e5b1f58 R15: 00007fff57656d60 FS: 00007ff05fcb3800(0000) GS:ffff88001fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004046f0 CR3: 000000001315f000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process mkdir (pid: 697, threadinfo ffff88001a3ea000, task ffff880015022080) Stack: ffff88001a3ebf48 00000000ffffffea 0000000000000001 0000000000000000 ffff88001a3ebf38 ffffffff811cc889 0000000000000001 ffff88000e5b1ef8 ffff88001a3ebf68 ffffffff811d1fc9 ffff8800198d7f18 ffff880019106ef8 Call Trace: [] done_path_create+0x19/0x50 [] sys_mkdirat+0x59/0x80 [] sys_mkdir+0x19/0x20 [] system_call_fastpath+0x16/0x1b Code: 00 48 8d 90 18 01 00 00 48 89 93 c0 00 00 00 4c 89 a0 18 01 00 00 48 8b 83 a0 00 00 00 83 80 28 01 00 00 01 e8 e6 6f a0 00 eb 92 <0f> 0b 66 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 49 89 fe 41 RIP [] dput+0x1dc/0x1e0 RSP ---[ end trace 1277bcfd9561ddb0 ]--- Fix it by dropping the unnecessary dget/dput() pair. Signed-off-by: Tejun Heo Acked-by: Li Zefan Signed-off-by: Greg Kroah-Hartman --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 69158d5..1749dcd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2630,9 +2630,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); rcu_assign_pointer(cgrp->dentry, dentry); - dget(dentry); } - dput(dentry); return error; } -- cgit v1.1
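The bug class behind this last patch is worth a standalone illustration (toy refcount with invented names, not the dentry code): an error path that put()s a reference it never took steals the caller's reference, and the caller's own later put() then underflows, which is what the dput() BUG above is reporting.

#include <assert.h>
#include <stdio.h>

struct obj {
	int refs;
};

static void obj_get(struct obj *o)
{
	o->refs++;
}

static void obj_put(struct obj *o)
{
	assert(o->refs > 0);	/* stand-in for the BUG in dput() */
	if (--o->refs == 0)
		puts("refcount hit zero: object would be freed here");
}

/* Buggy creation helper, mirroring the changelog: on success the get/put
 * pair achieves nothing, and on failure it drops a reference with no
 * matching get, i.e. the one the caller still owns. */
static int create(struct obj *o, int fail)
{
	if (fail) {
		obj_put(o);	/* unmatched put */
		return -1;
	}
	obj_get(o);
	/* ... real work ... */
	obj_put(o);
	return 0;
}

int main(void)
{
	struct obj o = { .refs = 1 };	/* the caller's reference */

	create(&o, 1);			/* failure path drops the caller's ref */
	obj_put(&o);			/* caller's own put now trips the assert */
	return 0;
}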