aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorZiyan <jaraidaniel@gmail.com>2015-10-24 18:19:09 +0200
committerZiyan <jaraidaniel@gmail.com>2015-10-25 16:24:52 +0100
commit101878938737bd16a5f7fb932b51041f7dbbb733 (patch)
tree235732d29c6741f22f2020218b2fad41c40b57b8 /kernel
parent540bea4ab32149f8bc71fe34b73340c8d9abc053 (diff)
parent5dba9ddd98cbc7ad319d687887981a0ea0062c75 (diff)
downloadkernel_samsung_espresso10-101878938737bd16a5f7fb932b51041f7dbbb733.zip
kernel_samsung_espresso10-101878938737bd16a5f7fb932b51041f7dbbb733.tar.gz
kernel_samsung_espresso10-101878938737bd16a5f7fb932b51041f7dbbb733.tar.bz2
Merge remote-tracking branch 'linux-stable/linux-3.0.y' into p-android-omap-3.0-dev-espresso
Conflicts: Makefile arch/arm/include/asm/hardware/cache-l2x0.h arch/arm/kernel/smp.c arch/arm/mach-omap2/board-4430sdp.c arch/arm/mach-omap2/board-omap4panda.c arch/arm/mach-omap2/opp.c arch/ia64/include/asm/futex.h drivers/bluetooth/ath3k.c drivers/bluetooth/btusb.c drivers/firmware/efivars.c drivers/gpu/drm/i915/intel_lvds.c drivers/gpu/drm/radeon/radeon_atombios.c drivers/gpu/drm/radeon/radeon_irq_kms.c drivers/hwmon/fam15h_power.c drivers/mfd/twl6030-irq.c drivers/mmc/core/sdio.c drivers/net/tun.c drivers/net/usb/ipheth.c drivers/net/usb/usbnet.c drivers/usb/core/hub.c drivers/usb/host/xhci-mem.c drivers/usb/host/xhci.h drivers/usb/musb/omap2430.c drivers/usb/serial/ftdi_sio.c drivers/usb/serial/ftdi_sio_ids.h drivers/usb/serial/option.c drivers/usb/serial/qcserial.c drivers/usb/serial/ti_usb_3410_5052.c drivers/usb/serial/ti_usb_3410_5052.h drivers/video/omap2/dss/hdmi.c fs/splice.c include/asm-generic/pgtable.h include/net/sch_generic.h kernel/cgroup.c kernel/futex.c kernel/time/timekeeping.c net/ipv4/route.c net/ipv4/syncookies.c net/ipv4/tcp_ipv4.c net/wireless/util.c security/commoncap.c sound/soc/soc-dapm.c
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c13
-rw-r--r--kernel/audit_tree.c13
-rw-r--r--kernel/cgroup.c18
-rw-r--r--kernel/compat.c63
-rw-r--r--kernel/cpuset.c78
-rw-r--r--kernel/events/core.c143
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/exit.c16
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/futex.c79
-rw-r--r--kernel/hrtimer.c94
-rw-r--r--kernel/irq/handle.c7
-rw-r--r--kernel/irq/manage.c46
-rw-r--r--kernel/irq/spurious.c7
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/posix-cpu-timers.c23
-rw-r--r--kernel/posix-timers.c7
-rw-r--r--kernel/power/hibernate.c6
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/printk.c13
-rw-r--r--kernel/ptrace.c63
-rw-r--r--kernel/rcutree.c4
-rw-r--r--kernel/relay.c5
-rw-r--r--kernel/resource.c57
-rw-r--r--kernel/sched.c90
-rw-r--r--kernel/sched_autogroup.c4
-rw-r--r--kernel/sched_autogroup.h5
-rw-r--r--kernel/sched_clock.c26
-rw-r--r--kernel/sched_fair.c2
-rw-r--r--kernel/signal.c22
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/sys.c16
-rw-r--r--kernel/sysctl_binary.c3
-rw-r--r--kernel/time/ntp.c128
-rw-r--r--kernel/time/tick-broadcast.c17
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c149
-rw-r--r--kernel/timeconst.pl6
-rw-r--r--kernel/timer.c20
-rw-r--r--kernel/trace/Kconfig24
-rw-r--r--kernel/trace/ftrace.c102
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c125
-rw-r--r--kernel/trace/trace.h7
-rw-r--r--kernel/trace/trace_irqsoff.c19
-rw-r--r--kernel/trace/trace_sched_wakeup.c18
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/trace/trace_stack.c76
-rw-r--r--kernel/trace/trace_stat.c2
-rw-r--r--kernel/trace/trace_syscalls.c21
-rw-r--r--kernel/watchdog.c4
-rw-r--r--kernel/workqueue.c91
54 files changed, 1203 insertions, 578 deletions
diff --git a/kernel/async.c b/kernel/async.c
index cd9dbb9..04f66e3 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -87,6 +87,13 @@ static async_cookie_t __lowest_in_progress(struct list_head *running)
{
struct async_entry *entry;
+ if (!running) { /* just check the entry count */
+ if (atomic_read(&entry_count))
+ return 0; /* smaller than any cookie */
+ else
+ return next_cookie;
+ }
+
if (!list_empty(running)) {
entry = list_first_entry(running,
struct async_entry, list);
@@ -236,9 +243,7 @@ EXPORT_SYMBOL_GPL(async_schedule_domain);
*/
void async_synchronize_full(void)
{
- do {
- async_synchronize_cookie(next_cookie);
- } while (!list_empty(&async_running) || !list_empty(&async_pending));
+ async_synchronize_cookie_domain(next_cookie, NULL);
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
@@ -258,7 +263,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
/**
* async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
* @cookie: async_cookie_t to use as checkpoint
- * @running: running list to synchronize on
+ * @running: running list to synchronize on, NULL indicates all lists
*
* This function waits until all asynchronous function calls for the
* synchronization domain specified by the running list @list submitted
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e99dda0..7d9731d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -256,7 +256,6 @@ static void untag_chunk(struct node *p)
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
goto out;
}
@@ -265,7 +264,7 @@ static void untag_chunk(struct node *p)
fsnotify_duplicate_mark(&new->mark, entry);
if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
- free_chunk(new);
+ fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -299,7 +298,6 @@ static void untag_chunk(struct node *p)
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
goto out;
Fallback:
@@ -328,7 +326,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
entry = &chunk->mark;
if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
- free_chunk(chunk);
+ fsnotify_put_mark(entry);
return -ENOSPC;
}
@@ -338,6 +336,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&entry->lock);
+ fsnotify_get_mark(entry);
fsnotify_destroy_mark(entry);
fsnotify_put_mark(entry);
return 0;
@@ -402,7 +401,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
fsnotify_duplicate_mark(chunk_entry, old_entry);
if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
- free_chunk(chunk);
+ fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
}
@@ -418,6 +417,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
+ fsnotify_get_mark(chunk_entry);
fsnotify_destroy_mark(chunk_entry);
fsnotify_put_mark(chunk_entry);
@@ -451,7 +451,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&old_entry->lock);
fsnotify_destroy_mark(old_entry);
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
- fsnotify_put_mark(old_entry); /* and kill it */
return 0;
}
@@ -615,9 +614,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5083a09..2ba3bac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1803,9 +1803,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
* trading it for newcg is protected by cgroup_mutex, we're safe to drop
* it here; it will be freed under RCU.
*/
- put_css_set(oldcg);
-
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ put_css_set(oldcg);
return 0;
}
@@ -2029,7 +2028,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
if (retval)
goto out_free_group_list;
@@ -2666,9 +2665,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
dentry->d_fsdata = cgrp;
inc_nlink(parent->d_inode);
rcu_assign_pointer(cgrp->dentry, dentry);
- dget(dentry);
}
- dput(dentry);
return error;
}
@@ -3528,6 +3525,7 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
+ struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
@@ -3582,6 +3580,16 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
goto fail;
}
+ /*
+ * The file to be monitored must be in the same cgroup as
+ * cgroup.event_control is.
+ */
+ cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
+ if (cgrp_cfile != cgrp) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
diff --git a/kernel/compat.c b/kernel/compat.c
index fc9eb093..3507c93 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -318,25 +318,54 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
- compat_old_sigset_t __user *oset)
+/*
+ * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the
+ * blocked set of signals to the supplied signal set
+ */
+static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set)
{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs;
+ memcpy(blocked->sig, &set, sizeof(set));
+}
- if (set && get_user(s, set))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = sys_sigprocmask(how,
- set ? (old_sigset_t __user *) &s : NULL,
- oset ? (old_sigset_t __user *) &s : NULL);
- set_fs(old_fs);
- if (ret == 0)
- if (oset)
- ret = put_user(s, oset);
- return ret;
+asmlinkage long compat_sys_sigprocmask(int how,
+ compat_old_sigset_t __user *nset,
+ compat_old_sigset_t __user *oset)
+{
+ old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
+
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (get_user(new_set, nset))
+ return -EFAULT;
+ new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
+
+ new_blocked = current->blocked;
+
+ switch (how) {
+ case SIG_BLOCK:
+ sigaddsetmask(&new_blocked, new_set);
+ break;
+ case SIG_UNBLOCK:
+ sigdelsetmask(&new_blocked, new_set);
+ break;
+ case SIG_SETMASK:
+ compat_sig_setmask(&new_blocked, new_set);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
+ if (put_user(old_set, oset))
+ return -EFAULT;
+ }
+
+ return 0;
}
#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ff7585d..112790d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
/* bits in struct cpuset flags field */
typedef enum {
CS_CPU_EXCLUSIVE,
@@ -949,7 +962,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
-repeat:
+ bool need_loop;
+
/*
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
@@ -960,46 +974,27 @@ repeat:
return;
task_lock(tsk);
- nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
-
/*
- * ensure checking ->mems_allowed_change_disable after setting all new
- * allowed nodes.
- *
- * the read-side task can see an nodemask with new allowed nodes and
- * old allowed nodes. and if it allocates page when cpuset clears newly
- * disallowed ones continuous, it can see the new allowed bits.
- *
- * And if setting all new allowed nodes is after the checking, setting
- * all new allowed nodes and clearing newly disallowed ones will be done
- * continuous, and the read-side task may find no node to alloc page.
+ * Determine if a loop is necessary if another thread is doing
+ * get_mems_allowed(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
*/
- smp_mb();
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);
- /*
- * Allocation of memory is very fast, we needn't sleep when waiting
- * for the read-side.
- */
- while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
- task_unlock(tsk);
- if (!task_curr(tsk))
- yield();
- goto repeat;
- }
+ if (need_loop)
+ write_seqcount_begin(&tsk->mems_allowed_seq);
- /*
- * ensure checking ->mems_allowed_change_disable before clearing all new
- * disallowed nodes.
- *
- * if clearing newly disallowed bits before the checking, the read-side
- * task may find no node to alloc page.
- */
- smp_mb();
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
+
+ if (need_loop)
+ write_seqcount_end(&tsk->mems_allowed_seq);
+
task_unlock(tsk);
}
@@ -2085,6 +2080,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
* (of no affect) on systems that are actively using CPU hotplug
* but making no active use of cpusets.
*
+ * The only exception to this is suspend/resume, where we don't
+ * modify cpusets at all.
+ *
* This routine ensures that top_cpuset.cpus_allowed tracks
* cpu_active_mask on each CPU hotplug (cpuhp) event.
*
@@ -2489,8 +2487,16 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk)
dentry = task_cs(tsk)->css.cgroup->dentry;
spin_lock(&cpuset_buffer_lock);
- snprintf(cpuset_name, CPUSET_NAME_LEN,
- dentry ? (const char *)dentry->d_name.name : "/");
+
+ if (!dentry) {
+ strcpy(cpuset_name, "/");
+ } else {
+ spin_lock(&dentry->d_lock);
+ strlcpy(cpuset_name, (const char *)dentry->d_name.name,
+ CPUSET_NAME_LEN);
+ spin_unlock(&dentry->d_lock);
+ }
+
nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
tsk->mems_allowed);
printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 32a6151..acdc087 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -223,9 +223,9 @@ perf_cgroup_match(struct perf_event *event)
return !event->cgrp || event->cgrp == cpuctx->cgrp;
}
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
{
- css_get(&event->cgrp->css);
+ return css_tryget(&event->cgrp->css);
}
static inline void perf_put_cgroup(struct perf_event *event)
@@ -342,6 +342,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ continue; /* ensure we process each cpuctx once */
perf_pmu_disable(cpuctx->ctx.pmu);
@@ -365,9 +367,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
if (mode & PERF_CGROUP_SWIN) {
WARN_ON_ONCE(cpuctx->cgrp);
- /* set cgrp before ctxsw in to
- * allow event_filter_match() to not
- * have to pass task around
+ /*
+ * set cgrp before ctxsw in to allow
+ * event_filter_match() to not have to pass
+ * task around
*/
cpuctx->cgrp = perf_cgroup_from_task(task);
cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
@@ -415,7 +418,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
event->cgrp = cgrp;
/* must be done before we fput() the file */
- perf_get_cgroup(event);
+ if (!perf_tryget_cgroup(event)) {
+ event->cgrp = NULL;
+ ret = -ENOENT;
+ goto out;
+ }
/*
* all events in a group must monitor
@@ -651,8 +658,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
- rcu_read_lock();
retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
@@ -668,6 +685,8 @@ retry:
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
goto retry;
}
@@ -677,6 +696,7 @@ retry:
}
}
rcu_read_unlock();
+ preempt_enable();
return ctx;
}
@@ -826,6 +846,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
}
/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
+}
+
+/*
* Called at perf_event creation and when events are attached/detached from a
* group.
*/
@@ -1616,7 +1645,16 @@ static int __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- if (WARN_ON_ONCE(!ctx->is_active))
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -2969,12 +3007,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
{
- struct perf_event *event = file->private_data;
struct task_struct *owner;
- file->private_data = NULL;
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
@@ -3009,7 +3047,13 @@ static int perf_release(struct inode *inode, struct file *file)
put_task_struct(owner);
}
- return perf_event_release_kernel(event);
+ perf_event_release_kernel(event);
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3241,7 +3285,7 @@ unlock:
static const struct file_operations perf_fops;
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
{
struct file *file;
@@ -3255,7 +3299,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
return ERR_PTR(-EBADF);
}
- return file->private_data;
+ return file;
}
static int perf_event_set_output(struct perf_event *event,
@@ -3287,19 +3331,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_OUTPUT:
{
+ struct file *output_file = NULL;
struct perf_event *output_event = NULL;
int fput_needed = 0;
int ret;
if (arg != -1) {
- output_event = perf_fget_light(arg, &fput_needed);
- if (IS_ERR(output_event))
- return PTR_ERR(output_event);
+ output_file = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_file))
+ return PTR_ERR(output_file);
+ output_event = output_file->private_data;
}
ret = perf_event_set_output(event, output_event);
if (output_event)
- fput_light(output_event->filp, fput_needed);
+ fput_light(output_file, fput_needed);
return ret;
}
@@ -4536,7 +4582,7 @@ static void perf_event_task_event(struct perf_task_event *task_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
+ if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_task_ctx(&cpuctx->ctx, task_event);
@@ -4682,7 +4728,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
+ if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_comm_ctx(&cpuctx->ctx, comm_event);
@@ -4878,7 +4924,7 @@ got_name:
rcu_read_lock();
list_for_each_entry_rcu(pmu, &pmus, entry) {
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->active_pmu != pmu)
+ if (cpuctx->unique_pmu != pmu)
goto next;
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
vma->vm_flags & VM_EXEC);
@@ -5432,7 +5478,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
@@ -5904,8 +5950,8 @@ static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- if (cpuctx->active_pmu == old_pmu)
- cpuctx->active_pmu = pmu;
+ if (cpuctx->unique_pmu == old_pmu)
+ cpuctx->unique_pmu = pmu;
}
}
@@ -6037,7 +6083,7 @@ skip_type:
cpuctx->ctx.pmu = pmu;
cpuctx->jiffies_interval = 1;
INIT_LIST_HEAD(&cpuctx->rotation_list);
- cpuctx->active_pmu = pmu;
+ cpuctx->unique_pmu = pmu;
}
got_cpu_context:
@@ -6181,6 +6227,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->mmap_mutex);
+ atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
@@ -6210,8 +6257,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->overflow_handler = overflow_handler;
- if (attr->disabled)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event__state_init(event);
pmu = NULL;
@@ -6455,12 +6501,12 @@ SYSCALL_DEFINE5(perf_event_open,
return event_fd;
if (group_fd != -1) {
- group_leader = perf_fget_light(group_fd, &fput_needed);
- if (IS_ERR(group_leader)) {
- err = PTR_ERR(group_leader);
+ group_file = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_file)) {
+ err = PTR_ERR(group_file);
goto err_fd;
}
- group_file = group_leader->filp;
+ group_leader = group_file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6585,16 +6631,23 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_lock(&gctx->mutex);
perf_remove_from_context(group_leader);
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling);
+ perf_event__state_init(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
put_ctx(gctx);
}
- event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
@@ -6682,7 +6735,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_free;
}
- event->filp = NULL;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
@@ -6731,7 +6783,7 @@ static void sync_child_event(struct perf_event *child_event,
* Release the parent event, if this was the last
* reference to it.
*/
- fput(parent_event->filp);
+ put_event(parent_event);
}
static void
@@ -6807,9 +6859,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*
* __perf_event_exit_task()
* sync_child_event()
- * fput(parent_event->filp)
- * perf_release()
- * mutex_lock(&ctx->mutex)
+ * put_event()
+ * mutex_lock(&ctx->mutex)
*
* But since its the parent context it won't be the same instance.
*/
@@ -6877,7 +6928,7 @@ static void perf_free_event(struct perf_event *event,
list_del_init(&event->child_list);
mutex_unlock(&parent->child_mutex);
- fput(parent->filp);
+ put_event(parent);
perf_group_detach(event);
list_del_event(event, ctx);
@@ -6957,6 +7008,12 @@ inherit_event(struct perf_event *parent_event,
NULL);
if (IS_ERR(child_event))
return child_event;
+
+ if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
get_ctx(child_ctx);
/*
@@ -6996,14 +7053,6 @@ inherit_event(struct perf_event *parent_event,
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
- * Get a reference to the parent filp - we will fput it
- * when the child event exits. This is safe to do because
- * we are in the parent and we know that the filp still
- * exists and has a nonzero count:
- */
- atomic_long_inc(&parent_event->filp->f_count);
-
- /*
* Link this into the parent event's child list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
@@ -7060,7 +7109,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf2..d99cb4b 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -147,7 +147,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
return;
}
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned int nr;
nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -233,7 +233,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
if (cpu >= 0) {
toggle_bp_task_slot(bp, cpu, enable, type, weight);
} else {
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
toggle_bp_task_slot(bp, cpu, enable, type, weight);
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 303bed2..97dd317 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1049,6 +1049,22 @@ NORET_TYPE void do_exit(long code)
preempt_disable();
exit_rcu();
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index 06909a9..158ca4f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -48,6 +48,7 @@
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
@@ -1000,6 +1001,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
#ifdef CONFIG_CGROUPS
init_rwsem(&sig->threadgroup_fork_lock);
#endif
+#ifdef CONFIG_CPUSETS
+ seqcount_init(&tsk->mems_allowed_seq);
+#endif
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1394,6 +1398,8 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
+ if (unlikely(clone_flags & CLONE_NEWPID))
+ pid_ns_release_proc(p->nsproxy->pid_ns);
exit_task_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
diff --git a/kernel/futex.c b/kernel/futex.c
index b2d51a7..d38153d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -60,6 +60,7 @@
#include <linux/pid.h>
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
+#include <linux/hugetlb.h>
#include <asm/futex.h>
@@ -363,7 +364,7 @@ again:
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = page_head->index;
+ key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key);
@@ -716,7 +717,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, ownerdied = 0;
+ int lock_taken, ret, force_take = 0;
u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
@@ -755,17 +756,15 @@ retry:
newval = curval | FUTEX_WAITERS;
/*
- * There are two cases, where a futex might have no owner (the
- * owner TID is 0): OWNER_DIED. We take over the futex in this
- * case. We also do an unconditional take over, when the owner
- * of the futex died.
- *
- * This is safe as we are protected by the hash bucket lock !
+ * Should we force take the futex? See below.
*/
- if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
- /* Keep the OWNER_DIED bit */
+ if (unlikely(force_take)) {
+ /*
+ * Keep the OWNER_DIED and the WAITERS bit and set the
+ * new TID value.
+ */
newval = (curval & ~FUTEX_TID_MASK) | vpid;
- ownerdied = 0;
+ force_take = 0;
lock_taken = 1;
}
@@ -775,7 +774,7 @@ retry:
goto retry;
/*
- * We took the lock due to owner died take over.
+ * We took the lock due to forced take over.
*/
if (unlikely(lock_taken))
return 1;
@@ -790,20 +789,25 @@ retry:
switch (ret) {
case -ESRCH:
/*
- * No owner found for this futex. Check if the
- * OWNER_DIED bit is set to figure out whether
- * this is a robust futex or not.
+ * We failed to find an owner for this
+ * futex. So we have no pi_state to block
+ * on. This can happen in two cases:
+ *
+ * 1) The owner died
+ * 2) A stale FUTEX_WAITERS bit
+ *
+ * Re-read the futex value.
*/
if (get_futex_value_locked(&curval, uaddr))
return -EFAULT;
/*
- * We simply start over in case of a robust
- * futex. The code above will take the futex
- * and return happy.
+ * If the owner died or we have a stale
+ * WAITERS bit the owner TID in the user space
+ * futex is 0.
*/
- if (curval & FUTEX_OWNER_DIED) {
- ownerdied = 1;
+ if (!(curval & FUTEX_TID_MASK)) {
+ force_take = 1;
goto retry;
}
default:
@@ -840,6 +844,9 @@ static void wake_futex(struct futex_q *q)
{
struct task_struct *p = q->task;
+ if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
+ return;
+
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
* a non-futex wake up happens on another CPU then the task
@@ -1075,6 +1082,10 @@ retry_private:
plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key1)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
@@ -1087,6 +1098,10 @@ retry_private:
op_ret = 0;
plist_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key2)) {
+ if (this->pi_state || this->rt_waiter) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
wake_futex(this);
if (++op_ret >= nr_wake2)
break;
@@ -1095,6 +1110,7 @@ retry_private:
ret += op_ret;
}
+out_unlock:
double_unlock_hb(hb1, hb2);
out_put_keys:
put_futex_key(&key2);
@@ -1384,9 +1400,13 @@ retry_private:
/*
* FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
* be paired with each other and no other futex ops.
+ *
+ * We should never be requeueing a futex_q with a pi_state,
+ * which is awaiting a futex_unlock_pi().
*/
if ((requeue_pi && !this->rt_waiter) ||
- (!requeue_pi && this->rt_waiter)) {
+ (!requeue_pi && this->rt_waiter) ||
+ this->pi_state) {
ret = -EINVAL;
break;
}
@@ -2231,11 +2251,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
* via the following:
@@ -2272,6 +2292,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
if (!bitset)
return -EINVAL;
@@ -2343,7 +2366,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
* the pi_state.
*/
- WARN_ON(!&q.pi_state);
+ WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2393,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
- if (rt_mutex_owner(pi_mutex) == current)
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2043c08..80ec91d 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -61,6 +61,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -297,6 +298,10 @@ ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
} else {
unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+ /* Make sure nsec fits into long */
+ if (unlikely(nsec > KTIME_SEC_MAX))
+ return (ktime_t){ .tv64 = KTIME_MAX };
+
tmp = ktime_set((long)nsec, rem);
}
@@ -640,21 +645,17 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
* and expiry check is done in the hrtimer_interrupt or in the softirq.
*/
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
- if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
- if (wakeup) {
- raw_spin_unlock(&base->cpu_base->lock);
- raise_softirq_irqoff(HRTIMER_SOFTIRQ);
- raw_spin_lock(&base->cpu_base->lock);
- } else
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ return base->cpu_base->hres_active && hrtimer_reprogram(timer, base);
+}
- return 1;
- }
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+ ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+ ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
- return 0;
+ return ktime_get_update_offsets(offs_real, offs_boot);
}
/*
@@ -665,22 +666,12 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
static void retrigger_next_event(void *arg)
{
struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
- struct timespec realtime_offset, xtim, wtm, sleep;
if (!hrtimer_hres_active())
return;
- /* Optimized out for !HIGH_RES */
- get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- /* Adjust CLOCK_REALTIME offset */
raw_spin_lock(&base->lock);
- base->clock_base[HRTIMER_BASE_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
- base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
- timespec_to_ktime(sleep);
-
+ hrtimer_update_base(base);
hrtimer_force_reprogram(base, 0);
raw_spin_unlock(&base->lock);
}
@@ -710,13 +701,28 @@ static int hrtimer_switch_to_hres(void)
base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
-
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
local_irq_restore(flags);
return 1;
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
+/*
+ * Called from timekeeping and resume code to reprogramm the hrtimer
+ * interrupt device on all cpus.
+ */
+void clock_was_set_delayed(void)
+{
+ schedule_work(&hrtimer_work);
+}
+
#else
static inline int hrtimer_hres_active(void) { return 0; }
@@ -725,8 +731,7 @@ static inline int hrtimer_switch_to_hres(void) { return 0; }
static inline void
hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- int wakeup)
+ struct hrtimer_clock_base *base)
{
return 0;
}
@@ -764,8 +769,10 @@ void hrtimers_resume(void)
WARN_ONCE(!irqs_disabled(),
KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+ /* Retrigger on the local CPU */
retrigger_next_event(NULL);
- timerfd_clock_was_set();
+ /* And schedule a retrigger for all others */
+ clock_was_set_delayed();
}
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -985,8 +992,21 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
*
* XXX send_remote_softirq() ?
*/
- if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
- hrtimer_enqueue_reprogram(timer, new_base, wakeup);
+ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)
+ && hrtimer_enqueue_reprogram(timer, new_base)) {
+ if (wakeup) {
+ /*
+ * We need to drop cpu_base->lock to avoid a
+ * lock ordering issue vs. rq->lock.
+ */
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ local_irq_restore(flags);
+ return ret;
+ } else {
+ __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ }
+ }
unlock_hrtimer_base(timer, &flags);
@@ -1250,11 +1270,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
cpu_base->nr_events++;
dev->next_event.tv64 = KTIME_MAX;
- entry_time = now = ktime_get();
+ raw_spin_lock(&cpu_base->lock);
+ entry_time = now = hrtimer_update_base(cpu_base);
retry:
expires_next.tv64 = KTIME_MAX;
-
- raw_spin_lock(&cpu_base->lock);
/*
* We set expires_next to KTIME_MAX here with cpu_base->lock
* held to prevent that a timer is enqueued in our queue via
@@ -1298,6 +1317,8 @@ retry:
expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
+ if (expires.tv64 < 0)
+ expires.tv64 = KTIME_MAX;
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
break;
@@ -1330,8 +1351,12 @@ retry:
* We need to prevent that we loop forever in the hrtimer
* interrupt routine. We give it 3 attempts to avoid
* overreacting on some spurious event.
+ *
+ * Acquire base lock for updating the offsets and retrieving
+ * the current time.
*/
- now = ktime_get();
+ raw_spin_lock(&cpu_base->lock);
+ now = hrtimer_update_base(cpu_base);
cpu_base->nr_retries++;
if (++retries < 3)
goto retry;
@@ -1343,6 +1368,7 @@ retry:
*/
cpu_base->nr_hangs++;
cpu_base->hang_detected = 1;
+ raw_spin_unlock(&cpu_base->lock);
delta = ktime_sub(now, entry_time);
if (delta.tv64 > cpu_base->max_hang_time.tv64)
cpu_base->max_hang_time = delta;
@@ -1619,8 +1645,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c..10e0772 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -117,7 +117,7 @@ irqreturn_t
handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t retval = IRQ_NONE;
- unsigned int random = 0, irq = desc->irq_data.irq;
+ unsigned int flags = 0, irq = desc->irq_data.irq;
do {
irqreturn_t res;
@@ -145,7 +145,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
/* Fall through to add to randomness */
case IRQ_HANDLED:
- random |= action->flags;
+ flags |= action->flags;
break;
default:
@@ -156,8 +156,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
action = action->next;
} while (action);
- if (random & IRQF_SAMPLE_RANDOM)
- add_interrupt_randomness(irq);
+ add_interrupt_randomness(irq, flags);
if (!noirqdebug)
note_interrupt(irq, desc, retval);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index df8136f..2f61278 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -536,9 +536,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return 0;
if (irq_settings_can_request(desc)) {
- if (desc->action)
- if (irqflags & desc->action->flags & IRQF_SHARED)
- canrequest =1;
+ if (!desc->action ||
+ irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest = 1;
}
irq_put_desc_unlock(desc, flags);
return canrequest;
@@ -698,6 +698,7 @@ static void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
{
cpumask_var_t mask;
+ bool valid = true;
if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
return;
@@ -712,10 +713,18 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
}
raw_spin_lock_irq(&desc->lock);
- cpumask_copy(mask, desc->irq_data.affinity);
+ /*
+ * This code is triggered unconditionally. Check the affinity
+ * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
+ */
+ if (desc->irq_data.affinity)
+ cpumask_copy(mask, desc->irq_data.affinity);
+ else
+ valid = false;
raw_spin_unlock_irq(&desc->lock);
- set_cpus_allowed_ptr(current, mask);
+ if (valid)
+ set_cpus_allowed_ptr(current, mask);
free_cpumask_var(mask);
}
#else
@@ -886,22 +895,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
if (desc->irq_data.chip == &no_irq_chip)
return -ENOSYS;
- /*
- * Some drivers like serial.c use request_irq() heavily,
- * so we have to be careful not to interfere with a
- * running system.
- */
- if (new->flags & IRQF_SAMPLE_RANDOM) {
- /*
- * This function might sleep, we want to call it first,
- * outside of the atomic block.
- * Yes, this might clear the entropy pool if the wrong
- * driver is attempted to be loaded, without actually
- * installing a new handler, but is this really a problem,
- * only the sysadmin is able to do this.
- */
- rand_initialize_irq(irq);
- }
/*
* Check whether the interrupt nests into another interrupt
@@ -941,6 +934,16 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
get_task_struct(t);
new->thread = t;
+ /*
+ * Tell the thread to set its affinity. This is
+ * important for shared interrupt handlers as we do
+ * not invoke setup_affinity() for the secondary
+ * handlers as everything is already set up. Even for
+ * interrupts marked with IRQF_NO_BALANCE this is
+ * correct as we want the thread to move to the cpu(s)
+ * on which the requesting code placed the interrupt.
+ */
+ set_bit(IRQTF_AFFINITY, &new->thread_flags);
}
if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1325,7 +1328,6 @@ EXPORT_SYMBOL(free_irq);
* Flags:
*
* IRQF_SHARED Interrupt is shared
- * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
* IRQF_TRIGGER_* Specify active edge(s) or level
*
*/
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dc813a9..63633a3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -80,13 +80,11 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
/*
* All handlers must agree on IRQF_SHARED, so we test just the
- * first. Check for action->next as well.
+ * first.
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER) ||
- (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
- !action->next)
+ (action->flags & __IRQF_TIMER))
goto out;
/* Already running on another processor */
@@ -104,6 +102,7 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
do {
if (handle_irq_event(desc) == IRQ_HANDLED)
ret = IRQ_HANDLED;
+ /* Make sure that there is still a valid action */
action = desc->action;
} while ((desc->istate & IRQS_PENDING) && action);
desc->istate &= ~IRQS_POLL_INPROGRESS;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index fabfe54..f625b4f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -421,6 +421,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
int retval = 0;
helper_lock();
+ if (!sub_info->path) {
+ retval = -EINVAL;
+ goto out;
+ }
+
if (sub_info->path[0] == '\0')
goto out;
diff --git a/kernel/module.c b/kernel/module.c
index b9d0667..a8bd215 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2605,6 +2605,10 @@ static int check_module_license_and_versions(struct module *mod)
if (strcmp(mod->name, "driverloader") == 0)
add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs)
|| (mod->num_gpl_syms && !mod->gpl_crcs)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 640ded8..93d5e4a 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1450,8 +1450,10 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
while (!signal_pending(current)) {
if (timer.it.cpu.expires.sched == 0) {
/*
- * Our timer fired and was reset.
+ * Our timer fired and was reset, below
+ * deletion can not fail.
*/
+ posix_cpu_timer_del(&timer);
spin_unlock_irq(&timer.it_lock);
return 0;
}
@@ -1469,9 +1471,26 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* We were interrupted by a signal.
*/
sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
- posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ error = posix_cpu_timer_set(&timer, 0, &zero_it, it);
+ if (!error) {
+ /*
+ * Timer is now unarmed, deletion can not fail.
+ */
+ posix_cpu_timer_del(&timer);
+ }
spin_unlock_irq(&timer.it_lock);
+ while (error == TIMER_RETRY) {
+ /*
+ * We need to handle case when timer was or is in the
+ * middle of firing. In other cases we already freed
+ * resources.
+ */
+ spin_lock_irq(&timer.it_lock);
+ error = posix_cpu_timer_del(&timer);
+ spin_unlock_irq(&timer.it_lock);
+ }
+
if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
/*
* It actually did fire already.
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4556182..d2da8ad 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -639,6 +639,13 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
+ /*
+ * timer_t could be any type >= int and we want to make sure any
+ * @timer_id outside positive int range fails lookup.
+ */
+ if ((unsigned long long)timer_id > INT_MAX)
+ return NULL;
+
rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8884c27..32f1590 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -344,6 +344,7 @@ int hibernation_snapshot(int platform_mode)
goto Complete_devices;
suspend_console();
+ ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
if (error)
@@ -369,6 +370,7 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
+ ftrace_start();
resume_console();
Complete_devices:
@@ -471,6 +473,7 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
+ ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -478,6 +481,7 @@ int hibernation_restore(int platform_mode)
dpm_resume_end(PMSG_RECOVER);
}
pm_restore_gfp_mask();
+ ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -504,6 +508,7 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
+ ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -547,6 +552,7 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
+ ftrace_start();
resume_console();
Close:
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 61e6347..f5adb6e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/ftrace.h>
#include <trace/events/power.h>
#include "power.h"
@@ -213,6 +214,7 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
+ ftrace_stop();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -229,6 +231,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ ftrace_start();
resume_console();
Close:
if (suspend_ops->end)
diff --git a/kernel/printk.c b/kernel/printk.c
index 2414614..a1d702c 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -684,8 +684,19 @@ static void call_console_drivers(unsigned start, unsigned end)
start_print = start;
while (cur_index != end) {
if (msg_level < 0 && ((end - cur_index) > 2)) {
+ /*
+ * prepare buf_prefix, as a contiguous array,
+ * to be processed by log_prefix function
+ */
+ char buf_prefix[SYSLOG_PRI_MAX_LENGTH+1];
+ unsigned i;
+ for (i = 0; i < ((end - cur_index)) && (i < SYSLOG_PRI_MAX_LENGTH); i++) {
+ buf_prefix[i] = LOG_BUF(cur_index + i);
+ }
+ buf_prefix[i] = '\0'; /* force '\0' as last string character */
+
/* strip log prefix */
- cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
+ cur_index += log_prefix((const char *)&buf_prefix, &msg_level, NULL);
start_print = cur_index;
}
while (cur_index != end) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 2df1157..40581ee 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -38,6 +38,36 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
child->parent = new_parent;
}
+/* Ensure that nothing can wake it up, even SIGKILL */
+static bool ptrace_freeze_traced(struct task_struct *task)
+{
+ bool ret = false;
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ task->state = __TASK_TRACED;
+ ret = true;
+ }
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+
+static void ptrace_unfreeze_traced(struct task_struct *task)
+{
+ if (task->state != __TASK_TRACED)
+ return;
+
+ WARN_ON(!task->ptrace || task->parent != current);
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ spin_unlock_irq(&task->sighand->siglock);
+}
+
/**
* __ptrace_unlink - unlink ptracee and restore its execution state
* @child: ptracee to be unlinked
@@ -92,7 +122,7 @@ void __ptrace_unlink(struct task_struct *child)
* TASK_KILLABLE sleeps.
*/
if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
- signal_wake_up(child, task_is_traced(child));
+ ptrace_signal_wake_up(child, true);
spin_unlock(&child->sighand->siglock);
}
@@ -112,23 +142,29 @@ int ptrace_check_attach(struct task_struct *child, int kill)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
- if ((child->ptrace & PT_PTRACED) && child->parent == current) {
+ if (child->ptrace && child->parent == current) {
+ WARN_ON(child->state == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
- spin_lock_irq(&child->sighand->siglock);
- WARN_ON_ONCE(task_is_stopped(child));
- if (task_is_traced(child) || kill)
+ if (kill || ptrace_freeze_traced(child))
ret = 0;
- spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !kill)
- ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
+ if (!ret && !kill) {
+ if (!wait_task_inactive(child, __TASK_TRACED)) {
+ /*
+ * This can only happen if may_ptrace_stop() fails and
+ * ptrace_stop() changes ->state back to TASK_RUNNING,
+ * so we should not worry about leaking __TASK_TRACED.
+ */
+ WARN_ON(child->state == __TASK_TRACED);
+ ret = -ESRCH;
+ }
+ }
- /* All systems go.. */
return ret;
}
@@ -245,7 +281,7 @@ static int ptrace_attach(struct task_struct *task)
*/
if (task_is_stopped(task)) {
task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
- signal_wake_up(task, 1);
+ signal_wake_up_state(task, __TASK_STOPPED);
wait_trap = true;
}
@@ -777,6 +813,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
out_put_task_struct:
put_task_struct(child);
@@ -915,8 +953,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
}
ret = ptrace_check_attach(child, request == PTRACE_KILL);
- if (!ret)
+ if (!ret) {
ret = compat_arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
+ }
out_put_task_struct:
put_task_struct(child);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207..fe7a9b0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -283,7 +283,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
+ return *rdp->nxttail[RCU_DONE_TAIL +
+ ACCESS_ONCE(rsp->completed) != rdp->completed] &&
+ !rcu_gp_in_progress(rsp);
}
/*
diff --git a/kernel/relay.c b/kernel/relay.c
index 2c242fb..a5be9af 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.partial = partial,
.flags = flags,
.ops = &relay_pipe_buf_ops,
@@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
ret += padding;
out:
- splice_shrink_spd(pipe, &spd);
- return ret;
+ splice_shrink_spd(&spd);
+ return ret;
}
static ssize_t relay_file_splice_read(struct file *in,
diff --git a/kernel/resource.c b/kernel/resource.c
index 3ff4017..d005cd3 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old,
else
tmp.end = root->end;
+ if (tmp.end < tmp.start)
+ goto next;
+
resource_clip(&tmp, constraint->min, constraint->max);
arch_remove_reservations(&tmp);
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old,
return 0;
}
}
- if (!this)
+
+next: if (!this || this->end == root->end)
break;
+
if (this != old)
tmp.start = this->end + 1;
this = this->sibling;
@@ -731,6 +736,7 @@ static void __init __reserve_region_with_split(struct resource *root,
struct resource *parent = root;
struct resource *conflict;
struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *next_res = NULL;
if (!res)
return;
@@ -740,21 +746,46 @@ static void __init __reserve_region_with_split(struct resource *root,
res->end = end;
res->flags = IORESOURCE_BUSY;
- conflict = __request_resource(parent, res);
- if (!conflict)
- return;
+ while (1) {
- /* failed, split and try again */
- kfree(res);
+ conflict = __request_resource(parent, res);
+ if (!conflict) {
+ if (!next_res)
+ break;
+ res = next_res;
+ next_res = NULL;
+ continue;
+ }
- /* conflict covered whole area */
- if (conflict->start <= start && conflict->end >= end)
- return;
+ /* conflict covered whole area */
+ if (conflict->start <= res->start &&
+ conflict->end >= res->end) {
+ kfree(res);
+ WARN_ON(next_res);
+ break;
+ }
+
+ /* failed, split and try again */
+ if (conflict->start > res->start) {
+ end = res->end;
+ res->end = conflict->start - 1;
+ if (conflict->end < end) {
+ next_res = kzalloc(sizeof(*next_res),
+ GFP_ATOMIC);
+ if (!next_res) {
+ kfree(res);
+ break;
+ }
+ next_res->name = name;
+ next_res->start = conflict->end + 1;
+ next_res->end = end;
+ next_res->flags = IORESOURCE_BUSY;
+ }
+ } else {
+ res->start = conflict->end + 1;
+ }
+ }
- if (conflict->start > start)
- __reserve_region_with_split(root, start, conflict->start-1, name);
- if (conflict->end < end)
- __reserve_region_with_split(root, conflict->end+1, end, name);
}
void __init reserve_region_with_split(struct resource *root,
diff --git a/kernel/sched.c b/kernel/sched.c
index 62ca33e..efb62f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,22 +606,19 @@ static inline int cpu_of(struct rq *rq)
/*
* Return the group to which this tasks belongs.
*
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
+ * We cannot use task_subsys_state() and friends because the cgroup
+ * subsystem changes that value before the cgroup_subsys::attach() method
+ * is called, therefore we cannot pin it and might observe the wrong value.
+ *
+ * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
+ * core changes this before calling sched_move_task().
+ *
+ * Instead we use a 'copy' which is updated from sched_move_task() while
+ * holding both task_struct::pi_lock and rq::lock.
*/
static inline struct task_group *task_group(struct task_struct *p)
{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
+ return p->sched_task_group;
}
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -2207,7 +2204,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* sched_move_task() holds both and thus holding either pins the cgroup,
- * see set_task_rq().
+ * see task_group().
*
* Furthermore, all task_rq users should acquire both locks, see
* task_rq_lock().
@@ -2777,8 +2774,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -2812,7 +2811,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
@@ -7251,11 +7251,8 @@ int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
- unsigned long val;
-
- val = simple_strtoul(str, NULL, 0);
- if (val < sched_domain_level_max)
- default_relax_domain_level = val;
+ if (kstrtoint(str, 0, &default_relax_domain_level))
+ pr_warn("Unable to set relax_domain_level\n");
return 1;
}
@@ -7448,7 +7445,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
if (!sd)
return child;
- set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
@@ -7456,6 +7452,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
child->parent = sd;
}
sd->child = child;
+ set_domain_attribute(sd, attr);
return sd;
}
@@ -7814,34 +7811,66 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
+
/*
* Update cpusets according to cpu_active mask. If cpusets are
* disabled, cpuset_update_active_cpus() becomes a simple wrapper
* around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
*/
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
+ case CPU_ONLINE_FROZEN:
+ case CPU_DOWN_FAILED_FROZEN:
+
+ /*
+ * num_cpus_frozen tracks how many CPUs are involved in suspend
+ * resume sequence. As long as this is not the last online
+ * operation in the resume sequence, just build a single sched
+ * domain, ignoring cpusets.
+ */
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
+ break;
+ }
+
+ /*
+ * This is the last CPU online operation. So fall through and
+ * restore the original sched domains by considering the
+ * cpuset configurations.
+ */
+
case CPU_ONLINE:
case CPU_DOWN_FAILED:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_DOWN_PREPARE:
cpuset_update_active_cpus();
- return NOTIFY_OK;
+ break;
+ case CPU_DOWN_PREPARE_FROZEN:
+ num_cpus_frozen++;
+ partition_sched_domains(1, NULL, NULL);
+ break;
default:
return NOTIFY_DONE;
}
+ return NOTIFY_OK;
}
static int update_runtime(struct notifier_block *nfb,
@@ -8590,6 +8619,7 @@ void sched_destroy_group(struct task_group *tg)
*/
void sched_move_task(struct task_struct *tsk)
{
+ struct task_group *tg;
int on_rq, running;
unsigned long flags;
struct rq *rq;
@@ -8604,6 +8634,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
+ tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
+ lockdep_is_held(&tsk->sighand->siglock)),
+ struct task_group, css);
+ tg = autogroup_task_group(tsk, tg);
+ tsk->sched_task_group = tg;
+
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk, on_rq);
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 429242f..f280df1 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -160,15 +160,11 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
- if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
- goto out;
-
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
-out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 0557705..7b859ff 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,11 +1,6 @@
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
- /*
- * reference doesn't mean how many thread attach to this
- * autogroup now. It just stands for the number of task
- * could use this autogroup.
- */
struct kref kref;
struct task_group *tg;
struct rw_semaphore lock;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9d8af0b..1eeaf74 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c768588..fae7d67 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -4277,7 +4277,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 43fee1c..f15021b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -437,6 +437,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
+#ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
@@ -631,23 +634,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
-void signal_wake_up(struct task_struct *t, int resume)
+void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
- unsigned int mask;
-
set_tsk_thread_flag(t, TIF_SIGPENDING);
-
/*
- * For SIGKILL, we want to wake it up in the stopped/traced/killable
+ * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
- mask = TASK_INTERRUPTIBLE;
- if (resume)
- mask |= TASK_WAKEKILL;
- if (!wake_up_state(t, mask))
+ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
@@ -1675,6 +1672,10 @@ static inline int may_ptrace_stop(void)
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
+ *
+ * This is almost outdated, a task with the pending SIGKILL can't
+ * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
+ * after SIGKILL was already dequeued.
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
@@ -1806,6 +1807,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
+ /* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -2662,7 +2664,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
diff --git a/kernel/smp.c b/kernel/smp.c
index fb67dfa..38d9e03 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -31,6 +31,7 @@ struct call_function_data {
struct call_single_data csd;
atomic_t refs;
cpumask_var_t cpumask;
+ cpumask_var_t cpumask_ipi;
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
@@ -54,6 +55,9 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return notifier_from_errno(-ENOMEM);
+ if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return notifier_from_errno(-ENOMEM);
break;
#ifdef CONFIG_HOTPLUG_CPU
@@ -63,6 +67,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
+ free_cpumask_var(cfd->cpumask_ipi);
break;
#endif
};
@@ -524,6 +529,12 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
+ /*
+ * After we put an entry into the list, data->cpumask
+ * may be cleared again when another CPU sends another IPI for
+ * a SMP function call, so data->cpumask will be zero.
+ */
+ cpumask_copy(data->cpumask_ipi, data->cpumask);
raw_spin_lock_irqsave(&call_function.lock, flags);
/*
* Place entry at the _HEAD_ of the list, so that any cpu still
@@ -547,7 +558,7 @@ void smp_call_function_many(const struct cpumask *mask,
smp_mb();
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(data->cpumask);
+ arch_send_call_function_ipi_mask(data->cpumask_ipi);
/* Optionally wait for the CPUs to complete */
if (wait)
diff --git a/kernel/sys.c b/kernel/sys.c
index f88dadc..1c69aa7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -320,7 +320,6 @@ void kernel_restart_prepare(char *cmd)
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
- syscore_shutdown();
}
/**
@@ -334,6 +333,8 @@ void kernel_restart_prepare(char *cmd)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
+ disable_nonboot_cpus();
+ syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -359,6 +360,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
+ disable_nonboot_cpus();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
@@ -1132,15 +1134,16 @@ DECLARE_RWSEM(uts_sem);
* Work around broken programs that cannot handle "Linux 3.0".
* Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
*/
-static int override_release(char __user *release, int len)
+static int override_release(char __user *release, size_t len)
{
int ret = 0;
- char buf[65];
if (current->personality & UNAME26) {
- char *rest = UTS_RELEASE;
+ const char *rest = UTS_RELEASE;
+ char buf[65] = { 0 };
int ndots = 0;
unsigned v;
+ size_t copy;
while (*rest) {
if (*rest == '.' && ++ndots >= 3)
@@ -1150,8 +1153,9 @@ static int override_release(char __user *release, int len)
rest++;
}
v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
- snprintf(buf, len, "2.6.%u%s", v, rest);
- ret = copy_to_user(release, buf, len);
+ copy = clamp_t(size_t, len, 1, sizeof(buf));
+ copy = scnprintf(buf, copy, "2.6.%u%s", v, rest);
+ ret = copy_to_user(release, buf, copy + 1);
}
return ret;
}
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e055e8b..17c20c7 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1194,9 +1194,10 @@ static ssize_t bin_dn_node_address(struct file *file,
/* Convert the decnet address to binary */
result = -EIO;
- nodep = strchr(buf, '.') + 1;
+ nodep = strchr(buf, '.');
if (!nodep)
goto out;
+ ++nodep;
area = simple_strtoul(buf, NULL, 10);
node = simple_strtoul(nodep, NULL, 10);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4b85a7a..61fc450 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -31,8 +31,6 @@ unsigned long tick_nsec;
u64 tick_length;
static u64 tick_length_base;
-static struct hrtimer leap_timer;
-
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -350,60 +348,64 @@ void ntp_clear(void)
}
/*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second.
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
*/
-static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+int second_overflow(unsigned long secs)
{
- enum hrtimer_restart res = HRTIMER_NORESTART;
-
- write_seqlock(&xtime_lock);
+ int leap = 0;
+ s64 delta;
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
switch (time_state) {
case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
break;
case TIME_INS:
- timekeeping_leap_insert(-1);
- time_state = TIME_OOP;
- printk(KERN_NOTICE
- "Clock: inserting leap second 23:59:60 UTC\n");
- hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
- res = HRTIMER_RESTART;
+ if (!(time_status & STA_INS))
+ time_state = TIME_OK;
+ else if (secs % 86400 == 0) {
+ leap = -1;
+ time_state = TIME_OOP;
+ time_tai++;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ }
break;
case TIME_DEL:
- timekeeping_leap_insert(1);
- time_tai--;
- time_state = TIME_WAIT;
- printk(KERN_NOTICE
- "Clock: deleting leap second 23:59:59 UTC\n");
+ if (!(time_status & STA_DEL))
+ time_state = TIME_OK;
+ else if ((secs + 1) % 86400 == 0) {
+ leap = 1;
+ time_tai--;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ }
break;
case TIME_OOP:
- time_tai++;
time_state = TIME_WAIT;
- /* fall through */
+ break;
+
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
break;
}
- write_sequnlock(&xtime_lock);
-
- return res;
-}
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- */
-void second_overflow(void)
-{
- s64 delta;
/* Bump the maxerror field */
time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,23 +425,25 @@ void second_overflow(void)
pps_dec_valid();
if (!time_adjust)
- return;
+ goto out;
if (time_adjust > MAX_TICKADJ) {
time_adjust -= MAX_TICKADJ;
tick_length += MAX_TICKADJ_SCALED;
- return;
+ goto out;
}
if (time_adjust < -MAX_TICKADJ) {
time_adjust += MAX_TICKADJ;
tick_length -= MAX_TICKADJ_SCALED;
- return;
+ goto out;
}
tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
<< NTP_SCALE_SHIFT;
time_adjust = 0;
+out:
+ return leap;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
@@ -501,27 +505,6 @@ static void notify_cmos_timer(void)
static inline void notify_cmos_timer(void) { }
#endif
-/*
- * Start the leap seconds timer:
- */
-static inline void ntp_start_leap_timer(struct timespec *ts)
-{
- long now = ts->tv_sec;
-
- if (time_status & STA_INS) {
- time_state = TIME_INS;
- now += 86400 - now % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-
- return;
- }
-
- if (time_status & STA_DEL) {
- time_state = TIME_DEL;
- now += 86400 - (now + 1) % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
- }
-}
/*
* Propagate a new txc->status value into the NTP state:
@@ -546,22 +529,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
time_status &= STA_RONLY;
time_status |= txc->status & ~STA_RONLY;
- switch (time_state) {
- case TIME_OK:
- ntp_start_leap_timer(ts);
- break;
- case TIME_INS:
- case TIME_DEL:
- time_state = TIME_OK;
- ntp_start_leap_timer(ts);
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- break;
- case TIME_OOP:
- hrtimer_restart(&leap_timer);
- break;
- }
}
/*
* Called with the xtime lock held, so we can access and modify
@@ -643,9 +610,6 @@ int do_adjtimex(struct timex *txc)
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
return -EINVAL;
-
- if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
- hrtimer_cancel(&leap_timer);
}
if (txc->modes & ADJ_SETOFFSET) {
@@ -967,6 +931,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
ntp_clear();
- hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- leap_timer.function = ntp_leap_second;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7a90d02..20ba7b4 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -66,12 +66,17 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
*/
int tick_check_broadcast_device(struct clock_event_device *dev)
{
- if ((tick_broadcast_device.evtdev &&
+ struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
+ if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (tick_broadcast_device.evtdev &&
tick_broadcast_device.evtdev->rating >= dev->rating) ||
(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+ if (cur)
+ cur->event_handler = clockevents_handle_noop;
tick_broadcast_device.evtdev = dev;
if (!cpumask_empty(tick_get_broadcast_mask()))
tick_broadcast_start_periodic(dev);
@@ -391,7 +396,15 @@ void tick_check_oneshot_broadcast(int cpu)
if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+ /*
+ * We might be in the middle of switching over from
+ * periodic to oneshot. If the CPU has not yet
+ * switched over, leave the device alone.
+ */
+ if (td->mode == TICKDEV_MODE_ONESHOT) {
+ clockevents_set_mode(td->evtdev,
+ CLOCK_EVT_MODE_ONESHOT);
+ }
}
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 119528d..c43b479 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -323,6 +323,7 @@ static void tick_shutdown(unsigned int *cpup)
*/
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
+ dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
raw_spin_unlock_irqrestore(&tick_device_lock, flags);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c473ce2..c0be5f2 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -811,7 +811,7 @@ void tick_cancel_sched_timer(int cpu)
hrtimer_cancel(&ts->sched_timer);
# endif
- ts->nohz_mode = NOHZ_MODE_INACTIVE;
+ memset(ts, 0, sizeof(*ts));
}
#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9b28d04..5928f95 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -161,23 +161,43 @@ static struct timespec xtime __attribute__ ((aligned (16)));
static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
static struct timespec total_sleep_time;
+/* Offset clock monotonic -> clock realtime */
+static ktime_t offs_real;
+
+/* Offset clock monotonic -> clock boottime */
+static ktime_t offs_boot;
+
/*
* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
*/
static struct timespec raw_time;
-/* flag for if timekeeping is suspended */
-int __read_mostly timekeeping_suspended;
+/* must hold write on xtime_lock */
+static void update_rt_offset(void)
+{
+ struct timespec tmp, *wtm = &wall_to_monotonic;
+
+ set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
+ offs_real = timespec_to_ktime(tmp);
+}
-/* must hold xtime_lock */
-void timekeeping_leap_insert(int leapsecond)
+/* must hold write on xtime_lock */
+static void timekeeping_update(bool clearntp)
{
- xtime.tv_sec += leapsecond;
- wall_to_monotonic.tv_sec -= leapsecond;
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ if (clearntp) {
+ timekeeper.ntp_error = 0;
+ ntp_clear();
+ }
+ update_rt_offset();
+ update_vsyscall(&xtime, &wall_to_monotonic,
+ timekeeper.clock, timekeeper.mult);
}
+
+
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
/**
* timekeeping_forward_now - update clock to the current time
*
@@ -362,7 +382,7 @@ int do_settimeofday(const struct timespec *tv)
struct timespec ts_delta;
unsigned long flags;
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_valid_strict(tv))
return -EINVAL;
write_seqlock_irqsave(&xtime_lock, flags);
@@ -375,11 +395,7 @@ int do_settimeofday(const struct timespec *tv)
xtime = *tv;
- timekeeper.ntp_error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(true);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -401,6 +417,8 @@ EXPORT_SYMBOL(do_settimeofday);
int timekeeping_inject_offset(struct timespec *ts)
{
unsigned long flags;
+ struct timespec tmp;
+ int ret = 0;
if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
@@ -409,21 +427,24 @@ int timekeeping_inject_offset(struct timespec *ts)
timekeeping_forward_now();
+ tmp = timespec_add(xtime, *ts);
+ if (!timespec_valid_strict(&tmp)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
xtime = timespec_add(xtime, *ts);
wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
- timekeeper.ntp_error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+error: /* even if we error out, we forwarded the time, so call update */
+ timekeeping_update(true);
write_sequnlock_irqrestore(&xtime_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(timekeeping_inject_offset);
@@ -570,7 +591,20 @@ void __init timekeeping_init(void)
struct timespec now, boot;
read_persistent_clock(&now);
+ if (!timespec_valid_strict(&now)) {
+ pr_warn("WARNING: Persistent clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ now.tv_sec = 0;
+ now.tv_nsec = 0;
+ }
+
read_boot_clock(&boot);
+ if (!timespec_valid_strict(&boot)) {
+ pr_warn("WARNING: Boot clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ boot.tv_sec = 0;
+ boot.tv_nsec = 0;
+ }
write_seqlock_irqsave(&xtime_lock, flags);
@@ -591,6 +625,7 @@ void __init timekeeping_init(void)
}
set_normalized_timespec(&wall_to_monotonic,
-boot.tv_sec, -boot.tv_nsec);
+ update_rt_offset();
total_sleep_time.tv_sec = 0;
total_sleep_time.tv_nsec = 0;
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -599,6 +634,12 @@ void __init timekeeping_init(void)
/* time in seconds when suspend began */
static struct timespec timekeeping_suspend_time;
+static void update_sleep_time(struct timespec t)
+{
+ total_sleep_time = t;
+ offs_boot = timespec_to_ktime(t);
+}
+
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @delta: pointer to a timespec delta value
@@ -608,7 +649,7 @@ static struct timespec timekeeping_suspend_time;
*/
static void __timekeeping_inject_sleeptime(struct timespec *delta)
{
- if (!timespec_valid(delta)) {
+ if (!timespec_valid_strict(delta)) {
printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
"sleep delta value!\n");
return;
@@ -616,7 +657,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
xtime = timespec_add(xtime, *delta);
wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
- total_sleep_time = timespec_add(total_sleep_time, *delta);
+ update_sleep_time(timespec_add(total_sleep_time, *delta));
}
@@ -645,10 +686,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
__timekeeping_inject_sleeptime(delta);
- timekeeper.ntp_error = 0;
- ntp_clear();
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(true);
write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -683,6 +721,7 @@ static void timekeeping_resume(void)
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
timekeeper.ntp_error = 0;
timekeeping_suspended = 0;
+ timekeeping_update(false);
write_sequnlock_irqrestore(&xtime_lock, flags);
touch_softlockup_watchdog();
@@ -834,9 +873,14 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
while (timekeeper.xtime_nsec >= nsecps) {
+ int leap;
timekeeper.xtime_nsec -= nsecps;
xtime.tv_sec++;
- second_overflow();
+ leap = second_overflow(xtime.tv_sec);
+ xtime.tv_sec += leap;
+ wall_to_monotonic.tv_sec -= leap;
+ if (leap)
+ clock_was_set_delayed();
}
/* Accumulate raw time */
@@ -881,6 +925,10 @@ static void update_wall_time(void)
#else
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
+ /* Check if there's really nothing to do */
+ if (offset < timekeeper.cycle_interval)
+ return;
+
timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
/*
@@ -942,14 +990,17 @@ static void update_wall_time(void)
* xtime.tv_nsec isn't larger then NSEC_PER_SEC
*/
if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
+ int leap;
xtime.tv_nsec -= NSEC_PER_SEC;
xtime.tv_sec++;
- second_overflow();
+ leap = second_overflow(xtime.tv_sec);
+ xtime.tv_sec += leap;
+ wall_to_monotonic.tv_sec -= leap;
+ if (leap)
+ clock_was_set_delayed();
}
- /* check to see if there is a new clocksource to use */
- update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
- timekeeper.mult);
+ timekeeping_update(false);
}
/**
@@ -1002,7 +1053,7 @@ void get_monotonic_boottime(struct timespec *ts)
} while (read_seqretry(&xtime_lock, seq));
set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
- ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+ (s64)ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
}
EXPORT_SYMBOL_GPL(get_monotonic_boottime);
@@ -1108,6 +1159,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
} while (read_seqretry(&xtime_lock, seq));
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+/**
+ * ktime_get_update_offsets - hrtimer helper
+ * @real: pointer to storage for monotonic -> realtime offset
+ * @_boot: pointer to storage for monotonic -> boottime offset
+ *
+ * Returns current monotonic time and updates the offsets
+ * Called from hrtimer_interupt() or retrigger_next_event()
+ */
+ktime_t ktime_get_update_offsets(ktime_t *real, ktime_t *boot)
+{
+ ktime_t now;
+ unsigned int seq;
+ u64 secs, nsecs;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ secs = xtime.tv_sec;
+ nsecs = xtime.tv_nsec;
+ nsecs += timekeeping_get_ns();
+ /* If arch requires, add in gettimeoffset() */
+ nsecs += arch_gettimeoffset();
+
+ *real = offs_real;
+ *boot = offs_boot;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ now = ktime_add_ns(ktime_set(secs, 0), nsecs);
+ now = ktime_sub(now, *real);
+ return now;
+}
+#endif
+
/**
* ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
*/
diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl
index eb51d76..3f42652 100644
--- a/kernel/timeconst.pl
+++ b/kernel/timeconst.pl
@@ -369,10 +369,8 @@ if ($hz eq '--can') {
die "Usage: $0 HZ\n";
}
- @val = @{$canned_values{$hz}};
- if (!defined(@val)) {
- @val = compute_values($hz);
- }
+ $cv = $canned_values{$hz};
+ @val = defined($cv) ? @$cv : compute_values($hz);
output($hz, @val);
}
exit 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 8cff361..5eac0d8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -63,6 +63,7 @@ EXPORT_SYMBOL(jiffies_64);
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
+#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
struct tvec {
struct list_head vec[TVN_SIZE];
@@ -144,9 +145,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
/* now that we have rounded, subtract the extra skew again */
j -= cpu * 3;
- if (j <= jiffies) /* rounding ate our timeout entirely; */
- return original;
- return j;
+ /*
+ * Make sure j is still in the future. Otherwise return the
+ * unmodified value.
+ */
+ return time_is_after_jiffies(j) ? j : original;
}
/**
@@ -356,11 +359,12 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
} else {
int i;
- /* If the timeout is larger than 0xffffffff on 64-bit
- * architectures then we use the maximum timeout:
+ /* If the timeout is larger than MAX_TVAL (on 64-bit
+ * architectures or with CONFIG_BASE_SMALL=1) then we
+ * use the maximum timeout.
*/
- if (idx > 0xffffffffUL) {
- idx = 0xffffffffUL;
+ if (idx > MAX_TVAL) {
+ idx = MAX_TVAL;
expires = idx + base->timer_jiffies;
}
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
@@ -1628,12 +1632,12 @@ static int __cpuinit init_timers_cpu(int cpu)
boot_done = 1;
base = &boot_tvec_bases;
}
+ spin_lock_init(&base->lock);
tvec_base_done[cpu] = 1;
} else {
base = per_cpu(tvec_bases, cpu);
}
- spin_lock_init(&base->lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e5..57c92f2 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -386,24 +386,28 @@ config KPROBE_EVENT
If you want to use perf tools, this option is strongly recommended.
config DYNAMIC_FTRACE
- bool "enable/disable ftrace tracepoints dynamically"
+ bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
help
- This option will modify all the calls to ftrace dynamically
- (will patch them out of the binary image and replace them
- with a No-Op instruction) as they are called. A table is
- created to dynamically enable them again.
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
otherwise has native performance as long as no tracing is active.
- The changes to the code are done by a kernel thread that
- wakes up once a second and checks to see if any ftrace calls
- were made. If so, it runs stop_machine (stops all CPUS)
- and modifies the code to jump over the call to ftrace.
-
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9f8e2e1..0d704b0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -548,7 +548,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
- for (i = 0; i < pages; i++) {
+ for (i = 1; i < pages; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
if (!pg->next)
goto out_free;
@@ -566,7 +566,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
free_page(tmp);
}
- free_page((unsigned long)stat->pages);
stat->pages = NULL;
stat->start = NULL;
@@ -934,6 +933,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+static loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_lseek(file, offset, whence);
+ else
+ file->f_pos = ret = 1;
+
+ return ret;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -2058,7 +2070,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
{
iter->pos = 0;
iter->func_pos = 0;
- iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH);
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
}
static void *t_start(struct seq_file *m, loff_t *pos)
@@ -2300,19 +2312,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-static loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
-{
- loff_t ret;
-
- if (file->f_mode & FMODE_READ)
- ret = seq_lseek(file, offset, origin);
- else
- file->f_pos = ret = 1;
-
- return ret;
-}
-
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
@@ -2709,8 +2708,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
}
- hlist_del(&entry->node);
- call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+ hlist_del_rcu(&entry->node);
+ call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
}
}
__disable_ftrace_function_probe();
@@ -3119,7 +3118,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3127,7 +3126,7 @@ static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
.read = seq_read,
.write = ftrace_notrace_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3288,7 +3287,8 @@ out:
if (fail)
return -EINVAL;
- ftrace_graph_filter_enabled = 1;
+ ftrace_graph_filter_enabled = !!(*idx);
+
return 0;
}
@@ -3335,8 +3335,8 @@ static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_graph_release,
- .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -3432,35 +3432,49 @@ static void ftrace_init_module(struct module *mod,
ftrace_process_locs(mod, start, end);
}
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify_enter(struct notifier_block *self,
+ unsigned long val, void *data)
{
struct module *mod = data;
- switch (val) {
- case MODULE_STATE_COMING:
+ if (val == MODULE_STATE_COMING)
ftrace_init_module(mod, mod->ftrace_callsites,
mod->ftrace_callsites +
mod->num_ftrace_callsites);
- break;
- case MODULE_STATE_GOING:
+ return 0;
+}
+
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ if (val == MODULE_STATE_GOING)
ftrace_release_mod(mod);
- break;
- }
return 0;
}
#else
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static int ftrace_module_notify_enter(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ return 0;
+}
+static int ftrace_module_notify_exit(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_nb = {
- .notifier_call = ftrace_module_notify,
- .priority = 0,
+struct notifier_block ftrace_module_enter_nb = {
+ .notifier_call = ftrace_module_notify_enter,
+ .priority = INT_MAX, /* Run before anything that can use kprobes */
+};
+
+struct notifier_block ftrace_module_exit_nb = {
+ .notifier_call = ftrace_module_notify_exit,
+ .priority = INT_MIN, /* Run after anything that can remove kprobes */
};
extern unsigned long __start_mcount_loc[];
@@ -3494,9 +3508,13 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_nb);
+ ret = register_module_notifier(&ftrace_module_enter_nb);
+ if (ret)
+ pr_warning("Failed to register trace ftrace module enter notifier\n");
+
+ ret = register_module_notifier(&ftrace_module_exit_nb);
if (ret)
- pr_warning("Failed to register trace ftrace module notifier\n");
+ pr_warning("Failed to register trace ftrace module exit notifier\n");
set_ftrace_early_filters();
@@ -3804,7 +3822,7 @@ static const struct file_operations ftrace_pid_fops = {
.open = ftrace_pid_open,
.write = ftrace_pid_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_pid_release,
};
@@ -3916,12 +3934,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end) {
- if (ftrace_ops_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_ops_list->func;
- else
- ftrace_trace_function = ftrace_ops_list_func;
- }
+ if (ftrace_ops_list != &ftrace_list_end)
+ update_ftrace_function();
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b0c7aa4..20dff64 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2926,6 +2926,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
* Splice the empty reader page into the list around the head.
*/
reader = rb_set_head_page(cpu_buffer);
+ if (!reader)
+ goto out;
cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
cpu_buffer->reader_page->list.prev = reader->list.prev;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0731e81a..34d15ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -628,7 +628,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
- max_data->uid = task_uid(tsk);
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
max_data->policy = tsk->policy;
max_data->rt_priority = tsk->rt_priority;
@@ -649,7 +657,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct ring_buffer *buf = tr->buffer;
+ struct ring_buffer *buf;
if (trace_stop_count)
return;
@@ -661,6 +669,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
arch_spin_lock(&ftrace_max_lock);
+ buf = tr->buffer;
tr->buffer = max_tr.buffer;
max_tr.buffer = buf;
@@ -2432,10 +2441,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (cpumask_test_cpu(cpu, tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_inc(&global_trace.data[cpu]->disabled);
+ ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
}
if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
atomic_dec(&global_trace.data[cpu]->disabled);
+ ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
}
}
arch_spin_unlock(&ftrace_max_lock);
@@ -2524,11 +2535,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
return -EINVAL;
}
-static void set_tracer_flags(unsigned int mask, int enabled)
+/* Some tracers require overwrite to stay enabled */
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+{
+ if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ return -1;
+
+ return 0;
+}
+
+int set_tracer_flag(unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
if (!!(trace_flags & mask) == !!enabled)
- return;
+ return 0;
+
+ /* Give the tracer a chance to approve the change */
+ if (current_trace->flag_changed)
+ if (current_trace->flag_changed(current_trace, mask, !!enabled))
+ return -EINVAL;
if (enabled)
trace_flags |= mask;
@@ -2540,6 +2565,8 @@ static void set_tracer_flags(unsigned int mask, int enabled)
if (mask == TRACE_ITER_OVERWRITE)
ring_buffer_change_overwrite(global_trace.buffer, enabled);
+
+ return 0;
}
static ssize_t
@@ -2549,7 +2576,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
char buf[64];
char *cmp;
int neg = 0;
- int ret;
+ int ret = -ENODEV;
int i;
if (cnt >= sizeof(buf))
@@ -2566,21 +2593,23 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
cmp += 2;
}
+ mutex_lock(&trace_types_lock);
+
for (i = 0; trace_options[i]; i++) {
if (strcmp(cmp, trace_options[i]) == 0) {
- set_tracer_flags(1 << i, !neg);
+ ret = set_tracer_flag(1 << i, !neg);
break;
}
}
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i]) {
- mutex_lock(&trace_types_lock);
+ if (!trace_options[i])
ret = set_tracer_option(current_trace, cmp, neg);
- mutex_unlock(&trace_types_lock);
- if (ret)
- return ret;
- }
+
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -2878,6 +2907,9 @@ static int tracing_set_tracer(const char *buf)
goto out;
trace_branch_disable();
+
+ current_trace->enabled = false;
+
if (current_trace && current_trace->reset)
current_trace->reset(tr);
if (current_trace && current_trace->use_max_tr) {
@@ -2907,6 +2939,7 @@ static int tracing_set_tracer(const char *buf)
goto out;
}
+ current_trace->enabled = true;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
@@ -3212,6 +3245,7 @@ waitagain:
memset(&iter->seq, 0,
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
+ cpumask_clear(iter->started);
iter->pos = -1;
trace_event_read_lock();
@@ -3330,6 +3364,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.pages = pages_def,
.partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
@@ -3401,7 +3436,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
ret = splice_to_pipe(pipe, &spd);
out:
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
return ret;
out_err:
@@ -3814,6 +3849,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
struct splice_pipe_desc spd = {
.pages = pages_def,
.partial = partial_def,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
@@ -3902,7 +3938,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ret = splice_to_pipe(pipe, &spd);
- splice_shrink_spd(pipe, &spd);
+ splice_shrink_spd(&spd);
out:
return ret;
}
@@ -4177,7 +4213,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
- set_tracer_flags(1 << index, val);
+
+ mutex_lock(&trace_types_lock);
+ ret = set_tracer_flag(1 << index, val);
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -4329,6 +4371,8 @@ static __init int tracer_init_debugfs(void)
trace_access_lock_init();
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("tracing_enabled", 0644, d_tracer,
&global_trace, &tracing_ctrl_fops);
@@ -4456,30 +4500,32 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->cpu_file = TRACE_PIPE_ALL_CPU;
}
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- static arch_spinlock_t ftrace_dump_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static atomic_t dump_running;
unsigned int old_userobj;
- static int dump_ran;
unsigned long flags;
int cnt = 0, cpu;
- /* only one dump */
- local_irq_save(flags);
- arch_spin_lock(&ftrace_dump_lock);
- if (dump_ran)
- goto out;
-
- dump_ran = 1;
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+ /*
+ * Always turn off tracing when we dump.
+ * We don't need to show trace output of what happens
+ * between multiple crashes.
+ *
+ * If the user does a sysrq-z, then they can re-enable
+ * tracing with echo 1 > tracing_on.
+ */
tracing_off();
- if (disable_tracing)
- ftrace_kill();
+ local_irq_save(flags);
trace_init_global_iter(&iter);
@@ -4550,26 +4596,15 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- /* Re-enable tracing if requested */
- if (!disable_tracing) {
- trace_flags |= old_userobj;
+ trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
- }
- tracing_on();
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
}
-
- out:
- arch_spin_unlock(&ftrace_dump_lock);
+ atomic_dec(&dump_running);
local_irq_restore(flags);
}
-
-/* By default: disable tracing after the dump */
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
-{
- __ftrace_dump(true, oops_dump_mode);
-}
+EXPORT_SYMBOL_GPL(ftrace_dump);
__init static int tracer_alloc_buffers(void)
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f807407..123ee28 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -271,10 +271,14 @@ struct tracer {
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
+ /* Return 0 if OK with change, else return non-zero */
+ int (*flag_changed)(struct tracer *tracer,
+ u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
int print_max;
int use_max_tr;
+ bool enabled;
};
@@ -776,6 +780,9 @@ extern struct list_head ftrace_events;
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+int set_tracer_flag(unsigned int mask, int enabled);
+
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
extern struct ftrace_event_call \
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c77424b..984aad8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,7 @@ enum {
static int trace_type __read_mostly;
-static int save_lat_flag;
+static int save_flags;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -544,8 +544,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
static void __irqsoff_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
irqsoff_trace = tr;
@@ -559,10 +562,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
static void irqsoff_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_irqsoff_tracer(tr, is_graph());
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -595,6 +601,7 @@ static struct tracer irqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
@@ -628,6 +635,7 @@ static struct tracer preemptoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
@@ -663,6 +671,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f029dd4..1beb25e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr);
static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
static void wakeup_graph_return(struct ftrace_graph_ret *trace);
-static int save_lat_flag;
+static int save_flags;
#define TRACE_DISPLAY_GRAPH 1
@@ -526,8 +526,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
static int __wakeup_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
wakeup_trace = tr;
@@ -549,12 +552,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
static void wakeup_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -580,6 +586,7 @@ static struct tracer wakeup_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
@@ -601,6 +608,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 288541f..09fd98a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -461,8 +461,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -472,8 +470,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops)
- __ftrace_dump(false, DUMP_ALL);
+ if (ftrace_dump_on_oops) {
+ ftrace_dump(DUMP_ALL);
+ /* ftrace_dump() disables tracing */
+ tracing_on();
+ }
return 0;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b0b53b8..2d43977 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -17,13 +17,24 @@
#define STACK_TRACE_ENTRIES 500
+#ifdef CC_USING_FENTRY
+# define fentry 1
+#else
+# define fentry 0
+#endif
+
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+/*
+ * Reserve one entry for the passed in ip. This will allow
+ * us to remove most or all of the stack size overhead
+ * added by the stack tracer itself.
+ */
static struct stack_trace max_stack_trace = {
- .max_entries = STACK_TRACE_ENTRIES,
- .entries = stack_dump_trace,
+ .max_entries = STACK_TRACE_ENTRIES - 1,
+ .entries = &stack_dump_trace[1],
};
static unsigned long max_stack_size;
@@ -37,25 +48,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
-static inline void check_stack(void)
+static inline void
+check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags;
unsigned long *p, *top, *start;
+ static int tracer_frame;
+ int frame_size = ACCESS_ONCE(tracer_frame);
int i;
- this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+ this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
+ /* Remove the frame of the tracer */
+ this_size -= frame_size;
if (this_size <= max_stack_size)
return;
/* we do not handle interrupt stacks yet */
- if (!object_is_on_stack(&this_size))
+ if (!object_is_on_stack(stack))
return;
local_irq_save(flags);
arch_spin_lock(&max_stack_lock);
+ /* In case another CPU set the tracer_frame on us */
+ if (unlikely(!frame_size))
+ this_size -= tracer_frame;
+
/* a race could have already updated it */
if (this_size <= max_stack_size)
goto out;
@@ -68,10 +88,18 @@ static inline void check_stack(void)
save_stack_trace(&max_stack_trace);
/*
+ * Add the passed in ip from the function tracer.
+ * Searching for this on the stack will skip over
+ * most of the overhead from the stack tracer itself.
+ */
+ stack_dump_trace[0] = ip;
+ max_stack_trace.nr_entries++;
+
+ /*
* Now find where in the stack these are.
*/
i = 0;
- start = &this_size;
+ start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -95,6 +123,18 @@ static inline void check_stack(void)
found = 1;
/* Start the search from here */
start = p + 1;
+ /*
+ * We do not want to show the overhead
+ * of the stack tracer stack in the
+ * max stack. If we haven't figured
+ * out what that is, then figure it out
+ * now.
+ */
+ if (unlikely(!tracer_frame) && i == 1) {
+ tracer_frame = (p - stack) *
+ sizeof(unsigned long);
+ max_stack_size -= tracer_frame;
+ }
}
}
@@ -110,6 +150,7 @@ static inline void check_stack(void)
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
+ unsigned long stack;
int cpu;
if (unlikely(!ftrace_enabled || stack_trace_disabled))
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- check_stack();
+ /*
+ * When fentry is used, the traced function does not get
+ * its stack frame set up, and we lose the parent.
+ * The ip is pretty useless because the function tracer
+ * was called before that function set up its stack frame.
+ * In this case, we use the parent ip.
+ *
+ * By adding the return address of either the parent ip
+ * or the current ip we can disregard most of the stack usage
+ * caused by the stack tracer itself.
+ *
+ * The function tracer always reports the address of where the
+ * mcount call was, but the stack will hold the return address.
+ */
+ if (fentry)
+ ip = parent_ip;
+ else
+ ip += MCOUNT_INSN_SIZE;
+
+ check_stack(ip, &stack);
out:
per_cpu(trace_active, cpu)--;
@@ -360,6 +420,8 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
&max_stack_size, &stack_max_size_fops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb2..847f88a 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
+ if (!d_tracing)
+ return 0;
stat_dir = debugfs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0..5819cd5 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -304,6 +304,8 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct ring_buffer *buffer;
int size;
int syscall_nr;
+ unsigned long irq_flags;
+ int pc;
syscall_nr = syscall_get_nr(current, regs);
if (syscall_nr < 0)
@@ -317,8 +319,11 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->enter_event->event.type, size, 0, 0);
+ sys_data->enter_event->event.type, size, irq_flags, pc);
if (!event)
return;
@@ -328,7 +333,8 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!filter_current_check_discard(buffer, sys_data->enter_event,
entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ trace_current_buffer_unlock_commit(buffer, event,
+ irq_flags, pc);
}
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -338,6 +344,8 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int syscall_nr;
+ unsigned long irq_flags;
+ int pc;
syscall_nr = syscall_get_nr(current, regs);
if (syscall_nr < 0)
@@ -349,8 +357,12 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
+ sys_data->exit_event->event.type, sizeof(*entry),
+ irq_flags, pc);
if (!event)
return;
@@ -360,7 +372,8 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!filter_current_check_discard(buffer, sys_data->exit_event,
entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ trace_current_buffer_unlock_commit(buffer, event,
+ irq_flags, pc);
}
int reg_event_syscall_enter(struct ftrace_event_call *call)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3d0c56a..53e4432 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -113,7 +113,7 @@ static unsigned long get_timestamp(int this_cpu)
return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
}
-static unsigned long get_sample_period(void)
+static u64 get_sample_period(void)
{
/*
* convert watchdog_thresh from seconds to ns
@@ -121,7 +121,7 @@ static unsigned long get_sample_period(void)
* increment before the hardlockup detector generates
* a warning
*/
- return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
+ return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
}
/* Commands for resetting the watchdog */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1456dab..dc8438d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1145,8 +1145,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
unsigned int lcpu;
- BUG_ON(timer_pending(timer));
- BUG_ON(!list_empty(&work->entry));
+ WARN_ON_ONCE(timer_pending(timer));
+ WARN_ON_ONCE(!list_empty(&work->entry));
timer_stats_timer_set_start_info(&dwork->timer);
@@ -1214,8 +1214,13 @@ static void worker_enter_idle(struct worker *worker)
} else
wake_up_all(&gcwq->trustee_wait);
- /* sanity check nr_running */
- WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle &&
+ /*
+ * Sanity check nr_running. Because trustee releases gcwq->lock
+ * between setting %WORKER_ROGUE and zapping nr_running, the
+ * warning may trigger spuriously. Check iff trustee is idle.
+ */
+ WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE &&
+ gcwq->nr_workers == gcwq->nr_idle &&
atomic_read(get_gcwq_nr_running(gcwq->cpu)));
}
@@ -1863,7 +1868,9 @@ __acquires(&gcwq->lock)
spin_unlock_irq(&gcwq->lock);
+ smp_wmb(); /* paired with test_and_set_bit(PENDING) */
work_clear_pending(work);
+
lock_map_acquire_read(&cwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
@@ -2037,8 +2044,10 @@ static int rescuer_thread(void *__wq)
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop())
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
return 0;
+ }
/*
* See whether any cpu is asking for help. Unbounded
@@ -3407,14 +3416,17 @@ static int __cpuinit trustee_thread(void *__gcwq)
for_each_busy_worker(worker, i, pos, gcwq) {
struct work_struct *rebind_work = &worker->rebind_work;
+ unsigned long worker_flags = worker->flags;
/*
* Rebind_work may race with future cpu hotplug
* operations. Use a separate flag to mark that
- * rebinding is scheduled.
+ * rebinding is scheduled. The morphing should
+ * be atomic.
*/
- worker->flags |= WORKER_REBIND;
- worker->flags &= ~WORKER_ROGUE;
+ worker_flags |= WORKER_REBIND;
+ worker_flags &= ~WORKER_ROGUE;
+ ACCESS_ONCE(worker->flags) = worker_flags;
/* queue rebind_work, wq doesn't matter, use the default one */
if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
@@ -3556,21 +3568,55 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
return notifier_from_errno(0);
}
+/*
+ * Workqueues should be brought up before normal priority CPU notifiers.
+ * This will be registered high priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_CANCELED:
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE:
+ return workqueue_cpu_callback(nfb, action, hcpu);
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ * This will be registered as low priority CPU notifier.
+ */
+static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ case CPU_DYING:
+ case CPU_POST_DEAD:
+ return workqueue_cpu_callback(nfb, action, hcpu);
+ }
+ return NOTIFY_OK;
+}
+
#ifdef CONFIG_SMP
struct work_for_cpu {
- struct completion completion;
+ struct work_struct work;
long (*fn)(void *);
void *arg;
long ret;
};
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
{
- struct work_for_cpu *wfc = _wfc;
+ struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
wfc->ret = wfc->fn(wfc->arg);
- complete(&wfc->completion);
- return 0;
}
/**
@@ -3585,19 +3631,11 @@ static int do_work_for_cpu(void *_wfc)
*/
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
- struct task_struct *sub_thread;
- struct work_for_cpu wfc = {
- .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
- .fn = fn,
- .arg = arg,
- };
+ struct work_for_cpu wfc = { .fn = fn, .arg = arg };
- sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
- if (IS_ERR(sub_thread))
- return PTR_ERR(sub_thread);
- kthread_bind(sub_thread, cpu);
- wake_up_process(sub_thread);
- wait_for_completion(&wfc.completion);
+ INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -3749,7 +3787,8 @@ static int __init init_workqueues(void)
unsigned int cpu;
int i;
- cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
+ cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
+ cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
/* initialize gcwqs */
for_each_gcwq_cpu(cpu) {