diff options
Diffstat (limited to 'kernel')
69 files changed, 6733 insertions, 2533 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 0000000..a3bb4cb --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index 4e1d7df..305f11d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o @@ -55,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o @@ -83,6 +85,7 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o +obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FTRACE) += trace/ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c6e1c1..046c160 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -241,7 +241,6 @@ static void unlink_css_set(struct css_set *cg) struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - write_lock(&css_set_lock); hlist_del(&cg->hlist); css_set_count--; @@ -251,16 +250,25 @@ static void unlink_css_set(struct css_set *cg) list_del(&link->cgrp_link_list); kfree(link); } - - write_unlock(&css_set_lock); } -static void __release_css_set(struct kref *k, int taskexit) +static void __put_css_set(struct css_set *cg, int taskexit) { int i; - struct css_set *cg = container_of(k, struct css_set, ref); - + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cg->refcount, -1, 1)) + return; + write_lock(&css_set_lock); + if (!atomic_dec_and_test(&cg->refcount)) { + write_unlock(&css_set_lock); + return; + } unlink_css_set(cg); + write_unlock(&css_set_lock); rcu_read_lock(); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { @@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit) kfree(cg); } -static void release_css_set(struct kref *k) -{ - __release_css_set(k, 0); -} - -static void release_css_set_taskexit(struct kref *k) -{ - __release_css_set(k, 1); -} - /* * refcounted get/put for css_set objects */ static inline void get_css_set(struct css_set *cg) { - kref_get(&cg->ref); + atomic_inc(&cg->refcount); } static inline void put_css_set(struct css_set *cg) { - kref_put(&cg->ref, release_css_set); + __put_css_set(cg, 0); } static inline void put_css_set_taskexit(struct css_set *cg) { - kref_put(&cg->ref, release_css_set_taskexit); + __put_css_set(cg, 1); } /* @@ -427,7 +425,7 @@ static struct css_set *find_css_set( return NULL; } - kref_init(&res->ref); + atomic_set(&res->refcount, 1); INIT_LIST_HEAD(&res->cg_links); INIT_LIST_HEAD(&res->tasks); INIT_HLIST_NODE(&res->hlist); @@ -870,6 +868,14 @@ static struct super_operations cgroup_ops = { .remount_fs = cgroup_remount, }; +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->release_list); + init_rwsem(&cgrp->pids_mutex); +} static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -878,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); } static int cgroup_test_super(struct super_block *sb, void *data) @@ -1728,7 +1731,7 @@ int cgroup_task_count(const struct cgroup *cgrp) read_lock(&css_set_lock); list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { - count += atomic_read(&link->cg->ref.refcount); + count += atomic_read(&link->cg->refcount); } read_unlock(&css_set_lock); return count; @@ -1997,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * - * Upon tasks file open(), a struct ctr_struct is allocated, that - * will have a pointer to an array (also allocated here). The struct - * ctr_struct * is stored in file->private_data. Its resources will - * be freed by release() when the file is closed. The array is used - * to sprintf the PIDs and then used by read(). */ -struct ctr_struct { - char *buf; - int bufsz; -}; /* * Load into 'pidarray' up to 'npids' of the tasks using cgroup @@ -2088,42 +2082,132 @@ static int cmppid(const void *a, const void *b) return *(pid_t *)a - *(pid_t *)b; } + /* - * Convert array 'a' of 'npids' pid_t's to a string of newline separated - * decimal pids in 'buf'. Don't write more than 'sz' chars, but return - * count 'cnt' of how many chars would be written if buf were large enough. + * seq_file methods for the "tasks" file. The seq_file position is the + * next pid to display; the seq_file iterator is a pointer to the pid + * in the cgroup->tasks_pids array. */ -static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) + +static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) { - int cnt = 0; - int i; + /* + * Initially we receive a position value that corresponds to + * one more than the last pid shown (or 0 on the first call or + * after a seek to the start). Use a binary-search to find the + * next pid to display, if any + */ + struct cgroup *cgrp = s->private; + int index = 0, pid = *pos; + int *iter; - for (i = 0; i < npids; i++) - cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); - return cnt; + down_read(&cgrp->pids_mutex); + if (pid) { + int end = cgrp->pids_length; + int i; + while (index < end) { + int mid = (index + end) / 2; + if (cgrp->tasks_pids[mid] == pid) { + index = mid; + break; + } else if (cgrp->tasks_pids[mid] <= pid) + index = mid + 1; + else + end = mid; + } + } + /* If we're off the end of the array, we're done */ + if (index >= cgrp->pids_length) + return NULL; + /* Update the abstract position to be the actual pid that we found */ + iter = cgrp->tasks_pids + index; + *pos = *iter; + return iter; +} + +static void cgroup_tasks_stop(struct seq_file *s, void *v) +{ + struct cgroup *cgrp = s->private; + up_read(&cgrp->pids_mutex); } +static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct cgroup *cgrp = s->private; + int *p = v; + int *end = cgrp->tasks_pids + cgrp->pids_length; + + /* + * Advance to the next pid in the array. If this goes off the + * end, we're done + */ + p++; + if (p >= end) { + return NULL; + } else { + *pos = *p; + return p; + } +} + +static int cgroup_tasks_show(struct seq_file *s, void *v) +{ + return seq_printf(s, "%d\n", *(int *)v); +} + +static struct seq_operations cgroup_tasks_seq_operations = { + .start = cgroup_tasks_start, + .stop = cgroup_tasks_stop, + .next = cgroup_tasks_next, + .show = cgroup_tasks_show, +}; + +static void release_cgroup_pid_array(struct cgroup *cgrp) +{ + down_write(&cgrp->pids_mutex); + BUG_ON(!cgrp->pids_use_count); + if (!--cgrp->pids_use_count) { + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = NULL; + cgrp->pids_length = 0; + } + up_write(&cgrp->pids_mutex); +} + +static int cgroup_tasks_release(struct inode *inode, struct file *file) +{ + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + + if (!(file->f_mode & FMODE_READ)) + return 0; + + release_cgroup_pid_array(cgrp); + return seq_release(inode, file); +} + +static struct file_operations cgroup_tasks_operations = { + .read = seq_read, + .llseek = seq_lseek, + .write = cgroup_file_write, + .release = cgroup_tasks_release, +}; + /* - * Handle an open on 'tasks' file. Prepare a buffer listing the + * Handle an open on 'tasks' file. Prepare an array containing the * process id's of tasks currently attached to the cgroup being opened. - * - * Does not require any specific cgroup mutexes, and does not take any. */ + static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct ctr_struct *ctr; pid_t *pidarray; int npids; - char c; + int retval; + /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); - if (!ctr) - goto err0; - /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the @@ -2131,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * show up until sometime later on. */ npids = cgroup_task_count(cgrp); - if (npids) { - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - goto err1; - - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* Call pid_array_to_buf() twice, first just to get bufsz */ - ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; - ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); - if (!ctr->buf) - goto err2; - ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); - - kfree(pidarray); - } else { - ctr->buf = NULL; - ctr->bufsz = 0; - } - file->private_data = ctr; - return 0; - -err2: - kfree(pidarray); -err1: - kfree(ctr); -err0: - return -ENOMEM; -} + pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); + if (!pidarray) + return -ENOMEM; + npids = pid_array_load(pidarray, npids, cgrp); + sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); -static ssize_t cgroup_tasks_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) -{ - struct ctr_struct *ctr = file->private_data; + /* + * Store the array in the cgroup, freeing the old + * array if necessary + */ + down_write(&cgrp->pids_mutex); + kfree(cgrp->tasks_pids); + cgrp->tasks_pids = pidarray; + cgrp->pids_length = npids; + cgrp->pids_use_count++; + up_write(&cgrp->pids_mutex); - return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); -} + file->f_op = &cgroup_tasks_operations; -static int cgroup_tasks_release(struct inode *unused_inode, - struct file *file) -{ - struct ctr_struct *ctr; - - if (file->f_mode & FMODE_READ) { - ctr = file->private_data; - kfree(ctr->buf); - kfree(ctr); + retval = seq_open(file, &cgroup_tasks_seq_operations); + if (retval) { + release_cgroup_pid_array(cgrp); + return retval; } + ((struct seq_file *)file->private_data)->private = cgrp; return 0; } @@ -2210,7 +2268,6 @@ static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, - .read = cgroup_tasks_read, .write_u64 = cgroup_tasks_write, .release = cgroup_tasks_release, .private = FILE_TASKLIST, @@ -2300,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&cgrp->sibling); - INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->css_sets); - INIT_LIST_HEAD(&cgrp->release_list); + init_cgroup_housekeeping(cgrp); cgrp->parent = parent; cgrp->root = parent->root; @@ -2495,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) int __init cgroup_init_early(void) { int i; - kref_init(&init_css_set.ref); - kref_get(&init_css_set.ref); + atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cg_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_HLIST_NODE(&init_css_set.hlist); diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c index c3dc3ab..daca620 100644 --- a/kernel/cgroup_debug.c +++ b/kernel/cgroup_debug.c @@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont, u64 count; rcu_read_lock(); - count = atomic_read(¤t->cgroups->ref.refcount); + count = atomic_read(¤t->cgroups->refcount); rcu_read_unlock(); return count; } @@ -90,7 +90,7 @@ static struct cftype files[] = { { .name = "releasable", .read_u64 = releasable_read, - } + }, }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 0000000..e950569 --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,379 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/module.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> + +enum freezer_state { + CGROUP_THAWED = 0, + CGROUP_FREEZING, + CGROUP_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ + struct freezer *freezer; + enum freezer_state state; + + task_lock(task); + freezer = task_freezer(task); + state = freezer->state; + task_unlock(task); + + return state == CGROUP_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "THAWED", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. + * + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______THAWED_______/ | + * \__________________________THAWED____________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (to prevent races with freeze_task()) + * sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = CGROUP_THAWED; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + +/* Task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + +/* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + int retval; + + /* Anything frozen can't move or be moved to/from */ + + if (is_task_frozen_enough(task)) + return -EBUSY; + + freezer = cgroup_freezer(new_cgroup); + if (freezer->state == CGROUP_FROZEN) + return -EBUSY; + + retval = 0; + task_lock(task); + freezer = task_freezer(task); + if (freezer->state == CGROUP_FROZEN) + retval = -EBUSY; + task_unlock(task); + return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + task_lock(task); + freezer = task_freezer(task); + task_unlock(task); + + BUG_ON(freezer->state == CGROUP_FROZEN); + spin_lock_irq(&freezer->lock); + /* Locking avoids race with FREEZING -> THAWED transitions. */ + if (freezer->state == CGROUP_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void update_freezer_state(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + if (is_task_frozen_enough(task)) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the FROZEN state while there are unfrozen + * tasks. + */ + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + else if (nfrozen > 0) + freezer->state = CGROUP_FREEZING; + else + freezer->state = CGROUP_THAWED; + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == CGROUP_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exitted write. */ + update_freezer_state(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = CGROUP_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (is_task_frozen_enough(task)) + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + int do_wake; + + task_lock(task); + do_wake = __thaw_process(task); + task_unlock(task); + if (do_wake) + wake_up_process(task); + } + cgroup_iter_end(cgroup, &it); + freezer->state = CGROUP_THAWED; + + return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + update_freezer_state(cgroup, freezer); + if (goal_state == freezer->state) + goto out; + switch (freezer->state) { + case CGROUP_THAWED: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + case CGROUP_FREEZING: + if (goal_state == CGROUP_FROZEN) { + /* Userspace is retrying after + * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + } + /* state == FREEZING and goal_state == THAWED, so unfreeze */ + case CGROUP_FROZEN: + retval = unfreeze_cgroup(cgroup, freezer); + break; + default: + break; + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + goal_state = CGROUP_THAWED; + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + goal_state = CGROUP_FROZEN; + else + return -EIO; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { + .name = "freezer", + .create = freezer_create, + .destroy = freezer_destroy, + .populate = freezer_populate, + .subsys_id = freezer_subsys_id, + .can_attach = freezer_can_attach, + .attach = NULL, + .fork = freezer_fork, + .exit = NULL, +}; diff --git a/kernel/compat.c b/kernel/compat.c index 143990e..8eafe3e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -23,6 +23,7 @@ #include <linux/timex.h> #include <linux/migrate.h> #include <linux/posix-timers.h> +#include <linux/times.h> #include <asm/uaccess.h> @@ -208,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which, return 0; } +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { + struct tms tms; struct compat_tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - read_lock(&tasklist_lock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - /* - * While we have tasklist_lock read-locked, no dying thread - * can be updating current->signal->[us]time. Instead, - * we got their counts included in the live thread loop. - * However, another thread can come in right now and - * do a wait call that updates current->signal->c[us]time. - * To make sure we always see that pair updated atomically, - * we take the siglock around fetching them. - */ - spin_lock_irq(&tsk->sighand->siglock); - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); - - tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime)); - tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime)); - tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime)); - tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime)); + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); if (copy_to_user(tbuf, &tmp, sizeof(tmp))) return -EFAULT; } diff --git a/kernel/configs.c b/kernel/configs.c index 4c34521..abaee68 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -54,9 +54,6 @@ #ifdef CONFIG_IKCONFIG_PROC -/**************************************************/ -/* globals and useful constants */ - static ssize_t ikconfig_read_current(struct file *file, char __user *buf, size_t len, loff_t * offset) @@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = { .read = ikconfig_read_current, }; -/***************************************************/ -/* ikconfig_init: start up everything we need to */ - static int __init ikconfig_init(void) { struct proc_dir_entry *entry; @@ -89,9 +83,6 @@ static int __init ikconfig_init(void) return 0; } -/***************************************************/ -/* ikconfig_cleanup: clean up our mess */ - static void __exit ikconfig_cleanup(void) { remove_proc_entry("config.gz", NULL); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index eab7bd6..3e00526 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1172,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, { struct cpuset trialcs; int err; - int cpus_nonempty, balance_flag_changed; + int balance_flag_changed; trialcs = *cs; if (turning_on) @@ -1184,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) return err; - cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(&trialcs)); @@ -1192,7 +1191,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, cs->flags = trialcs.flags; mutex_unlock(&callback_mutex); - if (cpus_nonempty && balance_flag_changed) + if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); return 0; @@ -2437,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = { void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); - m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Cpus_allowed_list:\t"); - m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, - task->cpus_allowed); + seq_cpumask_list(m, &task->cpus_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed:\t"); - m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask(m, &task->mems_allowed); seq_printf(m, "\n"); seq_printf(m, "Mems_allowed_list:\t"); - m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, - task->mems_allowed); + seq_nodemask_list(m, &task->mems_allowed); seq_printf(m, "\n"); } diff --git a/kernel/exit.c b/kernel/exit.c index 0ef4673..80137a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,6 +47,7 @@ #include <linux/blkdev.h> #include <linux/task_io_accounting_ops.h> #include <linux/tracehook.h> +#include <trace/sched.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -112,8 +113,6 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, task_utime(tsk)); - sig->stime = cputime_add(sig->stime, task_stime(tsk)); sig->gtime = cputime_add(sig->gtime, task_gtime(tsk)); sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; @@ -122,7 +121,6 @@ static void __exit_signal(struct task_struct *tsk) sig->inblock += task_io_get_inblock(tsk); sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig = NULL; /* Marker for below. */ } @@ -149,7 +147,10 @@ static void __exit_signal(struct task_struct *tsk) static void delayed_put_task_struct(struct rcu_head *rhp) { - put_task_struct(container_of(rhp, struct task_struct, rcu)); + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + trace_sched_process_free(tsk); + put_task_struct(tsk); } @@ -1073,6 +1074,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) acct_process(); + trace_sched_process_exit(tsk); + exit_sem(tsk); exit_files(tsk); exit_fs(tsk); @@ -1301,6 +1304,7 @@ static int wait_task_zombie(struct task_struct *p, int options, if (likely(!traced)) { struct signal_struct *psig; struct signal_struct *sig; + struct task_cputime cputime; /* * The resource counters for the group leader are in its @@ -1316,20 +1320,23 @@ static int wait_task_zombie(struct task_struct *p, int options, * need to protect the access to p->parent->signal fields, * as other threads in the parent group can be right * here reaping other children at the same time. + * + * We use thread_group_cputime() to get times for the thread + * group, which consolidates times for all threads in the + * group including the group leader. */ spin_lock_irq(&p->parent->sighand->siglock); psig = p->parent->signal; sig = p->signal; + thread_group_cputime(p, &cputime); psig->cutime = cputime_add(psig->cutime, - cputime_add(p->utime, - cputime_add(sig->utime, - sig->cutime))); + cputime_add(cputime.utime, + sig->cutime)); psig->cstime = cputime_add(psig->cstime, - cputime_add(p->stime, - cputime_add(sig->stime, - sig->cstime))); + cputime_add(cputime.stime, + sig->cstime)); psig->cgtime = cputime_add(psig->cgtime, cputime_add(p->gtime, @@ -1674,6 +1681,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options, struct task_struct *tsk; int retval; + trace_sched_process_wait(pid); + add_wait_queue(¤t->signal->wait_chldexit,&wait); repeat: /* diff --git a/kernel/fork.c b/kernel/fork.c index 30de644..4d09355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -58,6 +58,7 @@ #include <linux/tty.h> #include <linux/proc_fs.h> #include <linux/blkdev.h> +#include <trace/sched.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -759,15 +760,44 @@ void __cleanup_sighand(struct sighand_struct *sighand) kmem_cache_free(sighand_cachep, sighand); } + +/* + * Initialize POSIX timer handling for a thread group. + */ +static void posix_cpu_timers_init_group(struct signal_struct *sig) +{ + /* Thread group counters. */ + thread_group_cputime_init(sig); + + /* Expiration times and increments. */ + sig->it_virt_expires = cputime_zero; + sig->it_virt_incr = cputime_zero; + sig->it_prof_expires = cputime_zero; + sig->it_prof_incr = cputime_zero; + + /* Cached expiration times. */ + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; + + /* The timer lists. */ + INIT_LIST_HEAD(&sig->cpu_timers[0]); + INIT_LIST_HEAD(&sig->cpu_timers[1]); + INIT_LIST_HEAD(&sig->cpu_timers[2]); +} + static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; int ret; if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); - atomic_inc(¤t->signal->live); - return 0; + ret = thread_group_cputime_clone_thread(current); + if (likely(!ret)) { + atomic_inc(¤t->signal->count); + atomic_inc(¤t->signal->live); + } + return ret; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; @@ -795,40 +825,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; - sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = NULL; sig->tty = NULL; - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; + sig->cutime = sig->cstime = cputime_zero; sig->gtime = cputime_zero; sig->cgtime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - INIT_LIST_HEAD(&sig->cpu_timers[0]); - INIT_LIST_HEAD(&sig->cpu_timers[1]); - INIT_LIST_HEAD(&sig->cpu_timers[2]); taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - /* - * New sole thread in the process gets an expiry time - * of the whole CPU time limit. - */ - tsk->it_prof_expires = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); - } + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -838,6 +853,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_signal(struct signal_struct *sig) { + thread_group_cputime_free(sig); exit_thread_group_keys(sig); tty_kref_put(sig->tty); kmem_cache_free(signal_cachep, sig); @@ -888,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif /* CONFIG_MM_OWNER */ /* + * Initialize POSIX timer handling for a single task. + */ +static void posix_cpu_timers_init(struct task_struct *tsk) +{ + tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.sched_exp = 0; + INIT_LIST_HEAD(&tsk->cpu_timers[0]); + INIT_LIST_HEAD(&tsk->cpu_timers[1]); + INIT_LIST_HEAD(&tsk->cpu_timers[2]); +} + +/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * @@ -997,12 +1026,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, task_io_accounting_init(&p->ioac); acct_clear_integrals(p); - p->it_virt_expires = cputime_zero; - p->it_prof_expires = cputime_zero; - p->it_sched_expires = 0; - INIT_LIST_HEAD(&p->cpu_timers[0]); - INIT_LIST_HEAD(&p->cpu_timers[1]); - INIT_LIST_HEAD(&p->cpu_timers[2]); + posix_cpu_timers_init(p); p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); @@ -1203,21 +1227,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); - - if (!cputime_eq(current->signal->it_virt_expires, - cputime_zero) || - !cputime_eq(current->signal->it_prof_expires, - cputime_zero) || - current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY || - !list_empty(¤t->signal->cpu_timers[0]) || - !list_empty(¤t->signal->cpu_timers[1]) || - !list_empty(¤t->signal->cpu_timers[2])) { - /* - * Have child wake up on its first tick to check - * for process CPU timers. - */ - p->it_prof_expires = jiffies_to_cputime(1); - } } if (likely(p->pid)) { @@ -1364,6 +1373,8 @@ long do_fork(unsigned long clone_flags, if (!IS_ERR(p)) { struct completion vfork; + trace_sched_process_fork(current, p); + nr = task_pid_vnr(p); if (clone_flags & CLONE_PARENT_SETTID) diff --git a/kernel/freezer.c b/kernel/freezer.c new file mode 100644 index 0000000..ba6248b --- /dev/null +++ b/kernel/freezer.c @@ -0,0 +1,154 @@ +/* + * kernel/freezer.c - Function to freeze a process + * + * Originally from kernel/power/process.c + */ + +#include <linux/interrupt.h> +#include <linux/suspend.h> +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/freezer.h> + +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + wmb(); + } + clear_freeze_flag(current); +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? */ + long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(); + task_unlock(current); + } else { + task_unlock(current); + return; + } + save = current->state; + pr_debug("%s entered refrigerator\n", current->comm); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; + schedule(); + } + pr_debug("%s left refrigerator\n", current->comm); + __set_current_state(save); +} +EXPORT_SYMBOL(refrigerator); + +static void fake_signal_wake_up(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); +} + +/** + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and + * either sending a fake signal to it or waking it up, depending on whether + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. + */ +bool freeze_task(struct task_struct *p, bool sig_only) +{ + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. + */ + if (!freezing(p)) { + rmb(); + if (frozen(p)) + return false; + + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + if (!signal_pending(p)) + fake_signal_wake_up(p); + } else if (sig_only) { + return false; + } else { + wake_up_state(p, TASK_INTERRUPTIBLE); + } + + return true; +} + +void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + clear_freeze_flag(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_and_wake(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cdec83e..95978f4 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1403,9 +1403,7 @@ void hrtimer_run_queues(void) if (!base->first) continue; - if (base->get_softirq_time) - base->softirq_time = base->get_softirq_time(); - else if (gettime) { + if (gettime) { hrtimer_get_softirq_time(cpu_base); gettime = 0; } @@ -1688,9 +1686,11 @@ static void migrate_hrtimers(int cpu) new_base = &get_cpu_var(hrtimer_bases); tick_cancel_sched_timer(cpu); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { @@ -1703,8 +1703,7 @@ static void migrate_hrtimers(int cpu) raise = 1; spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(hrtimer_bases); if (raise) diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 533068c..cc0f732 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -30,17 +30,16 @@ static DEFINE_MUTEX(probing_active); unsigned long probe_irq_on(void) { struct irq_desc *desc; - unsigned long mask; - unsigned int i; + unsigned long mask = 0; + unsigned int status; + int i; mutex_lock(&probing_active); /* * something may have generated an irq long ago and we want to * flush such a longstanding irq before considering it as spurious. */ - for (i = NR_IRQS-1; i > 0; i--) { - desc = irq_desc + i; - + for_each_irq_desc_reverse(i, desc) { spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -68,9 +67,7 @@ unsigned long probe_irq_on(void) * (we must startup again here because if a longstanding irq * happened in the previous stage, it may have masked itself) */ - for (i = NR_IRQS-1; i > 0; i--) { - desc = irq_desc + i; - + for_each_irq_desc_reverse(i, desc) { spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -88,11 +85,7 @@ unsigned long probe_irq_on(void) /* * Now filter out any obviously spurious interrupts */ - mask = 0; - for (i = 0; i < NR_IRQS; i++) { - unsigned int status; - - desc = irq_desc + i; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; @@ -126,14 +119,11 @@ EXPORT_SYMBOL(probe_irq_on); */ unsigned int probe_irq_mask(unsigned long val) { - unsigned int mask; + unsigned int status, mask = 0; + struct irq_desc *desc; int i; - mask = 0; - for (i = 0; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - unsigned int status; - + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; @@ -171,20 +161,19 @@ EXPORT_SYMBOL(probe_irq_mask); */ int probe_irq_off(unsigned long val) { - int i, irq_found = 0, nr_irqs = 0; - - for (i = 0; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - unsigned int status; + int i, irq_found = 0, nr_of_irqs = 0; + struct irq_desc *desc; + unsigned int status; + for_each_irq_desc(i, desc) { spin_lock_irq(&desc->lock); status = desc->status; if (status & IRQ_AUTODETECT) { if (!(status & IRQ_WAITING)) { - if (!nr_irqs) + if (!nr_of_irqs) irq_found = i; - nr_irqs++; + nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; desc->chip->shutdown(i); @@ -193,7 +182,7 @@ int probe_irq_off(unsigned long val) } mutex_unlock(&probing_active); - if (nr_irqs > 1) + if (nr_of_irqs > 1) irq_found = -irq_found; return irq_found; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 3cd441e..10b5092 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -24,16 +24,15 @@ */ void dynamic_irq_init(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -57,15 +56,14 @@ void dynamic_irq_init(unsigned int irq) */ void dynamic_irq_cleanup(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); return; } - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -78,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq) desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; + desc->name = NULL; spin_unlock_irqrestore(&desc->lock, flags); } @@ -89,10 +88,10 @@ void dynamic_irq_cleanup(unsigned int irq) */ int set_irq_chip(unsigned int irq, struct irq_chip *chip) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq); return -EINVAL; } @@ -100,7 +99,6 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; @@ -111,27 +109,27 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) EXPORT_SYMBOL(set_irq_chip); /** - * set_irq_type - set the irq type for an irq + * set_irq_type - set the irq trigger type for an irq * @irq: irq number - * @type: interrupt type - see include/linux/interrupt.h + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ int set_irq_type(unsigned int irq, unsigned int type) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = -ENXIO; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); return -ENODEV; } - desc = irq_desc + irq; - if (desc->chip->set_type) { - spin_lock_irqsave(&desc->lock, flags); - ret = desc->chip->set_type(irq, type); - spin_unlock_irqrestore(&desc->lock, flags); - } + if (type == IRQ_TYPE_NONE) + return 0; + + spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_trigger(desc, irq, type); + spin_unlock_irqrestore(&desc->lock, flags); return ret; } EXPORT_SYMBOL(set_irq_type); @@ -145,16 +143,15 @@ EXPORT_SYMBOL(set_irq_type); */ int set_irq_data(unsigned int irq, void *data) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -171,15 +168,15 @@ EXPORT_SYMBOL(set_irq_data); */ int set_irq_msi(unsigned int irq, struct msi_desc *entry) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -197,10 +194,16 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) */ int set_irq_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS || !desc->chip) { + if (!desc) { + printk(KERN_ERR + "Trying to install chip data for IRQ%d\n", irq); + return -EINVAL; + } + + if (!desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } @@ -218,7 +221,7 @@ EXPORT_SYMBOL(set_irq_chip_data); */ static void default_enable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); desc->chip->unmask(irq); desc->status &= ~IRQ_MASKED; @@ -236,8 +239,9 @@ static void default_disable(unsigned int irq) */ static unsigned int default_startup(unsigned int irq) { - irq_desc[irq].chip->enable(irq); + struct irq_desc *desc = irq_to_desc(irq); + desc->chip->enable(irq); return 0; } @@ -246,7 +250,7 @@ static unsigned int default_startup(unsigned int irq) */ static void default_shutdown(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); desc->chip->mask(irq); desc->status |= IRQ_MASKED; @@ -305,14 +309,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; - const unsigned int cpu = smp_processor_id(); spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -344,7 +347,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -354,7 +356,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -392,7 +394,6 @@ out_unlock: void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -402,7 +403,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -451,8 +452,6 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - const unsigned int cpu = smp_processor_id(); - spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -468,8 +467,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) mask_ack_irq(desc, irq); goto out_unlock; } - - kstat_cpu(cpu).irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); @@ -524,7 +522,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); if (desc->chip->ack) desc->chip->ack(irq); @@ -541,17 +539,15 @@ void __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq); return; } - desc = irq_desc + irq; - if (!handle) handle = handle_bad_irq; else if (desc->chip == &no_irq_chip) { @@ -583,7 +579,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->unmask(irq); + desc->chip->startup(irq); } spin_unlock_irqrestore(&desc->lock, flags); } @@ -606,17 +602,14 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void __init set_irq_noprobe(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); - return; } - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); @@ -624,17 +617,14 @@ void __init set_irq_noprobe(unsigned int irq) void __init set_irq_probe(unsigned int irq) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) { + if (!desc) { printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; } - desc = irq_desc + irq; - spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; spin_unlock_irqrestore(&desc->lock, flags); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 5fa6198..c815b42 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -25,11 +25,10 @@ * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); ack_bad_irq(irq); } @@ -47,6 +46,9 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) * * Controller mappings for all interrupt sources: */ +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -66,7 +68,9 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { */ static void ack_bad(unsigned int irq) { - print_irq_desc(irq, irq_desc + irq); + struct irq_desc *desc = irq_to_desc(irq); + + print_irq_desc(irq, desc); ack_bad_irq(irq); } @@ -131,8 +135,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - handle_dynamic_tick(action); - if (!(action->flags & IRQF_DISABLED)) local_irq_enable_in_hardirq(); @@ -165,11 +167,12 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) */ unsigned int __do_IRQ(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned int status; - kstat_this_cpu.irqs[irq]++; + kstat_incr_irqs_this_cpu(irq, desc); + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -256,8 +259,8 @@ out: } #endif -#ifdef CONFIG_TRACE_IRQFLAGS +#ifdef CONFIG_TRACE_IRQFLAGS /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ @@ -265,10 +268,10 @@ static struct lock_class_key irq_desc_lock_class; void early_init_irq_lock_class(void) { + struct irq_desc *desc; int i; - for (i = 0; i < NR_IRQS; i++) - lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); + for_each_irq_desc(i, desc) + lockdep_set_class(&desc->lock, &irq_desc_lock_class); } - #endif diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 08a849a..c9767e6 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -10,12 +10,15 @@ extern void irq_chip_set_defaults(struct irq_chip *chip); /* Set default handler: */ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); +extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, + unsigned long flags); + #ifdef CONFIG_PROC_FS -extern void register_irq_proc(unsigned int irq); +extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else -static inline void register_irq_proc(unsigned int irq) { } +static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 60c49e3..c498a1b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; - if (irq >= NR_IRQS) + if (!desc) return; do { @@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq); */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || !desc->chip->set_affinity) @@ -81,18 +81,17 @@ int irq_can_set_affinity(unsigned int irq) */ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (!desc->chip->set_affinity) return -EINVAL; - set_balance_irq_affinity(irq, cpumask); - #ifdef CONFIG_GENERIC_PENDING_IRQ - if (desc->status & IRQ_MOVE_PCNTXT) { + if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { unsigned long flags; spin_lock_irqsave(&desc->lock, flags); + desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); spin_unlock_irqrestore(&desc->lock, flags); } else @@ -111,16 +110,17 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) int irq_select_affinity(unsigned int irq) { cpumask_t mask; + struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - irq_desc[irq].affinity = mask; - irq_desc[irq].chip->set_affinity(irq, mask); + desc = irq_to_desc(irq); + desc->affinity = mask; + desc->chip->set_affinity(irq, mask); - set_balance_irq_affinity(irq, mask); return 0; } #endif @@ -140,10 +140,10 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -169,9 +169,9 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); - if (irq >= NR_IRQS) + if (!desc) return; disable_irq_nosync(irq); @@ -211,10 +211,10 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (irq >= NR_IRQS) + if (!desc) return; spin_lock_irqsave(&desc->lock, flags); @@ -223,9 +223,9 @@ void enable_irq(unsigned int irq) } EXPORT_SYMBOL(enable_irq); -int set_irq_wake_real(unsigned int irq, unsigned int on) +static int set_irq_wake_real(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (desc->chip->set_wake) @@ -248,7 +248,7 @@ int set_irq_wake_real(unsigned int irq, unsigned int on) */ int set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = 0; @@ -288,12 +288,16 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) + if (!desc) + return 0; + + if (desc->status & IRQ_NOREQUEST) return 0; - action = irq_desc[irq].action; + action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; @@ -312,10 +316,11 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) desc->handle_irq = NULL; } -static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, +int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags) { int ret; + struct irq_chip *chip = desc->chip; if (!chip || !chip->set_type) { /* @@ -333,6 +338,11 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, pr_err("setting trigger mode %d for irq %u failed (%pF)\n", (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); + else { + /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ + desc->status &= ~IRQ_TYPE_SENSE_MASK; + desc->status |= flags & IRQ_TYPE_SENSE_MASK; + } return ret; } @@ -341,16 +351,16 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. */ -int setup_irq(unsigned int irq, struct irqaction *new) +static int +__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) { - struct irq_desc *desc = irq_desc + irq; struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; int shared = 0; int ret; - if (irq >= NR_IRQS) + if (!desc) return -EINVAL; if (desc->chip == &no_irq_chip) @@ -411,7 +421,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { - ret = __irq_set_trigger(desc->chip, irq, new->flags); + ret = __irq_set_trigger(desc, irq, new->flags); if (ret) { spin_unlock_irqrestore(&desc->lock, flags); @@ -430,16 +440,21 @@ int setup_irq(unsigned int irq, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - if (desc->chip->startup) - desc->chip->startup(irq); - else - desc->chip->enable(irq); + desc->chip->startup(irq); } else /* Undo nested disables: */ desc->depth = 1; /* Set default affinity mask once everything is setup */ irq_select_affinity(irq); + + } else if ((new->flags & IRQF_TRIGGER_MASK) + && (new->flags & IRQF_TRIGGER_MASK) + != (desc->status & IRQ_TYPE_SENSE_MASK)) { + /* hope the handler works with the actual trigger mode... */ + pr_warning("IRQ %d uses trigger mode %d; requested %d\n", + irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK), + (int)(new->flags & IRQF_TRIGGER_MASK)); } *p = new; @@ -464,7 +479,7 @@ int setup_irq(unsigned int irq, struct irqaction *new) spin_unlock_irqrestore(&desc->lock, flags); new->irq = irq; - register_irq_proc(irq); + register_irq_proc(irq, desc); new->dir = NULL; register_handler_proc(irq, new); @@ -484,6 +499,20 @@ mismatch: } /** + * setup_irq - setup an interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt + * + * Used to statically setup interrupts in the early boot process. + */ +int setup_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return __setup_irq(irq, desc, act); +} + +/** * free_irq - free an interrupt * @irq: Interrupt line to free * @dev_id: Device identity to free @@ -499,15 +528,15 @@ mismatch: */ void free_irq(unsigned int irq, void *dev_id) { - struct irq_desc *desc; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction **p; unsigned long flags; WARN_ON(in_interrupt()); - if (irq >= NR_IRQS) + + if (!desc) return; - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -596,12 +625,14 @@ EXPORT_SYMBOL(free_irq); * IRQF_SHARED Interrupt is shared * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy + * IRQF_TRIGGER_* Specify active edge(s) or level * */ int request_irq(unsigned int irq, irq_handler_t handler, unsigned long irqflags, const char *devname, void *dev_id) { struct irqaction *action; + struct irq_desc *desc; int retval; #ifdef CONFIG_LOCKDEP @@ -618,9 +649,12 @@ int request_irq(unsigned int irq, irq_handler_t handler, */ if ((irqflags & IRQF_SHARED) && !dev_id) return -EINVAL; - if (irq >= NR_IRQS) + + desc = irq_to_desc(irq); + if (!desc) return -EINVAL; - if (irq_desc[irq].status & IRQ_NOREQUEST) + + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) return -EINVAL; @@ -636,26 +670,29 @@ int request_irq(unsigned int irq, irq_handler_t handler, action->next = NULL; action->dev_id = dev_id; + retval = __setup_irq(irq, desc, action); + if (retval) + kfree(action); + #ifdef CONFIG_DEBUG_SHIRQ if (irqflags & IRQF_SHARED) { /* * It's a shared IRQ -- the driver ought to be prepared for it * to happen immediately, so let's make sure.... - * We do this before actually registering it, to make sure that - * a 'real' IRQ doesn't run in parallel with our fake + * We disable the irq to make sure that a 'real' IRQ doesn't + * run in parallel with our fake. */ unsigned long flags; + disable_irq(irq); local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); + enable_irq(irq); } #endif - - retval = setup_irq(irq, action); - if (retval) - kfree(action); - return retval; } EXPORT_SYMBOL(request_irq); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 77b7acc..90b920d 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,18 +3,18 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + desc->pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } void move_masked_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) @@ -30,7 +30,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) + if (unlikely(cpus_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); + cpus_and(tmp, desc->pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -55,12 +55,12 @@ void move_masked_irq(int irq) if (likely(!cpus_empty(tmp))) { desc->chip->set_affinity(irq,tmp); } - cpus_clear(irq_desc[irq].pending_mask); + cpus_clear(desc->pending_mask); } void move_native_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a09dd29..fac014a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)m->private; + struct irq_desc *desc = irq_to_desc((long)m->private); cpumask_t *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_t new_value; int err; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = { static int irq_spurious_read(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct irq_desc *d = &irq_desc[(long) data]; + struct irq_desc *desc = irq_to_desc((long) data); return sprintf(page, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", - d->irq_count, - d->irqs_unhandled, - jiffies_to_msecs(d->last_unhandled)); + desc->irq_count, + desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); } #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; int ret = 1; @@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; + struct irq_desc *desc = irq_to_desc(irq); - if (!irq_desc[irq].dir || action->dir || !action->name || + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -174,36 +175,34 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); + action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN #define MAX_NAMELEN 10 -void register_irq_proc(unsigned int irq) +void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; - if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_chip) || - irq_desc[irq].dir) + if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); + desc->dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); + entry = create_proc_entry("spurious", 0444, desc->dir); if (entry) { entry->data = (void *)(long)irq; entry->read_proc = irq_spurious_read; @@ -214,8 +213,11 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) - remove_proc_entry(action->dir->name, irq_desc[irq].dir); + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + + remove_proc_entry(action->dir->name, desc->dir); + } } void register_default_affinity_proc(void) @@ -228,7 +230,8 @@ void register_default_affinity_proc(void) void init_irq_proc(void) { - int i; + unsigned int irq; + struct irq_desc *desc; /* create /proc/irq */ root_irq_dir = proc_mkdir("irq", NULL); @@ -240,7 +243,7 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. */ - for (i = 0; i < NR_IRQS; i++) - register_irq_proc(i); + for_each_irq_desc(irq, desc) + register_irq_proc(irq, desc); } diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index a804679..89c7117 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -33,10 +33,10 @@ static void resend_irqs(unsigned long arg) struct irq_desc *desc; int irq; - while (!bitmap_empty(irqs_resend, NR_IRQS)) { - irq = find_first_bit(irqs_resend, NR_IRQS); + while (!bitmap_empty(irqs_resend, nr_irqs)) { + irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); - desc = irq_desc + irq; + desc = irq_to_desc(irq); local_irq_disable(); desc->handle_irq(irq, desc); local_irq_enable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index c66d3f1..dd364c1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -12,83 +12,122 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/moduleparam.h> +#include <linux/timer.h> static int irqfixup __read_mostly; +#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) +static void poll_spurious_irqs(unsigned long dummy); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); + /* * Recovery handler for misrouted interrupts. */ -static int misrouted_irq(int irq) +static int try_one_irq(int irq, struct irq_desc *desc) { - int i; - int ok = 0; - int work = 0; /* Did we do work for a real IRQ */ - - for (i = 1; i < NR_IRQS; i++) { - struct irq_desc *desc = irq_desc + i; - struct irqaction *action; - - if (i == irq) /* Already tried */ - continue; + struct irqaction *action; + int ok = 0, work = 0; - spin_lock(&desc->lock); - /* Already running on another processor */ - if (desc->status & IRQ_INPROGRESS) { - /* - * Already running: If it is shared get the other - * CPU to go looking for our mystery interrupt too - */ - if (desc->action && (desc->action->flags & IRQF_SHARED)) - desc->status |= IRQ_PENDING; - spin_unlock(&desc->lock); - continue; - } - /* Honour the normal IRQ locking */ - desc->status |= IRQ_INPROGRESS; - action = desc->action; + spin_lock(&desc->lock); + /* Already running on another processor */ + if (desc->status & IRQ_INPROGRESS) { + /* + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too + */ + if (desc->action && (desc->action->flags & IRQF_SHARED)) + desc->status |= IRQ_PENDING; spin_unlock(&desc->lock); + return ok; + } + /* Honour the normal IRQ locking */ + desc->status |= IRQ_INPROGRESS; + action = desc->action; + spin_unlock(&desc->lock); - while (action) { - /* Only shared IRQ handlers are safe to call */ - if (action->flags & IRQF_SHARED) { - if (action->handler(i, action->dev_id) == - IRQ_HANDLED) - ok = 1; - } - action = action->next; + while (action) { + /* Only shared IRQ handlers are safe to call */ + if (action->flags & IRQF_SHARED) { + if (action->handler(irq, action->dev_id) == + IRQ_HANDLED) + ok = 1; } - local_irq_disable(); - /* Now clean up the flags */ - spin_lock(&desc->lock); - action = desc->action; + action = action->next; + } + local_irq_disable(); + /* Now clean up the flags */ + spin_lock(&desc->lock); + action = desc->action; + /* + * While we were looking for a fixup someone queued a real + * IRQ clashing with our walk: + */ + while ((desc->status & IRQ_PENDING) && action) { /* - * While we were looking for a fixup someone queued a real - * IRQ clashing with our walk: - */ - while ((desc->status & IRQ_PENDING) && action) { - /* - * Perform real IRQ processing for the IRQ we deferred - */ - work = 1; - spin_unlock(&desc->lock); - handle_IRQ_event(i, action); - spin_lock(&desc->lock); - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - /* - * If we did actual work for the real IRQ line we must let the - * IRQ controller clean up too + * Perform real IRQ processing for the IRQ we deferred */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(i); + work = 1; spin_unlock(&desc->lock); + handle_IRQ_event(irq, action); + spin_lock(&desc->lock); + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; + /* + * If we did actual work for the real IRQ line we must let the + * IRQ controller clean up too + */ + if (work && desc->chip && desc->chip->end) + desc->chip->end(irq); + spin_unlock(&desc->lock); + + return ok; +} + +static int misrouted_irq(int irq) +{ + struct irq_desc *desc; + int i, ok = 0; + + for_each_irq_desc(i, desc) { + if (!i) + continue; + + if (i == irq) /* Already tried */ + continue; + + if (try_one_irq(i, desc)) + ok = 1; } /* So the caller can adjust the irq error counts */ return ok; } +static void poll_spurious_irqs(unsigned long dummy) +{ + struct irq_desc *desc; + int i; + + for_each_irq_desc(i, desc) { + unsigned int status; + + if (!i) + continue; + + /* Racy but it doesn't matter */ + status = desc->status; + barrier(); + if (!(status & IRQ_SPURIOUS_DISABLED)) + continue; + + try_one_irq(i, desc); + } + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); +} + /* * If 99,900 of the previous 100,000 interrupts have not been handled * then assume that the IRQ is stuck in some manner. Drop a diagnostic @@ -137,7 +176,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) } } -static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +static inline int +try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) { struct irqaction *action; @@ -212,6 +253,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; desc->chip->disable(irq); + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); } desc->irqs_unhandled = 0; } @@ -241,7 +285,7 @@ static int __init irqfixup_setup(char *str) __setup("irqfixup", irqfixup_setup); module_param(irqfixup, int, 0644); -MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); +MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode"); static int __init irqpoll_setup(char *str) { diff --git a/kernel/itimer.c b/kernel/itimer.c index ab98274..db7c358 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t utime = tsk->signal->utime; - do { - utime = cputime_add(utime, t->utime); - t = next_thread(t); - } while (t != tsk); + struct task_cputime cputime; + cputime_t utime; + + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; if (cputime_le(cval, utime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; case ITIMER_PROF: - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; if (!cputime_eq(cval, cputime_zero)) { - struct task_struct *t = tsk; - cputime_t ptime = cputime_add(tsk->signal->utime, - tsk->signal->stime); - do { - ptime = cputime_add(ptime, - cputime_add(t->utime, - t->stime)); - t = next_thread(t); - } while (t != tsk); + struct task_cputime times; + cputime_t ptime; + + thread_group_cputime(tsk, ×); + ptime = cputime_add(times.utime, times.stime); if (cputime_le(cval, ptime)) { /* about to fire */ cval = jiffies_to_cputime(1); } else { @@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value) } } spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); cputime_to_timeval(cval, &value->it_value); cputime_to_timeval(cinterval, &value->it_interval); break; @@ -185,7 +176,6 @@ again: case ITIMER_VIRTUAL: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_virt_expires; cinterval = tsk->signal->it_virt_incr; @@ -200,7 +190,6 @@ again: tsk->signal->it_virt_expires = nval; tsk->signal->it_virt_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); @@ -209,7 +198,6 @@ again: case ITIMER_PROF: nval = timeval_to_cputime(&value->it_value); ninterval = timeval_to_cputime(&value->it_interval); - read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); cval = tsk->signal->it_prof_expires; cinterval = tsk->signal->it_prof_incr; @@ -224,7 +212,6 @@ again: tsk->signal->it_prof_expires = nval; tsk->signal->it_prof_incr = ninterval; spin_unlock_irq(&tsk->sighand->siglock); - read_unlock(&tasklist_lock); if (ovalue) { cputime_to_timeval(cval, &ovalue->it_value); cputime_to_timeval(cinterval, &ovalue->it_interval); diff --git a/kernel/kexec.c b/kernel/kexec.c index aef2653..ac0fde7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -30,6 +30,7 @@ #include <linux/pm.h> #include <linux/cpu.h> #include <linux/console.h> +#include <linux/vmalloc.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -1371,6 +1372,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(node_online_map); VMCOREINFO_SYMBOL(swapper_pg_dir); VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmlist); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1406,6 +1408,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vm_struct, addr); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/kernel/kthread.c b/kernel/kthread.c index 96cff2f..8e7a7ce 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <trace/sched.h> #define KTHREAD_NICE_LEVEL (-5) @@ -171,12 +172,11 @@ EXPORT_SYMBOL(kthread_create); */ void kthread_bind(struct task_struct *k, unsigned int cpu) { - if (k->state != TASK_UNINTERRUPTIBLE) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - /* Must have done schedule() in kthread() before we set_task_cpu */ - wait_task_inactive(k, 0); set_task_cpu(k, cpu); k->cpus_allowed = cpumask_of_cpu(cpu); k->rt.nr_cpus_allowed = 1; @@ -206,6 +206,8 @@ int kthread_stop(struct task_struct *k) /* It could exit after stop_info.k set, but before wake_up_process. */ get_task_struct(k); + trace_sched_kthread_stop(k); + /* Must init completion *before* thread sees kthread_stop_info.k */ init_completion(&kthread_stop_info.done); smp_wmb(); @@ -221,6 +223,8 @@ int kthread_stop(struct task_struct *k) ret = kthread_stop_info.err; mutex_unlock(&kthread_stop_lock); + trace_sched_kthread_stop_ret(ret); + return ret; } EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/marker.c b/kernel/marker.c index 7d1faec..e9c6b2b 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -62,7 +62,7 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. */ struct rcu_head rcu; void *oldptr; - unsigned char rcu_pending:1; + int rcu_pending; unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -103,11 +103,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) char ptype; /* - * preempt_disable does two things : disabling preemption to make sure - * the teardown of the callbacks can be done correctly when they are in - * modules and they insure RCU read coherency. + * rcu_read_lock_sched does two things : disabling preemption to make + * sure the teardown of the callbacks can be done correctly when they + * are in modules and they insure RCU read coherency. */ - preempt_disable(); + rcu_read_lock_sched(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -145,7 +145,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) va_end(args); } } - preempt_enable(); + rcu_read_unlock_sched(); } EXPORT_SYMBOL_GPL(marker_probe_cb); @@ -162,7 +162,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) va_list args; /* not initialized */ char ptype; - preempt_disable(); + rcu_read_lock_sched(); ptype = mdata->ptype; if (likely(!ptype)) { marker_probe_func *func; @@ -195,7 +195,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); } - preempt_enable(); + rcu_read_unlock_sched(); } EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); @@ -560,7 +560,7 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * Disable a marker and its probe callback. * Note: only waiting an RCU period after setting elem->call to the empty * function insures that the original callback is not used anymore. This insured - * by preempt_disable around the call site. + * by rcu_read_lock_sched around the call site. */ static void disable_marker(struct marker *elem) { @@ -653,11 +653,17 @@ int marker_probe_register(const char *name, const char *format, entry = get_marker(name); if (!entry) { entry = add_marker(name, format); - if (IS_ERR(entry)) { + if (IS_ERR(entry)) ret = PTR_ERR(entry); - goto end; - } + } else if (format) { + if (!entry->format) + ret = marker_set_format(&entry, format); + else if (strcmp(entry->format, format)) + ret = -EPERM; } + if (ret) + goto end; + /* * If we detect that a call_rcu is pending for this marker, * make sure it's executed now. @@ -674,6 +680,8 @@ int marker_probe_register(const char *name, const char *format, mutex_lock(&markers_mutex); entry = get_marker(name); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ @@ -717,6 +725,8 @@ int marker_probe_unregister(const char *name, entry = get_marker(name); if (!entry) goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ @@ -795,6 +805,8 @@ int marker_probe_unregister_private_data(marker_probe_func *probe, mutex_lock(&markers_mutex); entry = get_marker_from_private_data(probe, probe_private); WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ diff --git a/kernel/module.c b/kernel/module.c index b7205f6..0d8d21e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -46,6 +46,8 @@ #include <asm/cacheflush.h> #include <linux/license.h> #include <asm/sections.h> +#include <linux/tracepoint.h> +#include <linux/ftrace.h> #if 0 #define DEBUGP printk @@ -1430,6 +1432,9 @@ static void free_module(struct module *mod) /* Module unload stuff */ module_unload_free(mod); + /* release any pointers to mcount in this module */ + ftrace_release(mod->module_core, mod->core_size); + /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1834,6 +1839,7 @@ static noinline struct module *load_module(void __user *umod, Elf_Ehdr *hdr; Elf_Shdr *sechdrs; char *secstrings, *args, *modmagic, *strtab = NULL; + char *staging; unsigned int i; unsigned int symindex = 0; unsigned int strindex = 0; @@ -1860,9 +1866,13 @@ static noinline struct module *load_module(void __user *umod, unsigned int markersindex; unsigned int markersstringsindex; unsigned int verboseindex; + unsigned int tracepointsindex; + unsigned int tracepointsstringsindex; + unsigned int mcountindex; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + void *mseg; struct exception_table_entry *extable; mm_segment_t old_fs; @@ -1989,6 +1999,14 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + staging = get_modinfo(sechdrs, infoindex, "staging"); + if (staging) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + /* Now copy in args */ args = strndup_user(uargs, ~0UL >> 1); if (IS_ERR(args)) { @@ -2147,6 +2165,12 @@ static noinline struct module *load_module(void __user *umod, markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings"); verboseindex = find_sec(hdr, sechdrs, secstrings, "__verbose"); + tracepointsindex = find_sec(hdr, sechdrs, secstrings, "__tracepoints"); + tracepointsstringsindex = find_sec(hdr, sechdrs, secstrings, + "__tracepoints_strings"); + + mcountindex = find_sec(hdr, sechdrs, secstrings, + "__mcount_loc"); /* Now do relocations. */ for (i = 1; i < hdr->e_shnum; i++) { @@ -2174,6 +2198,12 @@ static noinline struct module *load_module(void __user *umod, mod->num_markers = sechdrs[markersindex].sh_size / sizeof(*mod->markers); #endif +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = (void *)sechdrs[tracepointsindex].sh_addr; + mod->num_tracepoints = + sechdrs[tracepointsindex].sh_size / sizeof(*mod->tracepoints); +#endif + /* Find duplicate symbols */ err = verify_export_symbols(mod); @@ -2192,12 +2222,22 @@ static noinline struct module *load_module(void __user *umod, add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + if (!mod->taints) { #ifdef CONFIG_MARKERS - if (!mod->taints) marker_update_probe_range(mod->markers, mod->markers + mod->num_markers); #endif dynamic_printk_setup(sechdrs, verboseindex); +#ifdef CONFIG_TRACEPOINTS + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); +#endif + } + + /* sechdrs[0].sh_size is always zero */ + mseg = (void *)sechdrs[mcountindex].sh_addr; + ftrace_init_module(mseg, mseg + sechdrs[mcountindex].sh_size); + err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2267,6 +2307,7 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); + ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); module_free(mod, mod->module_init); @@ -2587,6 +2628,8 @@ static char *module_flags(struct module *mod, char *buf) buf[bx++] = 'P'; if (mod->taints & (1 << TAINT_FORCED_MODULE)) buf[bx++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[bx++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -2748,3 +2791,50 @@ void module_update_markers(void) mutex_unlock(&module_mutex); } #endif + +#ifdef CONFIG_TRACEPOINTS +void module_update_tracepoints(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!mod->taints) + tracepoint_update_probe_range(mod->tracepoints, + mod->tracepoints + mod->num_tracepoints); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. + */ +int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!iter_mod->taints) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints, + iter_mod->tracepoints + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif diff --git a/kernel/notifier.c b/kernel/notifier.c index 823be115..4282c0a 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); static ATOMIC_NOTIFIER_HEAD(die_chain); -int notify_die(enum die_val val, const char *str, +int notrace notify_die(enum die_val val, const char *str, struct pt_regs *regs, long err, int trap, int sig) { struct die_args args = { diff --git a/kernel/panic.c b/kernel/panic.c index f290e8e..bda561e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -161,6 +161,7 @@ static const struct tnt tnts[] = { { TAINT_DIE, 'D', ' ' }, { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, }; /** @@ -175,6 +176,7 @@ static const struct tnt tnts[] = { * 'U' - Userspace-defined naughtiness. * 'A' - ACPI table overridden. * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. * * The string is overwritten by the next call to print_taint(). */ diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c42a03a..153dcb2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -7,6 +7,93 @@ #include <linux/errno.h> #include <linux/math64.h> #include <asm/uaccess.h> +#include <linux/kernel_stat.h> + +/* + * Allocate the thread_group_cputime structure appropriately and fill in the + * current values of the fields. Called from copy_signal() via + * thread_group_cputime_clone_thread() when adding a second or subsequent + * thread to a thread group. Assumes interrupts are enabled when called. + */ +int thread_group_cputime_alloc(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct task_cputime *cputime; + + /* + * If we have multiple threads and we don't already have a + * per-CPU task_cputime struct (checked in the caller), allocate + * one and fill it in with the times accumulated so far. We may + * race with another thread so recheck after we pick up the sighand + * lock. + */ + cputime = alloc_percpu(struct task_cputime); + if (cputime == NULL) + return -ENOMEM; + spin_lock_irq(&tsk->sighand->siglock); + if (sig->cputime.totals) { + spin_unlock_irq(&tsk->sighand->siglock); + free_percpu(cputime); + return 0; + } + sig->cputime.totals = cputime; + cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id()); + cputime->utime = tsk->utime; + cputime->stime = tsk->stime; + cputime->sum_exec_runtime = tsk->se.sum_exec_runtime; + spin_unlock_irq(&tsk->sighand->siglock); + return 0; +} + +/** + * thread_group_cputime - Sum the thread group time fields across all CPUs. + * + * @tsk: The task we use to identify the thread group. + * @times: task_cputime structure in which we return the summed fields. + * + * Walk the list of CPUs to sum the per-CPU time fields in the thread group + * time structure. + */ +void thread_group_cputime( + struct task_struct *tsk, + struct task_cputime *times) +{ + struct signal_struct *sig; + int i; + struct task_cputime *tot; + + sig = tsk->signal; + if (unlikely(!sig) || !sig->cputime.totals) { + times->utime = tsk->utime; + times->stime = tsk->stime; + times->sum_exec_runtime = tsk->se.sum_exec_runtime; + return; + } + times->stime = times->utime = cputime_zero; + times->sum_exec_runtime = 0; + for_each_possible_cpu(i) { + tot = per_cpu_ptr(tsk->signal->cputime.totals, i); + times->utime = cputime_add(times->utime, tot->utime); + times->stime = cputime_add(times->stime, tot->stime); + times->sum_exec_runtime += tot->sum_exec_runtime; + } +} + +/* + * Called after updating RLIMIT_CPU to set timer expiration if necessary. + */ +void update_rlimit_cpu(unsigned long rlim_new) +{ + cputime_t cputime; + + cputime = secs_to_cputime(rlim_new); + if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || + cputime_lt(current->signal->it_prof_expires, cputime)) { + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); + } +} static int check_clock(const clockid_t which_clock) { @@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p) { return p->utime; } -static inline unsigned long long sched_ns(struct task_struct *p) -{ - return task_sched_runtime(p); -} int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) { @@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = sched_ns(p); + cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); break; } return 0; @@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, /* * Sample a process (thread group) clock for the given group_leader task. * Must be called with tasklist_lock held for reading. - * Must be called with tasklist_lock held for reading, and p->sighand->siglock. */ -static int cpu_clock_sample_group_locked(unsigned int clock_idx, - struct task_struct *p, - union cpu_time_count *cpu) +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) { - struct task_struct *t = p; - switch (clock_idx) { + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + switch (which_clock) { default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); - do { - cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: - cpu->cpu = p->signal->utime; - do { - cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t)); - t = next_thread(t); - } while (t != p); + cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = p->signal->sum_sched_runtime; - /* Add in each other live thread. */ - while ((t = next_thread(t)) != p) { - cpu->sched += t->se.sum_exec_runtime; - } - cpu->sched += sched_ns(p); + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); break; } return 0; } -/* - * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. - */ -static int cpu_clock_sample_group(const clockid_t which_clock, - struct task_struct *p, - union cpu_time_count *cpu) -{ - int ret; - unsigned long flags; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p, - cpu); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; -} - int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { @@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk) } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, tsk->signal->utime), - cputime_add(tsk->stime, tsk->signal->stime), - tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime); -} + struct task_cputime cputime; - -/* - * Set the expiry times of all the threads in the process so one of them - * will go off before the process cumulative expiry total is reached. - */ -static void process_timer_rebalance(struct task_struct *p, - unsigned int clock_idx, - union cpu_time_count expires, - union cpu_time_count val) -{ - cputime_t ticks, left; - unsigned long long ns, nsleft; - struct task_struct *t = p; - unsigned int nthreads = atomic_read(&p->signal->live); - - if (!nthreads) - return; - - switch (clock_idx) { - default: - BUG(); - break; - case CPUCLOCK_PROF: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(prof_ticks(t), left); - if (cputime_eq(t->it_prof_expires, - cputime_zero) || - cputime_gt(t->it_prof_expires, ticks)) { - t->it_prof_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_VIRT: - left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu), - nthreads); - do { - if (likely(!(t->flags & PF_EXITING))) { - ticks = cputime_add(virt_ticks(t), left); - if (cputime_eq(t->it_virt_expires, - cputime_zero) || - cputime_gt(t->it_virt_expires, ticks)) { - t->it_virt_expires = ticks; - } - } - t = next_thread(t); - } while (t != p); - break; - case CPUCLOCK_SCHED: - nsleft = expires.sched - val.sched; - do_div(nsleft, nthreads); - nsleft = max_t(unsigned long long, nsleft, 1); - do { - if (likely(!(t->flags & PF_EXITING))) { - ns = t->se.sum_exec_runtime + nsleft; - if (t->it_sched_expires == 0 || - t->it_sched_expires > ns) { - t->it_sched_expires = ns; - } - } - t = next_thread(t); - } while (t != p); - break; - } + thread_group_cputime(tsk, &cputime); + cleanup_timers(tsk->signal->cpu_timers, + cputime.utime, cputime.stime, cputime.sum_exec_runtime); } static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) @@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->it_prof_expires, + if (cputime_eq(p->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(p->it_prof_expires, + cputime_gt(p->cputime_expires.prof_exp, nt->expires.cpu)) - p->it_prof_expires = nt->expires.cpu; + p->cputime_expires.prof_exp = + nt->expires.cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->it_virt_expires, + if (cputime_eq(p->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(p->it_virt_expires, + cputime_gt(p->cputime_expires.virt_exp, nt->expires.cpu)) - p->it_virt_expires = nt->expires.cpu; + p->cputime_expires.virt_exp = + nt->expires.cpu; break; case CPUCLOCK_SCHED: - if (p->it_sched_expires == 0 || - p->it_sched_expires > nt->expires.sched) - p->it_sched_expires = nt->expires.sched; + if (p->cputime_expires.sched_exp == 0 || + p->cputime_expires.sched_exp > + nt->expires.sched) + p->cputime_expires.sched_exp = + nt->expires.sched; break; } } else { /* - * For a process timer, we must balance - * all the live threads' expirations. + * For a process timer, set the cached expiration time. */ switch (CPUCLOCK_WHICH(timer->it_clock)) { default: @@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) cputime_lt(p->signal->it_virt_expires, timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.virt_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_PROF: if (!cputime_eq(p->signal->it_prof_expires, cputime_zero) && @@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) if (i != RLIM_INFINITY && i <= cputime_to_secs(timer->it.cpu.expires.cpu)) break; - goto rebalance; + p->signal->cputime_expires.prof_exp = + timer->it.cpu.expires.cpu; + break; case CPUCLOCK_SCHED: - rebalance: - process_timer_rebalance( - timer->it.cpu.task, - CPUCLOCK_WHICH(timer->it_clock), - timer->it.cpu.expires, now); + p->signal->cputime_expires.sched_exp = + timer->it.cpu.expires.sched; break; } } @@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk, struct signal_struct *const sig = tsk->signal; maxfire = 20; - tsk->it_prof_expires = cputime_zero; + tsk->cputime_expires.prof_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { - tsk->it_prof_expires = t->expires.cpu; + tsk->cputime_expires.prof_exp = t->expires.cpu; break; } t->firing = 1; @@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_virt_expires = cputime_zero; + tsk->cputime_expires.virt_exp = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { - tsk->it_virt_expires = t->expires.cpu; + tsk->cputime_expires.virt_exp = t->expires.cpu; break; } t->firing = 1; @@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->it_sched_expires = 0; + tsk->cputime_expires.sched_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { - tsk->it_sched_expires = t->expires.sched; + tsk->cputime_expires.sched_exp = t->expires.sched; break; } t->firing = 1; @@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk, { int maxfire; struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime, ptime, virt_expires, prof_expires; + cputime_t utime, ptime, virt_expires, prof_expires; unsigned long long sum_sched_runtime, sched_expires; - struct task_struct *t; struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk, /* * Collect the current process totals. */ - utime = sig->utime; - stime = sig->stime; - sum_sched_runtime = sig->sum_sched_runtime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - sum_sched_runtime += t->se.sum_exec_runtime; - t = next_thread(t); - } while (t != tsk); - ptime = cputime_add(utime, stime); - + thread_group_cputime(tsk, &cputime); + utime = cputime.utime; + ptime = cputime_add(utime, cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; prof_expires = cputime_zero; while (!list_empty(timers)) { @@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) || - !cputime_eq(virt_expires, cputime_zero) || - sched_expires != 0) { - /* - * Rebalance the threads' expiry times for the remaining - * process CPU timers. - */ - - cputime_t prof_left, virt_left, ticks; - unsigned long long sched_left, sched; - const unsigned int nthreads = atomic_read(&sig->live); - - if (!nthreads) - return; - - prof_left = cputime_sub(prof_expires, utime); - prof_left = cputime_sub(prof_left, stime); - prof_left = cputime_div_non_zero(prof_left, nthreads); - virt_left = cputime_sub(virt_expires, utime); - virt_left = cputime_div_non_zero(virt_left, nthreads); - if (sched_expires) { - sched_left = sched_expires - sum_sched_runtime; - do_div(sched_left, nthreads); - sched_left = max_t(unsigned long long, sched_left, 1); - } else { - sched_left = 0; - } - t = tsk; - do { - if (unlikely(t->flags & PF_EXITING)) - continue; - - ticks = cputime_add(cputime_add(t->utime, t->stime), - prof_left); - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(t->it_prof_expires, cputime_zero) || - cputime_gt(t->it_prof_expires, ticks))) { - t->it_prof_expires = ticks; - } - - ticks = cputime_add(t->utime, virt_left); - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(t->it_virt_expires, cputime_zero) || - cputime_gt(t->it_virt_expires, ticks))) { - t->it_virt_expires = ticks; - } - - sched = t->se.sum_exec_runtime + sched_left; - if (sched_expires && (t->it_sched_expires == 0 || - t->it_sched_expires > sched)) { - t->it_sched_expires = sched; - } - } while ((t = next_thread(t)) != tsk); - } + if (!cputime_eq(prof_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) + sig->cputime_expires.prof_exp = prof_expires; + if (!cputime_eq(virt_expires, cputime_zero) && + (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || + cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) + sig->cputime_expires.virt_exp = virt_expires; + if (sched_expires != 0 && + (sig->cputime_expires.sched_exp == 0 || + sig->cputime_expires.sched_exp > sched_expires)) + sig->cputime_expires.sched_exp = sched_expires; } /* @@ -1314,6 +1253,86 @@ out: ++timer->it_requeue_pending; } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) + return 1; + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false. + */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + + if (unlikely(!sig)) + return 0; + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + if (!task_cputime_zero(&sig->cputime_expires)) { + struct task_cputime group_sample; + + thread_group_cputime(tsk, &group_sample); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + return 0; +} + /* * This is called from the timer interrupt handler. The irq handler has * already updated our counts. We need to check if any timers fire now. @@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk) BUG_ON(!irqs_disabled()); -#define UNEXPIRED(clock) \ - (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ - cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) - - if (UNEXPIRED(prof) && UNEXPIRED(virt) && - (tsk->it_sched_expires == 0 || - tsk->se.sum_exec_runtime < tsk->it_sched_expires)) + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) return; -#undef UNEXPIRED - + spin_lock(&tsk->sighand->siglock); /* - * Double-check with locks held. + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. */ - read_lock(&tasklist_lock); - if (likely(tsk->signal != NULL)) { - spin_lock(&tsk->sighand->siglock); + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); - - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); - } - read_unlock(&tasklist_lock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); /* * Now that all the timers on our list have the firing flag, @@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk) /* * Set one of the process-wide special case CPU timers. - * The tasklist_lock and tsk->sighand->siglock must be held by the caller. - * The oldval argument is null for the RLIMIT_CPU timer, where *newval is - * absolute; non-null for ITIMER_*, where *newval is relative and we update - * it to be absolute, *oldval is absolute and we update it to be relative. + * The tsk->sighand->siglock must be held by the caller. + * The *newval argument is relative and we update it to be absolute, *oldval + * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) @@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); - cpu_clock_sample_group_locked(clock_idx, tsk, &now); + cpu_clock_sample_group(clock_idx, tsk, &now); if (oldval) { if (!cputime_eq(*oldval, cputime_zero)) { @@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_ge(list_first_entry(head, struct cpu_timer_list, entry)->expires.cpu, *newval)) { - /* - * Rejigger each thread's expiry time so that one will - * notice before we hit the process-cumulative expiry time. - */ - union cpu_time_count expires = { .sched = 0 }; - expires.cpu = *newval; - process_timer_rebalance(tsk, clock_idx, expires, now); + switch (clock_idx) { + case CPUCLOCK_PROF: + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } } } diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5131e54..b931d7c 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) } /* + * Get monotonic time for posix timers + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + +/* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) @@ -235,9 +244,15 @@ static __init int init_posix_timers(void) .clock_get = posix_ktime_get_ts, .clock_set = do_posix_clock_nosettime, }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + .clock_set = do_posix_clock_nosettime, + }; register_posix_clock(CLOCK_REALTIME, &clock_realtime); register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); + register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, @@ -298,6 +313,7 @@ void do_schedule_next_timer(struct siginfo *info) int posix_timer_event(struct k_itimer *timr, int si_private) { + int shared, ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->do_schedule_next_timer(). @@ -311,25 +327,10 @@ int posix_timer_event(struct k_itimer *timr, int si_private) */ timr->sigq->info.si_sys_private = si_private; - timr->sigq->info.si_signo = timr->it_sigev_signo; - timr->sigq->info.si_code = SI_TIMER; - timr->sigq->info.si_tid = timr->it_id; - timr->sigq->info.si_value = timr->it_sigev_value; - - if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - struct task_struct *leader; - int ret = send_sigqueue(timr->sigq, timr->it_process, 0); - - if (likely(ret >= 0)) - return ret; - - timr->it_sigev_notify = SIGEV_SIGNAL; - leader = timr->it_process->group_leader; - put_task_struct(timr->it_process); - timr->it_process = leader; - } - - return send_sigqueue(timr->sigq, timr->it_process, 1); + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, timr->it_process, shared); + /* If we failed to send the signal the timer stops. */ + return ret > 0; } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -468,11 +469,9 @@ sys_timer_create(const clockid_t which_clock, struct sigevent __user *timer_event_spec, timer_t __user * created_timer_id) { - int error = 0; - struct k_itimer *new_timer = NULL; - int new_timer_id; - struct task_struct *process = NULL; - unsigned long flags; + struct k_itimer *new_timer; + int error, new_timer_id; + struct task_struct *process; sigevent_t event; int it_id_set = IT_ID_NOT_SET; @@ -490,12 +489,11 @@ sys_timer_create(const clockid_t which_clock, goto out; } spin_lock_irq(&idr_lock); - error = idr_get_new(&posix_timers_id, (void *) new_timer, - &new_timer_id); + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); spin_unlock_irq(&idr_lock); - if (error == -EAGAIN) - goto retry; - else if (error) { + if (error) { + if (error == -EAGAIN) + goto retry; /* * Weird looking, but we return EAGAIN if the IDR is * full (proper POSIX return value for this) @@ -526,67 +524,43 @@ sys_timer_create(const clockid_t which_clock, error = -EFAULT; goto out; } - new_timer->it_sigev_notify = event.sigev_notify; - new_timer->it_sigev_signo = event.sigev_signo; - new_timer->it_sigev_value = event.sigev_value; - - read_lock(&tasklist_lock); - if ((process = good_sigevent(&event))) { - /* - * We may be setting up this process for another - * thread. It may be exiting. To catch this - * case the we check the PF_EXITING flag. If - * the flag is not set, the siglock will catch - * him before it is too late (in exit_itimers). - * - * The exec case is a bit more invloved but easy - * to code. If the process is in our thread - * group (and it must be or we would not allow - * it here) and is doing an exec, it will cause - * us to be killed. In this case it will wait - * for us to die which means we can finish this - * linkage with our last gasp. I.e. no code :) - */ - spin_lock_irqsave(&process->sighand->siglock, flags); - if (!(process->flags & PF_EXITING)) { - new_timer->it_process = process; - list_add(&new_timer->list, - &process->signal->posix_timers); - if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - get_task_struct(process); - spin_unlock_irqrestore(&process->sighand->siglock, flags); - } else { - spin_unlock_irqrestore(&process->sighand->siglock, flags); - process = NULL; - } - } - read_unlock(&tasklist_lock); + rcu_read_lock(); + process = good_sigevent(&event); + if (process) + get_task_struct(process); + rcu_read_unlock(); if (!process) { error = -EINVAL; goto out; } } else { - new_timer->it_sigev_notify = SIGEV_SIGNAL; - new_timer->it_sigev_signo = SIGALRM; - new_timer->it_sigev_value.sival_int = new_timer->it_id; + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; process = current->group_leader; - spin_lock_irqsave(&process->sighand->siglock, flags); - new_timer->it_process = process; - list_add(&new_timer->list, &process->signal->posix_timers); - spin_unlock_irqrestore(&process->sighand->siglock, flags); + get_task_struct(process); } + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + spin_lock_irq(¤t->sighand->siglock); + new_timer->it_process = process; + list_add(&new_timer->list, ¤t->signal->posix_timers); + spin_unlock_irq(¤t->sighand->siglock); + + return 0; /* * In the case of the timer belonging to another task, after * the task is unlocked, the timer is owned by the other task * and may cease to exist at any time. Don't use or modify * new_timer after the unlock call. */ - out: - if (error) - release_posix_timer(new_timer, it_id_set); - + release_posix_timer(new_timer, it_id_set); return error; } @@ -597,7 +571,7 @@ out: * the find to the timer lock. To avoid a dead lock, the timer id MUST * be release with out holding the timer lock. */ -static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* @@ -605,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) * flags part over to the timer lock. Must not let interrupts in * while we are moving the lock. */ - spin_lock_irqsave(&idr_lock, *flags); - timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); + timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { spin_lock(&timr->it_lock); - - if ((timr->it_id != timer_id) || !(timr->it_process) || - !same_thread_group(timr->it_process, current)) { - spin_unlock(&timr->it_lock); - spin_unlock_irqrestore(&idr_lock, *flags); - timr = NULL; - } else + if (timr->it_process && + same_thread_group(timr->it_process, current)) { spin_unlock(&idr_lock); - } else - spin_unlock_irqrestore(&idr_lock, *flags); + return timr; + } + spin_unlock(&timr->it_lock); + } + spin_unlock_irqrestore(&idr_lock, *flags); - return timr; + return NULL; } /* @@ -862,8 +833,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); @@ -890,8 +860,7 @@ retry_delete: * This keeps any tasks waiting on the spin lock from thinking * they got something (see the lock code above). */ - if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) - put_task_struct(timer->it_process); + put_task_struct(timer->it_process); timer->it_process = NULL; unlock_timer(timer, flags); diff --git a/kernel/power/process.c b/kernel/power/process.c index 278946a..ca63401 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -28,121 +28,6 @@ static inline int freezeable(struct task_struct * p) return 1; } -/* - * freezing is complete, mark current process as frozen - */ -static inline void frozen_process(void) -{ - if (!unlikely(current->flags & PF_NOFREEZE)) { - current->flags |= PF_FROZEN; - wmb(); - } - clear_freeze_flag(current); -} - -/* Refrigerator is place where frozen processes are stored :-). */ -void refrigerator(void) -{ - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ - long save; - - task_lock(current); - if (freezing(current)) { - frozen_process(); - task_unlock(current); - } else { - task_unlock(current); - return; - } - save = current->state; - pr_debug("%s entered refrigerator\n", current->comm); - - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); /* We sent fake signal, clean it up */ - spin_unlock_irq(¤t->sighand->siglock); - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!frozen(current)) - break; - schedule(); - } - pr_debug("%s left refrigerator\n", current->comm); - __set_current_state(save); -} - -static void fake_signal_wake_up(struct task_struct *p) -{ - unsigned long flags; - - spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); - spin_unlock_irqrestore(&p->sighand->siglock, flags); -} - -static inline bool should_send_signal(struct task_struct *p) -{ - return !(p->flags & PF_FREEZER_NOSIG); -} - -/** - * freeze_task - send a freeze request to given task - * @p: task to send the request to - * @sig_only: if set, the request will only be sent if the task has the - * PF_FREEZER_NOSIG flag unset - * Return value: 'false', if @sig_only is set and the task has - * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise - * - * The freeze request is sent by setting the tasks's TIF_FREEZE flag and - * either sending a fake signal to it or waking it up, depending on whether - * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task - * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its - * TIF_FREEZE flag will not be set. - */ -static bool freeze_task(struct task_struct *p, bool sig_only) -{ - /* - * We first check if the task is freezing and next if it has already - * been frozen to avoid the race with frozen_process() which first marks - * the task as frozen and next clears its TIF_FREEZE. - */ - if (!freezing(p)) { - rmb(); - if (frozen(p)) - return false; - - if (!sig_only || should_send_signal(p)) - set_freeze_flag(p); - else - return false; - } - - if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); - } else if (sig_only) { - return false; - } else { - wake_up_state(p, TASK_INTERRUPTIBLE); - } - - return true; -} - -static void cancel_freezing(struct task_struct *p) -{ - unsigned long flags; - - if (freezing(p)) { - pr_debug(" clean up: %s\n", p->comm); - clear_freeze_flag(p); - spin_lock_irqsave(&p->sighand->siglock, flags); - recalc_sigpending_and_wake(p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - } -} - static int try_to_freeze_tasks(bool sig_only) { struct task_struct *g, *p; @@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only) if (nosig_only && should_send_signal(p)) continue; + if (cgroup_frozen(p)) + continue; + thaw_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); @@ -264,4 +152,3 @@ void thaw_processes(void) printk("done.\n"); } -EXPORT_SYMBOL(refrigerator); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 356699a..1e68e4c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) * TASK_TRACED, resume it now. * Requires that irqs be disabled. */ -void ptrace_untrace(struct task_struct *child) +static void ptrace_untrace(struct task_struct *child) { spin_lock(&child->sighand->siglock); if (task_is_traced(child)) { diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ca4bbbe..59236e8 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -54,9 +54,9 @@ #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> -#include <linux/byteorder/swabb.h> #include <linux/cpumask.h> #include <linux/rcupreempt_trace.h> +#include <asm/byteorder.h> /* * PREEMPT_RCU data structures. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 90b5b12..85cb905 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -42,10 +42,10 @@ #include <linux/freezer.h> #include <linux/cpu.h> #include <linux/delay.h> -#include <linux/byteorder/swabb.h> #include <linux/stat.h> #include <linux/srcu.h> #include <linux/slab.h> +#include <asm/byteorder.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " diff --git a/kernel/sched.c b/kernel/sched.c index 6f23059..945a97b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> +#include <trace/sched.h> #include <asm/tlb.h> #include <asm/irq_regs.h> @@ -818,6 +819,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; unsigned int sysctl_sched_shares_ratelimit = 250000; /* + * Inject some fuzzyness into changing the per-cpu group shares + * this avoids remote rq-locks at the expense of fairness. + * default: 4 + */ +unsigned int sysctl_sched_shares_thresh = 4; + +/* * period over which we measure -rt task cpu usage in us. * default: 1s */ @@ -1453,8 +1461,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); * Calculate and set the cpu's group shares. */ static void -__update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) +update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, unsigned long sd_rq_weight) { int boost = 0; unsigned long shares; @@ -1485,19 +1493,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * */ shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); + shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); - /* - * record the actual number of shares, not the boosted amount. - */ - tg->cfs_rq[cpu]->shares = boost ? 0 : shares; - tg->cfs_rq[cpu]->rq_weight = rq_weight; + if (abs(shares - tg->se[cpu]->load.weight) > + sysctl_sched_shares_thresh) { + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - if (shares < MIN_SHARES) - shares = MIN_SHARES; - else if (shares > MAX_SHARES) - shares = MAX_SHARES; + spin_lock_irqsave(&rq->lock, flags); + /* + * record the actual number of shares, not the boosted amount. + */ + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; + tg->cfs_rq[cpu]->rq_weight = rq_weight; - __set_se_shares(tg->se[cpu], shares); + __set_se_shares(tg->se[cpu], shares); + spin_unlock_irqrestore(&rq->lock, flags); + } } /* @@ -1526,14 +1538,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!rq_weight) rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; - for_each_cpu_mask(i, sd->span) { - struct rq *rq = cpu_rq(i); - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __update_group_shares_cpu(tg, i, shares, rq_weight); - spin_unlock_irqrestore(&rq->lock, flags); - } + for_each_cpu_mask(i, sd->span) + update_group_shares_cpu(tg, i, shares, rq_weight); return 0; } @@ -1936,6 +1942,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * just go back and repeat. */ rq = task_rq_lock(p, &flags); + trace_sched_wait_task(rq, p); running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; @@ -2297,9 +2304,7 @@ out_activate: success = 1; out_running: - trace_mark(kernel_sched_wakeup, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + trace_sched_wakeup(rq, p); check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; @@ -2432,9 +2437,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_new(rq, p); inc_nr_running(rq); } - trace_mark(kernel_sched_wakeup_new, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + trace_sched_wakeup_new(rq, p); check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2607,11 +2610,7 @@ context_switch(struct rq *rq, struct task_struct *prev, struct mm_struct *mm, *oldmm; prepare_task_switch(rq, prev, next); - trace_mark(kernel_sched_schedule, - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - prev->pid, next->pid, prev->state, - rq, prev, next); + trace_sched_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -2851,6 +2850,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) || unlikely(!cpu_active(dest_cpu))) goto out; + trace_sched_migrate_task(rq, p, dest_cpu); /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ @@ -4052,23 +4052,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return p->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked in case the task is currently running. + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. */ -unsigned long long task_sched_runtime(struct task_struct *p) +unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; - u64 ns, delta_exec; struct rq *rq; + u64 ns = 0; rq = task_rq_lock(p, &flags); - ns = p->se.sum_exec_runtime; + if (task_current(rq, p)) { + u64 delta_exec; + update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) - ns += delta_exec; + ns = delta_exec; } + task_rq_unlock(rq, &flags); return ns; @@ -4085,6 +4088,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) cputime64_t tmp; p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4109,6 +4113,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) tmp = cputime_to_cputime64(cputime); p->utime = cputime_add(p->utime, cputime); + account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); cpustat->user = cputime64_add(cpustat->user, tmp); @@ -4144,6 +4149,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } p->stime = cputime_add(p->stime, cputime); + account_group_system_time(p, cputime); /* Add system time to cpustat. */ tmp = cputime_to_cputime64(cputime); @@ -4185,6 +4191,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal) if (p == rq->idle) { p->stime = cputime_add(p->stime, steal); + account_group_system_time(p, steal); if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else @@ -4441,12 +4448,8 @@ need_resched_nonpreemptible: if (sched_feat(HRTICK)) hrtick_clear(rq); - /* - * Do the rq-clock update outside the rq lock: - */ - local_irq_disable(); + spin_lock_irq(&rq->lock); update_rq_clock(rq); - spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 18fd171..9573c33 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +static const struct sched_class fair_sched_class; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -334,7 +336,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, #endif /* - * delta *= w / rw + * delta *= P[w / rw] */ static inline unsigned long calc_delta_weight(unsigned long delta, struct sched_entity *se) @@ -348,15 +350,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se) } /* - * delta *= rw / w + * delta /= w */ static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { - for_each_sched_entity(se) { - delta = calc_delta_mine(delta, - cfs_rq_of(se)->load.weight, &se->load); - } + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; } @@ -386,26 +386,26 @@ static u64 __sched_period(unsigned long nr_running) * We calculate the wall-time slice from the period by taking a part * proportional to the weight. * - * s = p*w/rw + * s = p*P[w/rw] */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); + unsigned long nr_running = cfs_rq->nr_running; + + if (unlikely(!se->on_rq)) + nr_running++; + + return calc_delta_weight(__sched_period(nr_running), se); } /* * We calculate the vruntime slice of a to be inserted task * - * vs = s*rw/w = p + * vs = s/w */ -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long nr_running = cfs_rq->nr_running; - - if (!se->on_rq) - nr_running++; - - return __sched_period(nr_running); + return calc_delta_fair(sched_slice(cfs_rq, se), se); } /* @@ -449,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); } } @@ -627,7 +628,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) * stays open at the end. */ if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice_add(cfs_rq, se); + vruntime += sched_vslice(cfs_rq, se); if (!initial) { /* sleeps upto a single latency don't count. */ @@ -747,7 +748,7 @@ pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) struct rq *rq = rq_of(cfs_rq); u64 pair_slice = rq->clock - cfs_rq->pair_start; - if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { + if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) { cfs_rq->pair_start = rq->clock; return se; } @@ -848,11 +849,31 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) hrtick_start(rq, delta); } } + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + if (curr->sched_class != &fair_sched_class) + return; + + if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) + hrtick_start_fair(rq, curr); +} #else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { } + +static inline void hrtick_update(struct rq *rq) +{ +} #endif /* @@ -873,7 +894,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) wakeup = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -895,7 +916,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) sleep = 1; } - hrtick_start_fair(rq, rq->curr); + hrtick_update(rq); } /* @@ -1001,8 +1022,6 @@ static inline int wake_idle(int cpu, struct task_struct *p) #ifdef CONFIG_SMP -static const struct sched_class fair_sched_class; - #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 7c9e8f4..fda0162 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -5,7 +5,7 @@ SCHED_FEAT(START_DEBIT, 1) SCHED_FEAT(AFFINE_WAKEUPS, 1) SCHED_FEAT(CACHE_HOT_BUDDY, 1) SCHED_FEAT(SYNC_WAKEUPS, 1) -SCHED_FEAT(HRTICK, 1) +SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index cdf5740..b446dc8 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -526,6 +526,8 @@ static void update_curr_rt(struct rq *rq) schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); @@ -1458,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p) p->rt.timeout++; next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); if (p->rt.timeout > next) - p->it_sched_expires = p->se.sum_exec_runtime; + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; } } diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 8385d43..2df9d29 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -9,7 +9,7 @@ static int show_schedstat(struct seq_file *seq, void *v) { int cpu; - int mask_len = NR_CPUS/32 * 9; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; char *mask_str = kmalloc(mask_len, GFP_KERNEL); if (mask_str == NULL) @@ -270,3 +270,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) #define sched_info_switch(t, next) do { } while (0) #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ +/* + * The following are functions that support scheduler-internal time accounting. + * These functions are generally called at the timer tick. None of this depends + * on CONFIG_SCHEDSTATS. + */ + +/** + * account_group_user_time - Maintain utime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the utime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the utime field there. + */ +static inline void account_group_user_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->utime = cputime_add(times->utime, cputime); + put_cpu_no_resched(); + } +} + +/** + * account_group_system_time - Maintain stime for a thread group. + * + * @tsk: Pointer to task structure. + * @cputime: Time value by which to increment the stime field of the + * thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the stime field there. + */ +static inline void account_group_system_time(struct task_struct *tsk, + cputime_t cputime) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->stime = cputime_add(times->stime, cputime); + put_cpu_no_resched(); + } +} + +/** + * account_group_exec_runtime - Maintain exec runtime for a thread group. + * + * @tsk: Pointer to task structure. + * @ns: Time value by which to increment the sum_exec_runtime field + * of the thread_group_cputime structure. + * + * If thread group time is being maintained, get the structure for the + * running CPU and update the sum_exec_runtime field there. + */ +static inline void account_group_exec_runtime(struct task_struct *tsk, + unsigned long long ns) +{ + struct signal_struct *sig; + + sig = tsk->signal; + if (unlikely(!sig)) + return; + if (sig->cputime.totals) { + struct task_cputime *times; + + times = per_cpu_ptr(sig->cputime.totals, get_cpu()); + times->sum_exec_runtime += ns; + put_cpu_no_resched(); + } +} diff --git a/kernel/signal.c b/kernel/signal.c index e661b01..105217d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,6 +27,7 @@ #include <linux/freezer.h> #include <linux/pid_namespace.h> #include <linux/nsproxy.h> +#include <trace/sched.h> #include <asm/param.h> #include <asm/uaccess.h> @@ -803,6 +804,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; + trace_sched_signal_send(sig, t); + assert_spin_locked(&t->sighand->siglock); if (!prepare_signal(sig, t)) return 0; @@ -1338,6 +1341,7 @@ int do_notify_parent(struct task_struct *tsk, int sig) struct siginfo info; unsigned long flags; struct sighand_struct *psig; + struct task_cputime cputime; int ret = sig; BUG_ON(sig == -1); @@ -1368,10 +1372,9 @@ int do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = tsk->uid; - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + thread_group_cputime(tsk, &cputime); + info.si_utime = cputime_to_jiffies(cputime.utime); + info.si_stime = cputime_to_jiffies(cputime.stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/softirq.c b/kernel/softirq.c index 37d67aa..7110dae 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,6 +6,8 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. */ #include <linux/module.h> @@ -265,16 +267,12 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { -#ifdef CONFIG_NO_HZ int cpu = smp_processor_id(); + if (idle_cpu(cpu) && !in_interrupt()) - tick_nohz_stop_idle(cpu); -#endif + tick_check_idle(cpu); + __irq_enter(); -#ifdef CONFIG_NO_HZ - if (idle_cpu(cpu)) - tick_nohz_update_jiffies(); -#endif } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -474,17 +472,144 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. */ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. + */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { + int i; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } diff --git a/kernel/sys.c b/kernel/sys.c index 0bc8fa3..53879cd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -853,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid) return old_fsgid; } +void do_sys_times(struct tms *tms) +{ + struct task_cputime cputime; + cputime_t cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_cputime(current, &cputime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(cputime.utime); + tms->tms_stime = cputime_to_clock_t(cputime.stime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + asmlinkage long sys_times(struct tms __user * tbuf) { - /* - * In the SMP world we might just be unlucky and have one of - * the times increment as we use it. Since the value is an - * atomically safe type this is just fine. Conceptually its - * as if the syscall took an instant longer to occur. - */ if (tbuf) { struct tms tmp; - struct task_struct *tsk = current; - struct task_struct *t; - cputime_t utime, stime, cutime, cstime; - - spin_lock_irq(&tsk->sighand->siglock); - utime = tsk->signal->utime; - stime = tsk->signal->stime; - t = tsk; - do { - utime = cputime_add(utime, t->utime); - stime = cputime_add(stime, t->stime); - t = next_thread(t); - } while (t != tsk); - - cutime = tsk->signal->cutime; - cstime = tsk->signal->cstime; - spin_unlock_irq(&tsk->sighand->siglock); - - tmp.tms_utime = cputime_to_clock_t(utime); - tmp.tms_stime = cputime_to_clock_t(stime); - tmp.tms_cutime = cputime_to_clock_t(cutime); - tmp.tms_cstime = cputime_to_clock_t(cstime); + + do_sys_times(&tmp); if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) return -EFAULT; } @@ -1449,7 +1439,6 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) { struct rlimit new_rlim, *old_rlim; - unsigned long it_prof_secs; int retval; if (resource >= RLIM_NLIMITS) @@ -1503,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) if (new_rlim.rlim_cur == RLIM_INFINITY) goto out; - it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); - if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) { - unsigned long rlim_cur = new_rlim.rlim_cur; - cputime_t cputime; - - cputime = secs_to_cputime(rlim_cur); - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); - } + update_rlimit_cpu(new_rlim.rlim_cur); out: return 0; } @@ -1552,11 +1530,8 @@ out: * */ -static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, - cputime_t *utimep, cputime_t *stimep) +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) { - *utimep = cputime_add(*utimep, t->utime); - *stimep = cputime_add(*stimep, t->stime); r->ru_nvcsw += t->nvcsw; r->ru_nivcsw += t->nivcsw; r->ru_minflt += t->min_flt; @@ -1570,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; + struct task_cputime cputime; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; if (who == RUSAGE_THREAD) { - accumulate_thread_rusage(p, r, &utime, &stime); + accumulate_thread_rusage(p, r); goto out; } @@ -1598,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) break; case RUSAGE_SELF: - utime = cputime_add(utime, p->signal->utime); - stime = cputime_add(stime, p->signal->stime); + thread_group_cputime(p, &cputime); + utime = cputime_add(utime, cputime.utime); + stime = cputime_add(stime, cputime.stime); r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; @@ -1608,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_oublock += p->signal->oublock; t = p; do { - accumulate_thread_rusage(t, r, &utime, &stime); + accumulate_thread_rusage(t, r); t = next_thread(t); } while (t != p); break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e..a13bd4d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -276,6 +276,16 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, + .procname = "sched_shares_thresh", + .data = &sysctl_sched_shares_thresh, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = CTL_UNNUMBERED, .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, .maxlen = sizeof(unsigned int), @@ -833,6 +843,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 093d4ac..9ed2eec 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; + /* save mult_orig on registration */ + c->mult_orig = c->mult; + spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 4c256fd..1ca9955 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = { .read = jiffies_read, .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT, .shift = JIFFIES_SHIFT, }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 1ad46f3..1a20715 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -10,13 +10,13 @@ #include <linux/mm.h> #include <linux/time.h> -#include <linux/timer.h> #include <linux/timex.h> #include <linux/jiffies.h> #include <linux/hrtimer.h> #include <linux/capability.h> #include <linux/math64.h> #include <linux/clocksource.h> +#include <linux/workqueue.h> #include <asm/timex.h> /* @@ -218,11 +218,11 @@ void second_overflow(void) /* Disable the cmos update - used by virtualization and embedded */ int no_sync_cmos_clock __read_mostly; -static void sync_cmos_clock(unsigned long dummy); +static void sync_cmos_clock(struct work_struct *work); -static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); +static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); -static void sync_cmos_clock(unsigned long dummy) +static void sync_cmos_clock(struct work_struct *work) { struct timespec now, next; int fail = 1; @@ -258,13 +258,13 @@ static void sync_cmos_clock(unsigned long dummy) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); + schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } static void notify_cmos_timer(void) { if (!no_sync_cmos_clock) - mod_timer(&sync_cmos_timer, jiffies + 1); + schedule_delayed_work(&sync_cmos_work, 0); } #else @@ -277,38 +277,50 @@ static inline void notify_cmos_timer(void) { } int do_adjtimex(struct timex *txc) { struct timespec ts; - long save_adjust, sec; int result; - /* In order to modify anything, you gotta be super-user! */ - if (txc->modes && !capable(CAP_SYS_TIME)) - return -EPERM; - - /* Now we validate the data before disabling interrupts */ - - if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ - if (txc->modes & ~ADJ_OFFSET_SS_READ) + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* if the quartz is off by more than 10% something is VERY wrong! */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); } - /* if the quartz is off by more than 10% something is VERY wrong ! */ - if (txc->modes & ADJ_TICK) - if (txc->tick < 900000/USER_HZ || - txc->tick > 1100000/USER_HZ) - return -EINVAL; - - if (time_state != TIME_OK && txc->modes & ADJ_STATUS) - hrtimer_cancel(&leap_timer); getnstimeofday(&ts); write_seqlock_irq(&xtime_lock); - /* Save for later - semantics of adjtime is to return old value */ - save_adjust = time_adjust; - /* If there are input parameters, then process them */ + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + goto adj_done; + } if (txc->modes) { + long sec; + if (txc->modes & ADJ_STATUS) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { @@ -375,13 +387,8 @@ int do_adjtimex(struct timex *txc) if (txc->modes & ADJ_TAI && txc->constant > 0) time_tai = txc->constant; - if (txc->modes & ADJ_OFFSET) { - if (txc->modes == ADJ_OFFSET_SINGLESHOT) - /* adjtime() is independent from ntp_adjtime() */ - time_adjust = txc->offset; - else - ntp_update_offset(txc->offset); - } + if (txc->modes & ADJ_OFFSET) + ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; @@ -389,22 +396,18 @@ int do_adjtimex(struct timex *txc) ntp_update_frequency(); } + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + +adj_done: result = time_state; /* mostly `TIME_OK' */ if (time_status & (STA_UNSYNC|STA_CLOCKERR)) result = TIME_ERROR; - if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || - (txc->modes == ADJ_OFFSET_SS_READ)) - txc->offset = save_adjust; - else { - txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, - NTP_SCALE_SHIFT); - if (!(time_status & STA_NANO)) - txc->offset /= NSEC_PER_USEC; - } - txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) * - (s64)PPM_SCALE_INV, - NTP_SCALE_SHIFT); + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index cb01cd8..f98a1b7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -384,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) } /* + * Called from irq_enter() when idle was interrupted to reenable the + * per cpu device. + */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + +/* * Handle oneshot mode broadcasting */ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 4692487..b1c05bf 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,6 +36,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { @@ -45,6 +46,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } # endif /* !BROADCAST */ #else /* !ONESHOT */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b711ffc..0581c11 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -155,7 +155,7 @@ void tick_nohz_update_jiffies(void) touch_softlockup_watchdog(); } -void tick_nohz_stop_idle(int cpu) +static void tick_nohz_stop_idle(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); @@ -377,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + ts->sched_timer.expires = ts->idle_tick; + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, + ts->sched_timer.expires, + HRTIMER_MODE_ABS); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event(ts->sched_timer.expires, 0)) + break; + } + /* Update jiffies and reread time */ + tick_do_update_jiffies64(now); + now = ktime_get(); + } +} + /** * tick_nohz_restart_sched_tick - restart the idle tick from the idle task * @@ -430,28 +456,7 @@ void tick_nohz_restart_sched_tick(void) */ ts->tick_stopped = 0; ts->idle_exittime = now; - hrtimer_cancel(&ts->sched_timer); - ts->sched_timer.expires = ts->idle_tick; - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, - ts->sched_timer.expires, - HRTIMER_MODE_ABS); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event(ts->sched_timer.expires, 0)) - break; - } - /* Update jiffies and reread time */ - tick_do_update_jiffies64(now); - now = ktime_get(); - } + tick_nohz_restart(ts, now); local_irq_enable(); } @@ -503,10 +508,6 @@ static void tick_nohz_handler(struct clock_event_device *dev) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return; - while (tick_nohz_reprogram(ts, now)) { now = ktime_get(); tick_do_update_jiffies64(now); @@ -552,6 +553,27 @@ static void tick_nohz_switch_to_nohz(void) smp_processor_id()); } +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!ts->tick_stopped) + return; + + tick_nohz_restart(ts, ktime_get()); +} + #else static inline void tick_nohz_switch_to_nohz(void) { } @@ -559,6 +581,19 @@ static inline void tick_nohz_switch_to_nohz(void) { } #endif /* NO_HZ */ /* + * Called from irq_enter to notify about the possible interruption of idle() + */ +void tick_check_idle(int cpu) +{ + tick_check_oneshot_broadcast(cpu); +#ifdef CONFIG_NO_HZ + tick_nohz_stop_idle(cpu); + tick_nohz_update_jiffies(); + tick_nohz_kick_tick(cpu); +#endif +} + +/* * High resolution timer specific code */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -611,10 +646,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) profile_tick(CPU_PROFILING); } - /* Do not restart, when we are in the idle loop */ - if (ts->tick_stopped) - return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); return HRTIMER_RESTART; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e91c29f..e7acfb4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -58,27 +58,26 @@ struct clocksource *clock; #ifdef CONFIG_GENERIC_TIME /** - * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * clocksource_forward_now - update clock to the current time * - * private function, must hold xtime_lock lock when being - * called. Returns the number of nanoseconds since the - * last call to update_wall_time() (adjusted by NTP scaling) + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. */ -static inline s64 __get_nsec_offset(void) +static void clocksource_forward_now(void) { cycle_t cycle_now, cycle_delta; - s64 ns_offset; + s64 nsec; - /* read clocksource: */ cycle_now = clocksource_read(clock); - - /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; - /* convert to nanoseconds: */ - ns_offset = cyc2ns(clock, cycle_delta); + nsec = cyc2ns(clock, cycle_delta); + timespec_add_ns(&xtime, nsec); - return ns_offset; + nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + clock->raw_time.tv_nsec += nsec; } /** @@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void) */ void getnstimeofday(struct timespec *ts) { + cycle_t cycle_now, cycle_delta; unsigned long seq; s64 nsecs; @@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; - nsecs = __get_nsec_offset(); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = cyc2ns(clock, cycle_delta); } while (read_seqretry(&xtime_lock, seq)); @@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday); */ int do_settimeofday(struct timespec *tv) { + struct timespec ts_delta; unsigned long flags; - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; write_seqlock_irqsave(&xtime_lock, flags); - nsec -= __get_nsec_offset(); + clocksource_forward_now(); + + ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + xtime = *tv; - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); update_xtime_cache(0); clock->error = 0; @@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday); static void change_clocksource(void) { struct clocksource *new; - cycle_t now; - u64 nsec; new = clocksource_get_next(); if (clock == new) return; - new->cycle_last = 0; - now = clocksource_read(new); - nsec = __get_nsec_offset(); - timespec_add_ns(&xtime, nsec); + clocksource_forward_now(); - clock = new; - clock->cycle_last = now; + new->raw_time = clock->raw_time; + clock = new; + clock->cycle_last = 0; + clock->cycle_last = clocksource_read(new); clock->error = 0; clock->xtime_nsec = 0; clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); @@ -200,11 +205,44 @@ static void change_clocksource(void) */ } #else +static inline void clocksource_forward_now(void) { } static inline void change_clocksource(void) { } -static inline s64 __get_nsec_offset(void) { return 0; } #endif /** + * getrawmonotonic - Returns the raw monotonic time in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the raw monotonic time (completely un-modified by ntp) + */ +void getrawmonotonic(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + cycle_t cycle_now, cycle_delta; + + do { + seq = read_seqbegin(&xtime_lock); + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; + + *ts = clock->raw_time; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} +EXPORT_SYMBOL(getrawmonotonic); + + +/** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres */ int timekeeping_valid_for_hres(void) @@ -265,8 +303,6 @@ void __init timekeeping_init(void) static int timekeeping_suspended; /* time in seconds when suspend began */ static unsigned long timekeeping_suspend_time; -/* xtime offset when we went into suspend */ -static s64 timekeeping_suspend_nsecs; /** * timekeeping_resume - Resumes the generic timekeeping subsystem. @@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic.tv_sec -= sleep_length; total_sleep_time += sleep_length; } - /* Make sure that we have the correct xtime reference */ - timespec_add_ns(&xtime, timekeeping_suspend_nsecs); update_xtime_cache(0); /* re-base the last cycle value */ clock->cycle_last = 0; @@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) timekeeping_suspend_time = read_persistent_clock(); write_seqlock_irqsave(&xtime_lock, flags); - /* Get the current xtime offset */ - timekeeping_suspend_nsecs = __get_nsec_offset(); + clocksource_forward_now(); timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -454,23 +487,29 @@ void update_wall_time(void) #else offset = clock->cycle_interval; #endif - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. */ while (offset >= clock->cycle_interval) { /* accumulate one interval */ - clock->xtime_nsec += clock->xtime_interval; - clock->cycle_last += clock->cycle_interval; offset -= clock->cycle_interval; + clock->cycle_last += clock->cycle_interval; + clock->xtime_nsec += clock->xtime_interval; if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } + clock->raw_time.tv_nsec += clock->raw_interval; + if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { + clock->raw_time.tv_nsec -= NSEC_PER_SEC; + clock->raw_time.tv_sec++; + } + /* accumulate error between NTP and clock interval */ clock->error += tick_length; clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); @@ -479,9 +518,12 @@ void update_wall_time(void) /* correct the clock when NTP error is too big */ clocksource_adjust(offset); - /* store full nanoseconds into xtime */ - xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; + /* store full nanoseconds into xtime after rounding it up and + * add the remainder to the error difference. + */ + xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); update_xtime_cache(cyc2ns(clock, offset)); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index a40e20f..f642691 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym) } static void -print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) { #ifdef CONFIG_TIMER_STATS char tmp[TASK_COMM_LEN + 1]; #endif SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, timer); + print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); SEQ_printf(m, ", S:%02lx", timer->state); @@ -99,7 +100,7 @@ next_one: tmp = *timer; spin_unlock_irqrestore(&base->cpu_base->lock, flags); - print_timer(m, &tmp, i, now); + print_timer(m, timer, &tmp, i, now); next++; goto next_one; } @@ -109,6 +110,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %Lu nsecs\n", @@ -183,12 +185,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) #ifdef CONFIG_GENERIC_CLOCKEVENTS static void -print_tickdevice(struct seq_file *m, struct tick_device *td) +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; SEQ_printf(m, "\n"); SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); SEQ_printf(m, "Clock Event Device: "); if (!dev) { @@ -222,7 +228,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) int cpu; #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - print_tickdevice(m, tick_get_broadcast_device()); + print_tickdevice(m, tick_get_broadcast_device(), -1); SEQ_printf(m, "tick_broadcast_mask: %08lx\n", tick_get_broadcast_mask()->bits[0]); #ifdef CONFIG_TICK_ONESHOT @@ -232,7 +238,7 @@ static void timer_list_show_tickdevices(struct seq_file *m) SEQ_printf(m, "\n"); #endif for_each_online_cpu(cpu) - print_tickdevice(m, tick_get_device(cpu)); + print_tickdevice(m, tick_get_device(cpu), cpu); SEQ_printf(m, "\n"); } #else @@ -244,7 +250,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.3\n"); + SEQ_printf(m, "Timer List Version: v0.4\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); diff --git a/kernel/timer.c b/kernel/timer.c index 510fe69..56becf3 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1436,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu) BUG_ON(cpu_online(cpu)); old_base = per_cpu(tvec_bases, cpu); new_base = get_cpu_var(tvec_bases); - - local_irq_disable(); - spin_lock(&new_base->lock); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + spin_lock_irq(&new_base->lock); spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); BUG_ON(old_base->running_timer); @@ -1453,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu) } spin_unlock(&old_base->lock); - spin_unlock(&new_base->lock); - local_irq_enable(); + spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } #endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 263e9e6..1cb3e1f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,23 +1,37 @@ # # Architectures that offer an FTRACE implementation should select HAVE_FTRACE: # + +config NOP_TRACER + bool + config HAVE_FTRACE bool + select NOP_TRACER config HAVE_DYNAMIC_FTRACE bool +config HAVE_FTRACE_MCOUNT_RECORD + bool + config TRACER_MAX_TRACE bool +config RING_BUFFER + bool + config TRACING bool select DEBUG_FS + select RING_BUFFER select STACKTRACE + select TRACEPOINTS config FTRACE bool "Kernel Function Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select FRAME_POINTER select TRACING select CONTEXT_SWITCH_TRACER @@ -36,6 +50,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACE_IRQFLAGS select TRACING select TRACER_MAX_TRACE @@ -59,6 +74,7 @@ config PREEMPT_TRACER depends on GENERIC_TIME depends on PREEMPT depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select TRACER_MAX_TRACE help @@ -86,6 +102,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE @@ -96,16 +113,56 @@ config SCHED_TRACER config CONTEXT_SWITCH_TRACER bool "Trace process context switches" depends on HAVE_FTRACE + depends on DEBUG_KERNEL select TRACING select MARKERS help This tracer gets called from the context switch and records all switching of tasks. +config BOOT_TRACER + bool "Trace boot initcalls" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select TRACING + help + This tracer helps developers to optimize boot times: it records + the timings of the initcalls and traces key events and the identity + of tasks that can cause boot delays, such as context-switches. + + Its aim is to be parsed by the /scripts/bootgraph.pl tool to + produce pretty graphics about boot inefficiencies, giving a visual + representation of the delays during initcalls - but the raw + /debug/tracing/trace text output is readable too. + + ( Note that tracing self tests can't be enabled if this tracer is + selected, because the self-tests are an initcall as well and that + would invalidate the boot trace. ) + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FTRACE + depends on DEBUG_KERNEL + select FTRACE + select STACKTRACE + help + This special tracer records the maximum stack footprint of the + kernel and displays it in debugfs/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. Because this logic has to execute in every + kernel function, all the time, this option can slow down the + kernel measurably and is generally intended for kernel + developers only. + + Say N if unsure. + config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE depends on HAVE_DYNAMIC_FTRACE + depends on DEBUG_KERNEL default y help This option will modify all the calls to ftrace dynamically @@ -121,12 +178,17 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + config FTRACE_SELFTEST bool config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 71d17de..a85dfba 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o endif obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o @@ -19,6 +20,9 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_BOOT_TRACER) += trace_boot.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6e3af3..4dda4f6 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -81,7 +81,7 @@ void clear_ftrace_function(void) static int __register_ftrace_function(struct ftrace_ops *ops) { - /* Should never be called by interrupts */ + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); ops->next = ftrace_list; @@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) struct ftrace_ops **p; int ret = 0; + /* should not be called from interrupt context */ spin_lock(&ftrace_lock); /* @@ -153,6 +154,30 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +/* + * The hash lock is only needed when the recording of the mcount + * callers are dynamic. That is, by the caller themselves and + * not recorded via the compilation. + */ +static DEFINE_SPINLOCK(ftrace_hash_lock); +#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags) +#define ftrace_hash_unlock(flags) \ + spin_unlock_irqrestore(&ftrace_hash_lock, flags) +#else +/* This is protected via the ftrace_lock with MCOUNT_RECORD. */ +#define ftrace_hash_lock(flags) do { (void)(flags); } while (0) +#define ftrace_hash_unlock(flags) do { } while(0) +#endif + +/* + * Since MCOUNT_ADDR may point to mcount itself, we do not want + * to get it confused by reading a reference in the code as we + * are parsing on objcopy output of text. Use a variable for + * it instead. + */ +static unsigned long mcount_addr = MCOUNT_ADDR; + static struct task_struct *ftraced_task; enum { @@ -171,7 +196,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); -static DEFINE_SPINLOCK(ftrace_shutdown_lock); static DEFINE_MUTEX(ftraced_lock); static DEFINE_MUTEX(ftrace_regex_lock); @@ -294,13 +318,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node) static void ftrace_free_rec(struct dyn_ftrace *rec) { - /* no locking, only called from kstop_machine */ - rec->ip = (unsigned long)ftrace_free_records; ftrace_free_records = rec; rec->flags |= FTRACE_FL_FREE; } +void ftrace_release(void *start, unsigned long size) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = s + size; + int i; + + if (ftrace_disabled || !start) + return; + + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + if ((rec->ip >= s) && (rec->ip < e)) + ftrace_free_rec(rec); + } + } + spin_unlock(&ftrace_lock); + +} + static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -338,7 +386,6 @@ ftrace_record_ip(unsigned long ip) unsigned long flags; unsigned long key; int resched; - int atomic; int cpu; if (!ftrace_enabled || ftrace_disabled) @@ -368,9 +415,7 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out; - atomic = irqs_disabled(); - - spin_lock_irqsave(&ftrace_shutdown_lock, flags); + ftrace_hash_lock(flags); /* This ip may have hit the hash before the lock */ if (ftrace_ip_in_hash(ip, key)) @@ -387,7 +432,7 @@ ftrace_record_ip(unsigned long ip) ftraced_trigger = 1; out_unlock: - spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + ftrace_hash_unlock(flags); out: per_cpu(ftrace_shutdown_disable_cpu, cpu)--; @@ -531,6 +576,16 @@ static void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + static int ftrace_code_disable(struct dyn_ftrace *rec) { @@ -541,10 +596,27 @@ ftrace_code_disable(struct dyn_ftrace *rec) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, MCOUNT_ADDR); + call = ftrace_call_replace(ip, mcount_addr); failed = ftrace_modify_code(ip, call, nop); if (failed) { + switch (failed) { + case 1: + WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case 2: + WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" expected: ", call); + print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" replace: ", nop); + printk(KERN_CONT "\n"); + break; + } + rec->flags |= FTRACE_FL_FAILED; return 0; } @@ -792,47 +864,7 @@ static int ftrace_update_code(void) return 1; } -static int ftraced(void *ignore) -{ - unsigned long usecs; - - while (!kthread_should_stop()) { - - set_current_state(TASK_INTERRUPTIBLE); - - /* check once a second */ - schedule_timeout(HZ); - - if (unlikely(ftrace_disabled)) - continue; - - mutex_lock(&ftrace_sysctl_lock); - mutex_lock(&ftraced_lock); - if (!ftraced_suspend && !ftraced_stop && - ftrace_update_code()) { - usecs = nsecs_to_usecs(ftrace_update_time); - if (ftrace_update_tot_cnt > 100000) { - ftrace_update_tot_cnt = 0; - pr_info("hm, dftrace overflow: %lu change%s" - " (%lu total) in %lu usec%s\n", - ftrace_update_cnt, - ftrace_update_cnt != 1 ? "s" : "", - ftrace_update_tot_cnt, - usecs, usecs != 1 ? "s" : ""); - ftrace_disabled = 1; - WARN_ON_ONCE(1); - } - } - mutex_unlock(&ftraced_lock); - mutex_unlock(&ftrace_sysctl_lock); - - ftrace_shutdown_replenish(); - } - __set_current_state(TASK_RUNNING); - return 0; -} - -static int __init ftrace_dyn_table_alloc(void) +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) { struct ftrace_page *pg; int cnt; @@ -859,7 +891,9 @@ static int __init ftrace_dyn_table_alloc(void) pg = ftrace_pages = ftrace_pages_start; - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld hash entries in %d pages\n", + num_to_init, cnt); for (i = 0; i < cnt; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -901,6 +935,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); retry: if (iter->idx >= iter->pg->index) { if (iter->pg->next) { @@ -910,15 +946,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } else { rec = &iter->pg->records[iter->idx++]; - if ((!(iter->flags & FTRACE_ITER_FAILURES) && + if ((rec->flags & FTRACE_FL_FREE) || + + (!(iter->flags & FTRACE_ITER_FAILURES) && (rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_FAILURES) && - (!(rec->flags & FTRACE_FL_FAILED) || - (rec->flags & FTRACE_FL_FREE))) || - - ((iter->flags & FTRACE_ITER_FILTER) && - !(rec->flags & FTRACE_FL_FILTER)) || + !(rec->flags & FTRACE_FL_FAILED)) || ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { @@ -926,6 +960,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) goto retry; } } + spin_unlock(&ftrace_lock); iter->pos = *pos; @@ -1039,8 +1074,8 @@ static void ftrace_filter_reset(int enable) unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; unsigned i; - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 0; pg = ftrace_pages_start; @@ -1053,7 +1088,7 @@ static void ftrace_filter_reset(int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static int @@ -1165,8 +1200,8 @@ ftrace_match(unsigned char *buff, int len, int enable) } } - /* keep kstop machine from running */ - preempt_disable(); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); if (enable) ftrace_filtered = 1; pg = ftrace_pages_start; @@ -1203,7 +1238,7 @@ ftrace_match(unsigned char *buff, int len, int enable) } pg = pg->next; } - preempt_enable(); + spin_unlock(&ftrace_lock); } static ssize_t @@ -1556,6 +1591,114 @@ static __init int ftrace_init_debugfs(void) fs_initcall(ftrace_init_debugfs); +#ifdef CONFIG_FTRACE_MCOUNT_RECORD +static int ftrace_convert_nops(unsigned long *start, + unsigned long *end) +{ + unsigned long *p; + unsigned long addr; + unsigned long flags; + + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); + /* should not be called from interrupt context */ + spin_lock(&ftrace_lock); + ftrace_record_ip(addr); + spin_unlock(&ftrace_lock); + ftrace_shutdown_replenish(); + } + + /* p is ignored */ + local_irq_save(flags); + __ftrace_update_code(p); + local_irq_restore(flags); + + return 0; +} + +void ftrace_init_module(unsigned long *start, unsigned long *end) +{ + if (ftrace_disabled || start == end) + return; + ftrace_convert_nops(start, end); +} + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ + unsigned long count, addr, flags; + int ret; + + /* Keep the ftrace pointer to the stub */ + addr = (unsigned long)ftrace_stub; + + local_irq_save(flags); + ftrace_dyn_arch_init(&addr); + local_irq_restore(flags); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + + ret = ftrace_dyn_table_alloc(count); + if (ret) + goto failed; + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_convert_nops(__start_mcount_loc, + __stop_mcount_loc); + + return; + failed: + ftrace_disabled = 1; +} +#else /* CONFIG_FTRACE_MCOUNT_RECORD */ +static int ftraced(void *ignore) +{ + unsigned long usecs; + + while (!kthread_should_stop()) { + + set_current_state(TASK_INTERRUPTIBLE); + + /* check once a second */ + schedule_timeout(HZ); + + if (unlikely(ftrace_disabled)) + continue; + + mutex_lock(&ftrace_sysctl_lock); + mutex_lock(&ftraced_lock); + if (!ftraced_suspend && !ftraced_stop && + ftrace_update_code()) { + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? "s" : ""); + ftrace_disabled = 1; + WARN_ON_ONCE(1); + } + } + mutex_unlock(&ftraced_lock); + mutex_unlock(&ftrace_sysctl_lock); + + ftrace_shutdown_replenish(); + } + __set_current_state(TASK_RUNNING); + return 0; +} + static int __init ftrace_dynamic_init(void) { struct task_struct *p; @@ -1572,7 +1715,7 @@ static int __init ftrace_dynamic_init(void) goto failed; } - ret = ftrace_dyn_table_alloc(); + ret = ftrace_dyn_table_alloc(NR_TO_INIT); if (ret) goto failed; @@ -1593,6 +1736,8 @@ static int __init ftrace_dynamic_init(void) } core_initcall(ftrace_dynamic_init); +#endif /* CONFIG_FTRACE_MCOUNT_RECORD */ + #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 0000000..94af1fe --- /dev/null +++ b/kernel/trace/ring_buffer.c @@ -0,0 +1,2014 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + */ +#include <linux/ring_buffer.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/mutex.h> +#include <linux/sched.h> /* used for sched_clock() (for now) */ +#include <linux/init.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/fs.h> + +/* Up this if you want to test the TIME_EXTENTS and normalization */ +#define DEBUG_SHIFT 0 + +/* FIXME!!! */ +u64 ring_buffer_time_stamp(int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return sched_clock() << DEBUG_SHIFT; +} + +void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) +{ + /* Just stupid testing the normalize function and deltas */ + *ts >>= DEBUG_SHIFT; +} + +#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) +#define RB_ALIGNMENT_SHIFT 2 +#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) +#define RB_MAX_SMALL_DATA 28 + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +/* inline for ring buffer fast paths */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + /* undefined */ + return -1; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + if (event->len) + length = event->len << RB_ALIGNMENT_SHIFT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; + default: + BUG(); + } + /* not hit */ + return 0; +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + return rb_event_length(event); +} + +/* inline for ring buffer fast paths */ +static inline void * +rb_event_data(struct ring_buffer_event *event) +{ + BUG_ON(event->type != RINGBUF_TYPE_DATA); + /* If length is in len field, then array[0] has the data */ + if (event->len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu_mask(cpu, buffer->cpumask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* + * This hack stolen from mm/slob.c. + * We can store per page timing information in the page frame of the page. + * Thanks to Peter Zijlstra for suggesting this idea. + */ +struct buffer_page { + u64 time_stamp; /* page time stamp */ + local_t write; /* index for next write */ + local_t commit; /* write commited index */ + unsigned read; /* index for next read */ + struct list_head list; /* list of free pages */ + void *page; /* Actual data page */ +}; + +/* + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +static inline void free_buffer_page(struct buffer_page *bpage) +{ + if (bpage->page) + __free_page(bpage->page); + kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. + */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; + return 0; +} + +#define BUF_PAGE_SIZE PAGE_SIZE + +/* + * head_page == tail_page && head == tail then buffer is empty. + */ +struct ring_buffer_per_cpu { + int cpu; + struct ring_buffer *buffer; + spinlock_t lock; + struct lock_class_key lock_key; + struct list_head pages; + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* commited pages */ + struct buffer_page *reader_page; + unsigned long overrun; + unsigned long entries; + u64 write_stamp; + u64 read_stamp; + atomic_t record_disabled; +}; + +struct ring_buffer { + unsigned long size; + unsigned pages; + unsigned flags; + int cpus; + cpumask_t cpumask; + atomic_t record_disabled; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + struct buffer_page *head_page; + u64 read_stamp; +}; + +#define RB_WARN_ON(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +#define RB_WARN_ON_RET(buffer, cond) \ + do { \ + if (unlikely(cond)) { \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + return -1; \ + } \ + } while (0) + +#define RB_WARN_ON_ONCE(buffer, cond) \ + do { \ + static int once; \ + if (unlikely(cond) && !once) { \ + once++; \ + atomic_inc(&buffer->record_disabled); \ + WARN_ON(1); \ + } \ + } while (0) + +/** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safty measure we check to make sure the data pages have not + * been corrupted. + */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); + RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); + + list_for_each_entry_safe(page, tmp, head, list) { + RB_WARN_ON_RET(cpu_buffer, + page->list.next->prev != &page->list); + RB_WARN_ON_RET(cpu_buffer, + page->list.prev->next != &page->list); + } + + return 0; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + unsigned long addr; + LIST_HEAD(pages); + unsigned i; + + for (i = 0; i < nr_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page->page = (void *)addr; + } + + list_splice(&pages, head); + + rb_check_pages(cpu_buffer); + + return 0; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + return -ENOMEM; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *page; + unsigned long addr; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->lock); + INIT_LIST_HEAD(&cpu_buffer->pages); + + page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto fail_free_buffer; + + cpu_buffer->reader_page = page; + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto fail_free_reader; + page->page = (void *)addr; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + + ret = rb_allocate_pages(cpu_buffer, buffer->pages); + if (ret < 0) + goto fail_free_reader; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + return cpu_buffer; + + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = &cpu_buffer->pages; + struct buffer_page *page, *tmp; + + list_del_init(&cpu_buffer->reader_page->list); + free_buffer_page(cpu_buffer->reader_page); + + list_for_each_entry_safe(page, tmp, head, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + kfree(cpu_buffer); +} + +/* + * Causes compile errors if the struct buffer_page gets bigger + * than the struct page. + */ +extern int ring_buffer_page_too_big(void); + +/** + * ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes that is needed. + * @flags: attributes to set for the ring buffer. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + */ +struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +{ + struct ring_buffer *buffer; + int bsize; + int cpu; + + /* Paranoid! Optimizes out when all is well */ + if (sizeof(struct buffer_page) > sizeof(struct page)) + ring_buffer_page_too_big(); + + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + + /* need at least two pages */ + if (buffer->pages == 1) + buffer->pages++; + + buffer->cpumask = cpu_possible_map; + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_buffer; + + for_each_buffer_cpu(buffer, cpu) { + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + } + + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_buffer: + kfree(buffer); + return NULL; +} + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. + */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + + kfree(buffer); +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static void +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(&cpu_buffer->pages)); + p = cpu_buffer->pages.next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + free_buffer_page(page); + } + BUG_ON(list_empty(&cpu_buffer->pages)); + + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); + +} + +static void +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *pages, unsigned nr_pages) +{ + struct buffer_page *page; + struct list_head *p; + unsigned i; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + for (i = 0; i < nr_pages; i++) { + BUG_ON(list_empty(pages)); + p = pages->next; + page = list_entry(p, struct buffer_page, list); + list_del_init(&page->list); + list_add_tail(&page->list, &cpu_buffer->pages); + } + rb_reset_cpu(cpu_buffer); + + rb_check_pages(cpu_buffer); + + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to resize. + * @size: the new size. + * + * The tracer is responsible for making sure that the buffer is + * not being used while changing the size. + * Note: We may be able to change the above requirement by using + * RCU synchronizations. + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns -1 on failure. + */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned nr_pages, rm_pages, new_pages; + struct buffer_page *page, *tmp; + unsigned long buffer_size; + unsigned long addr; + LIST_HEAD(pages); + int i, cpu; + + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size *= BUF_PAGE_SIZE; + buffer_size = buffer->pages * BUF_PAGE_SIZE; + + /* we need a minimum of two pages */ + if (size < BUF_PAGE_SIZE * 2) + size = BUF_PAGE_SIZE * 2; + + if (size == buffer_size) + return size; + + mutex_lock(&buffer->mutex); + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + if (size < buffer_size) { + + /* easy case, just free pages */ + BUG_ON(nr_pages >= buffer->pages); + + rm_pages = buffer->pages - nr_pages; + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_remove_pages(cpu_buffer, rm_pages); + } + goto out; + } + + /* + * This is a bit more difficult. We only want to add pages + * when we can allocate enough for all CPUs. We do this + * by allocating all the pages and storing them on a local + * link list. If we succeed in our allocation, then we + * add these pages to the cpu_buffers. Otherwise we just free + * them all and return -ENOMEM; + */ + BUG_ON(nr_pages <= buffer->pages); + new_pages = nr_pages - buffer->pages; + + for_each_buffer_cpu(buffer, cpu) { + for (i = 0; i < new_pages; i++) { + page = kzalloc_node(ALIGN(sizeof(*page), + cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!page) + goto free_pages; + list_add(&page->list, &pages); + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + page->page = (void *)addr; + } + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_insert_pages(cpu_buffer, &pages, new_pages); + } + + BUG_ON(!list_empty(&pages)); + + out: + buffer->pages = nr_pages; + mutex_unlock(&buffer->mutex); + + return size; + + free_pages: + list_for_each_entry_safe(page, tmp, &pages, list) { + list_del_init(&page->list); + free_buffer_page(page); + } + return -ENOMEM; +} + +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type == RINGBUF_TYPE_PADDING; +} + +static inline void *__rb_page_index(struct buffer_page *page, unsigned index) +{ + return page->page + index; +} + +static inline struct ring_buffer_event * +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader_page->read); +} + +static inline struct ring_buffer_event * +rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->head_page, + cpu_buffer->head_page->read); +} + +static inline struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ + return __rb_page_index(iter->head_page, iter->head); +} + +static inline unsigned rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write); +} + +static inline unsigned rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->commit); +} + +/* Size is determined by what has been commited */ +static inline unsigned rb_page_size(struct buffer_page *bpage) +{ + return rb_page_commit(bpage); +} + +static inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->commit_page); +} + +static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->head_page); +} + +/* + * When the tail hits the head and the buffer is in overwrite mode, + * the head jumps to the next page and all content on the previous + * page is discarded. But before doing so, we update the overrun + * variable of the buffer. + */ +static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + unsigned long head; + + for (head = 0; head < rb_head_size(cpu_buffer); + head += rb_event_length(event)) { + + event = __rb_page_index(cpu_buffer->head_page, head); + BUG_ON(rb_null_event(event)); + /* Only count data entries */ + if (event->type != RINGBUF_TYPE_DATA) + continue; + cpu_buffer->overrun++; + cpu_buffer->entries--; + } +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **page) +{ + struct list_head *p = (*page)->list.next; + + if (p == &cpu_buffer->pages) + p = p->next; + + *page = list_entry(p, struct buffer_page, list); +} + +static inline unsigned +rb_event_index(struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + + return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); +} + +static inline int +rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static inline void +rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + while (cpu_buffer->commit_page->page != (void *)addr) { + RB_WARN_ON(cpu_buffer, + cpu_buffer->commit_page == cpu_buffer->tail_page); + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + } + + /* Now set the commit to the event's index */ + local_set(&cpu_buffer->commit_page->commit, index); +} + +static inline void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + cpu_buffer->commit_page->commit = + cpu_buffer->commit_page->write; + barrier(); + } +} + +static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; + cpu_buffer->reader_page->read = 0; +} + +static inline void rb_inc_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* + * The iterator could be on the reader page (it starts there). + * But the head could have moved, since the reader was + * found. Check for this case and assign the iterator + * to the head page instead of next. + */ + if (iter->head_page == cpu_buffer->reader_page) + iter->head_page = cpu_buffer->head_page; + else + rb_inc_page(cpu_buffer, &iter->head_page); + + iter->read_stamp = iter->head_page->time_stamp; + iter->head = 0; +} + +/** + * ring_buffer_update_event - update event type and data + * @event: the even to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. + */ +static inline void +rb_update_event(struct ring_buffer_event *event, + unsigned type, unsigned length) +{ + event->type = type; + + switch (type) { + + case RINGBUF_TYPE_PADDING: + break; + + case RINGBUF_TYPE_TIME_EXTEND: + event->len = + (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + + case RINGBUF_TYPE_TIME_STAMP: + event->len = + (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + + case RINGBUF_TYPE_DATA: + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA) { + event->len = 0; + event->array[0] = length; + } else + event->len = + (length + (RB_ALIGNMENT-1)) + >> RB_ALIGNMENT_SHIFT; + break; + default: + BUG(); + } +} + +static inline unsigned rb_calculate_event_length(unsigned length) +{ + struct ring_buffer_event event; /* Used only for sizeof array */ + + /* zero length can cause confusions */ + if (!length) + length = 1; + + if (length > RB_MAX_SMALL_DATA) + length += sizeof(event.array[0]); + + length += RB_EVNT_HDR_SIZE; + length = ALIGN(length, RB_ALIGNMENT); + + return length; +} + +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *head_page, *reader_page; + unsigned long tail, write; + struct ring_buffer *buffer = cpu_buffer->buffer; + struct ring_buffer_event *event; + unsigned long flags; + + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) { + struct buffer_page *next_page = tail_page; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + rb_inc_page(cpu_buffer, &next_page); + + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; + + /* we grabbed the lock before incrementing */ + RB_WARN_ON(cpu_buffer, next_page == reader_page); + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == cpu_buffer->commit_page)) { + WARN_ON_ONCE(1); + goto out_unlock; + } + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) { + /* reset write */ + if (tail <= BUF_PAGE_SIZE) + local_set(&tail_page->write, tail); + goto out_unlock; + } + + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + rb_update_overflow(cpu_buffer); + + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(cpu_buffer->cpu); + cpu_buffer->tail_page->time_stamp = *ts; + } + + /* + * The actual tail page has moved forward. + */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + event->type = RINGBUF_TYPE_PADDING; + } + + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + } + + /* We reserved something on the buffer */ + + BUG_ON(write > BUF_PAGE_SIZE); + + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->time_stamp = *ts; + + return event; + + out_unlock: + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + return NULL; +} + +static int +rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, + u64 *ts, u64 *delta) +{ + struct ring_buffer_event *event; + static int once; + int ret; + + if (unlikely(*delta > (1ULL << 59) && !once++)) { + printk(KERN_WARNING "Delta way too big! %llu" + " ts=%llu write stamp = %llu\n", + *delta, *ts, cpu_buffer->write_stamp); + WARN_ON(1); + } + + /* + * The delta is too big, we to add a + * new timestamp. + */ + event = __rb_reserve_next(cpu_buffer, + RINGBUF_TYPE_TIME_EXTEND, + RB_LEN_TIME_EXTEND, + ts); + if (!event) + return -EBUSY; + + if (PTR_ERR(event) == -EAGAIN) + return -EAGAIN; + + /* Only a commited time event can update the write stamp */ + if (rb_is_commit(cpu_buffer, event)) { + /* + * If this is the first on the page, then we need to + * update the page itself, and just put in a zero. + */ + if (rb_event_index(event)) { + event->time_delta = *delta & TS_MASK; + event->array[0] = *delta >> TS_SHIFT; + } else { + cpu_buffer->commit_page->time_stamp = *ts; + event->time_delta = 0; + event->array[0] = 0; + } + cpu_buffer->write_stamp = *ts; + /* let the caller know this was the commit */ + ret = 1; + } else { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; + } + + *delta = 0; + + return ret; +} + +static struct ring_buffer_event * +rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int commit = 0; + + again: + ts = ring_buffer_time_stamp(cpu_buffer->cpu); + + /* + * Only the first commit can update the timestamp. + * Yes there is a race here. If an interrupt comes in + * just after the conditional and it traces too, then it + * will also check the deltas. More than one timestamp may + * also be made. But only the entry that did the actual + * commit will be something other than zero. + */ + if (cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer)) { + + delta = ts - cpu_buffer->write_stamp; + + /* make sure this delta is calculated here */ + barrier(); + + /* Did the write stamp get updated already? */ + if (unlikely(ts < cpu_buffer->write_stamp)) + goto again; + + if (test_time_stamp(delta)) { + + commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); + + if (commit == -EBUSY) + return NULL; + + if (commit == -EAGAIN) + goto again; + + RB_WARN_ON(cpu_buffer, commit < 0); + } + } else + /* Non commits have zero deltas */ + delta = 0; + + event = __rb_reserve_next(cpu_buffer, type, length, &ts); + if (PTR_ERR(event) == -EAGAIN) + goto again; + + if (!event) { + if (unlikely(commit)) + /* + * Ouch! We needed a timestamp and it was commited. But + * we didn't get our event reserved. + */ + rb_set_commit_to_write(cpu_buffer); + return NULL; + } + + /* + * If the timestamp was commited, make the commit our entry + * now so that we will update it when needed. + */ + if (commit) + rb_set_commit_event(cpu_buffer, event); + else if (!rb_is_commit(cpu_buffer, event)) + delta = 0; + + event->time_delta = delta; + + return event; +} + +static DEFINE_PER_CPU(int, rb_need_resched); + +/** + * ring_buffer_lock_reserve - reserve a part of the buffer + * @buffer: the ring buffer to reserve from + * @length: the length of the data to reserve (excluding event header) + * @flags: a pointer to save the interrupt flags + * + * Returns a reseverd event on the ring buffer to copy directly to. + * The user of this interface will need to get the body to write into + * and can use the ring_buffer_event_data() interface. + * + * The length is the length of the data needed, not the event length + * which also includes the event header. + * + * Must be paired with ring_buffer_unlock_commit, unless NULL is returned. + * If NULL is returned, then nothing has been allocated or locked. + */ +struct ring_buffer_event * +ring_buffer_lock_reserve(struct ring_buffer *buffer, + unsigned long length, + unsigned long *flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int cpu, resched; + + if (atomic_read(&buffer->record_disabled)) + return NULL; + + /* If we are tracing schedule, we don't want to recurse */ + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + length = rb_calculate_event_length(length); + if (length > BUF_PAGE_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + if (!event) + goto out; + + /* + * Need to store resched state on this cpu. + * Only the first needs to. + */ + + if (preempt_count() == 1) + per_cpu(rb_need_resched, cpu) = resched; + + return event; + + out: + if (resched) + preempt_enable_notrace(); + else + preempt_enable_notrace(); + return NULL; +} + +static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + cpu_buffer->entries++; + + /* Only process further if we own the commit */ + if (!rb_is_commit(cpu_buffer, event)) + return; + + cpu_buffer->write_stamp += event->time_delta; + + rb_set_commit_to_write(cpu_buffer); +} + +/** + * ring_buffer_unlock_commit - commit a reserved + * @buffer: The buffer to commit to + * @event: The event pointer to commit. + * @flags: the interrupt flags received from ring_buffer_lock_reserve. + * + * This commits the data to the ring buffer, and releases any locks held. + * + * Must be paired with ring_buffer_lock_reserve. + */ +int ring_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu = raw_smp_processor_id(); + + cpu_buffer = buffer->buffers[cpu]; + + rb_commit(cpu_buffer, event); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) { + if (per_cpu(rb_need_resched, cpu)) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); + } else + preempt_enable_no_resched_notrace(); + + return 0; +} + +/** + * ring_buffer_write - write data to the buffer without reserving + * @buffer: The ring buffer to write to. + * @length: The length of the data being written (excluding the event header) + * @data: The data to write to the buffer. + * + * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as + * one function. If you already have the data to write to the buffer, it + * may be easier to simply call this function. + * + * Note, like ring_buffer_lock_reserve, the length is the length of the data + * and not the length of the event which would hold the header. + */ +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, + void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned long event_length; + void *body; + int ret = -EBUSY; + int cpu, resched; + + if (atomic_read(&buffer->record_disabled)) + return -EBUSY; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + + if (!cpu_isset(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + event_length = rb_calculate_event_length(length); + event = rb_reserve_next_event(cpu_buffer, + RINGBUF_TYPE_DATA, event_length); + if (!event) + goto out; + + body = rb_event_data(event); + + memcpy(body, data, length); + + rb_commit(cpu_buffer, event); + + ret = 0; + out: + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); + + return ret; +} + +static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = cpu_buffer->head_page; + struct buffer_page *commit = cpu_buffer->commit_page; + + return reader->read == rb_page_commit(reader) && + (commit == reader || + (commit == head && + head->read == rb_page_commit(commit))); +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable(struct ring_buffer *buffer) +{ + atomic_inc(&buffer->record_disabled); +} + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct ring_buffer *buffer) +{ + atomic_dec(&buffer->record_disabled); +} + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_inc(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truely enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_dec(&cpu_buffer->record_disabled); +} + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. + */ +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + return cpu_buffer->overrun; +} + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long entries = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + entries += cpu_buffer->entries; + } + + return entries; +} + +/** + * ring_buffer_overrun_cpu - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long overruns = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + overruns += cpu_buffer->overrun; + } + + return overruns; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* Iterator usage is expected to have record disabled */ + if (list_empty(&cpu_buffer->reader_page->list)) { + iter->head_page = cpu_buffer->head_page; + iter->head = cpu_buffer->head_page->read; + } else { + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + } + if (iter->head) + iter->read_stamp = cpu_buffer->read_stamp; + else + iter->read_stamp = iter->head_page->time_stamp; +} + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = iter->cpu_buffer; + + return iter->head_page == cpu_buffer->commit_page && + iter->head == rb_commit_index(cpu_buffer); +} + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + cpu_buffer->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + iter->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + iter->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = NULL; + unsigned long flags; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + again: + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader_page->read < rb_page_size(reader)) + goto out; + + /* Never should we have an index greater than the size */ + RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader)); + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->commit_page == cpu_buffer->reader_page) + goto out; + + /* + * Splice the empty reader page into the list around the head. + * Reset the reader page to size zero. + */ + + reader = cpu_buffer->head_page; + cpu_buffer->reader_page->list.next = reader->list.next; + cpu_buffer->reader_page->list.prev = reader->list.prev; + + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->commit, 0); + + /* Make the reader page now replace the head */ + reader->list.prev->next = &cpu_buffer->reader_page->list; + reader->list.next->prev = &cpu_buffer->reader_page->list; + + /* + * If the tail is on the reader, then we must set the head + * to the inserted page, otherwise we set it one before. + */ + cpu_buffer->head_page = cpu_buffer->reader_page; + + if (cpu_buffer->commit_page != reader) + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + + /* Finally update the reader page to the new head */ + cpu_buffer->reader_page = reader; + rb_reset_reader_page(cpu_buffer); + + goto again; + + out: + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + unsigned length; + + reader = rb_get_reader_page(cpu_buffer); + + /* This function should not be called when buffer is empty */ + BUG_ON(!reader); + + event = rb_reader_event(cpu_buffer); + + if (event->type == RINGBUF_TYPE_DATA) + cpu_buffer->entries--; + + rb_update_read_stamp(cpu_buffer, event); + + length = rb_event_length(event); + cpu_buffer->reader_page->read += length; +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned length; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + /* + * Check if we are at the end of the buffer. + */ + if (iter->head >= rb_page_size(iter->head_page)) { + BUG_ON(iter->head_page == cpu_buffer->commit_page); + rb_inc_iter(iter); + return; + } + + event = rb_iter_head_event(iter); + + length = rb_event_length(event); + + /* + * This should not be called to advance the header if we are + * at the tail of the buffer. + */ + BUG_ON((iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer))); + + rb_update_iter_read_stamp(iter, event); + + iter->head += length; + + /* check for end of page padding */ + if ((iter->head >= rb_page_size(iter->head_page)) && + (iter->head_page != cpu_buffer->commit_page)) + rb_advance_iter(iter); +} + +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not consume the data. + */ +struct ring_buffer_event * +ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + struct buffer_page *reader; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + again: + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + return NULL; + + event = rb_reader_event(cpu_buffer); + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + RB_WARN_ON(cpu_buffer, 1); + rb_advance_reader(cpu_buffer); + return NULL; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = cpu_buffer->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} + +/** + * ring_buffer_iter_peek - peek at the next event to be read + * @iter: The ring buffer iterator + * @ts: The timestamp counter of this event. + * + * This will return the event that will be read next, but does + * not increment the iterator. + */ +struct ring_buffer_event * +ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (ring_buffer_iter_empty(iter)) + return NULL; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + again: + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + event = rb_iter_head_event(iter); + + switch (event->type) { + case RINGBUF_TYPE_PADDING: + rb_inc_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = iter->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} + +/** + * ring_buffer_consume - return an event and consume it + * @buffer: The ring buffer to get the next event from + * + * Returns the next event in the ring buffer, and that event is consumed. + * Meaning, that sequential reads will keep returning a different event, + * and eventually empty the ring buffer if the producer is slower. + */ +struct ring_buffer_event * +ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + event = ring_buffer_peek(buffer, cpu, ts); + if (!event) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + rb_advance_reader(cpu_buffer); + + return event; +} + +/** + * ring_buffer_read_start - start a non consuming read of the buffer + * @buffer: The ring buffer to read from + * @cpu: The cpu buffer to iterate over + * + * This starts up an iteration through the buffer. It also disables + * the recording to the buffer until the reading is finished. + * This prevents the reading from being corrupted. This is not + * a consuming read, so a producer is not expected. + * + * Must be paired with ring_buffer_finish. + */ +struct ring_buffer_iter * +ring_buffer_read_start(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_iter *iter; + unsigned long flags; + + if (!cpu_isset(cpu, buffer->cpumask)) + return NULL; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + cpu_buffer = buffer->buffers[cpu]; + + iter->cpu_buffer = cpu_buffer; + + atomic_inc(&cpu_buffer->record_disabled); + synchronize_sched(); + + spin_lock_irqsave(&cpu_buffer->lock, flags); + ring_buffer_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->lock, flags); + + return iter; +} + +/** + * ring_buffer_finish - finish reading the iterator of the buffer + * @iter: The iterator retrieved by ring_buffer_start + * + * This re-enables the recording to the buffer, and frees the + * iterator. + */ +void +ring_buffer_read_finish(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + atomic_dec(&cpu_buffer->record_disabled); + kfree(iter); +} + +/** + * ring_buffer_read - read the next item in the ring buffer by the iterator + * @iter: The ring buffer iterator + * @ts: The time stamp of the event read. + * + * This reads the next event in the ring buffer and increments the iterator. + */ +struct ring_buffer_event * +ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer_event *event; + + event = ring_buffer_iter_peek(iter, ts); + if (!event) + return NULL; + + rb_advance_iter(iter); + + return event; +} + +/** + * ring_buffer_size - return the size of the ring buffer (in bytes) + * @buffer: The ring buffer. + */ +unsigned long ring_buffer_size(struct ring_buffer *buffer) +{ + return BUF_PAGE_SIZE * buffer->pages; +} + +static void +rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->head_page + = list_entry(cpu_buffer->pages.next, struct buffer_page, list); + local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->commit, 0); + + cpu_buffer->head_page->read = 0; + + cpu_buffer->tail_page = cpu_buffer->head_page; + cpu_buffer->commit_page = cpu_buffer->head_page; + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->commit, 0); + cpu_buffer->reader_page->read = 0; + + cpu_buffer->overrun = 0; + cpu_buffer->entries = 0; +} + +/** + * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * @buffer: The ring buffer to reset a per cpu buffer of + * @cpu: The CPU buffer to be reset + */ +void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + unsigned long flags; + + if (!cpu_isset(cpu, buffer->cpumask)) + return; + + spin_lock_irqsave(&cpu_buffer->lock, flags); + + rb_reset_cpu(cpu_buffer); + + spin_unlock_irqrestore(&cpu_buffer->lock, flags); +} + +/** + * ring_buffer_reset - reset a ring buffer + * @buffer: The ring buffer to reset all cpu buffers + */ +void ring_buffer_reset(struct ring_buffer *buffer) +{ + int cpu; + + for_each_buffer_cpu(buffer, cpu) + ring_buffer_reset_cpu(buffer, cpu); +} + +/** + * rind_buffer_empty - is the ring buffer empty? + * @buffer: The ring buffer to test + */ +int ring_buffer_empty(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + int cpu; + + /* yes this is racy, but if you don't like the race, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + if (!rb_per_cpu_empty(cpu_buffer)) + return 0; + } + return 1; +} + +/** + * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty? + * @buffer: The ring buffer + * @cpu: The CPU buffer to test + */ +int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpu_isset(cpu, buffer->cpumask)) + return 1; + + cpu_buffer = buffer->buffers[cpu]; + return rb_per_cpu_empty(cpu_buffer); +} + +/** + * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers + * @buffer_a: One buffer to swap with + * @buffer_b: The other buffer to swap with + * + * This function is useful for tracers that want to take a "snapshot" + * of a CPU buffer and has another back up buffer lying around. + * it is expected that the tracer handles the cpu buffer not being + * used at the moment. + */ +int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, + struct ring_buffer *buffer_b, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer_a; + struct ring_buffer_per_cpu *cpu_buffer_b; + + if (!cpu_isset(cpu, buffer_a->cpumask) || + !cpu_isset(cpu, buffer_b->cpumask)) + return -EINVAL; + + /* At least make sure the two buffers are somewhat the same */ + if (buffer_a->size != buffer_b->size || + buffer_a->pages != buffer_b->pages) + return -EINVAL; + + cpu_buffer_a = buffer_a->buffers[cpu]; + cpu_buffer_b = buffer_b->buffers[cpu]; + + /* + * We can't do a synchronize_sched here because this + * function can be called in atomic context. + * Normally this will be called from the same CPU as cpu. + * If not it's up to the caller to protect this. + */ + atomic_inc(&cpu_buffer_a->record_disabled); + atomic_inc(&cpu_buffer_b->record_disabled); + + buffer_a->buffers[cpu] = cpu_buffer_b; + buffer_b->buffers[cpu] = cpu_buffer_a; + + cpu_buffer_b->buffer = buffer_a; + cpu_buffer_a->buffer = buffer_b; + + atomic_dec(&cpu_buffer_a->record_disabled); + atomic_dec(&cpu_buffer_b->record_disabled); + + return 0; +} + diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f3fb3d..d345d64 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -14,6 +14,7 @@ #include <linux/utsrelease.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> +#include <linux/notifier.h> #include <linux/debugfs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> @@ -22,6 +23,7 @@ #include <linux/ftrace.h> #include <linux/module.h> #include <linux/percpu.h> +#include <linux/kdebug.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> @@ -31,25 +33,36 @@ #include <linux/writeback.h> #include <linux/stacktrace.h> +#include <linux/ring_buffer.h> #include "trace.h" +#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) + unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; unsigned long __read_mostly tracing_thresh; -static unsigned long __read_mostly tracing_nr_buffers; +static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); + +static inline void ftrace_disable_cpu(void) +{ + preempt_disable(); + local_inc(&__get_cpu_var(ftrace_cpu_disabled)); +} + +static inline void ftrace_enable_cpu(void) +{ + local_dec(&__get_cpu_var(ftrace_cpu_disabled)); + preempt_enable(); +} + static cpumask_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu_mask(cpu, tracing_buffer_mask) -static int trace_alloc_page(void); -static int trace_free_page(void); - static int tracing_disabled = 1; -static unsigned long tracing_pages_allocated; - long ns2usecs(cycle_t nsec) { @@ -60,7 +73,9 @@ ns2usecs(cycle_t nsec) cycle_t ftrace_now(int cpu) { - return cpu_clock(cpu); + u64 ts = ring_buffer_time_stamp(cpu); + ring_buffer_normalize_time_stamp(cpu, &ts); + return ts; } /* @@ -100,11 +115,18 @@ static int tracer_enabled = 1; int ftrace_function_enabled; /* - * trace_nr_entries is the number of entries that is allocated - * for a buffer. Note, the number of entries is always rounded - * to ENTRIES_PER_PAGE. + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable. */ -static unsigned long trace_nr_entries = 65536UL; +#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ + +static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; @@ -133,24 +155,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds iter_ctrl options */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; -static notrace void no_trace_init(struct trace_array *tr) -{ - int cpu; - - ftrace_function_enabled = 0; - if(tr->ctrl) - for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); - tracer_enabled = 0; -} - -/* dummy trace to disable tracing */ -static struct tracer no_tracer __read_mostly = { - .name = "none", - .init = no_trace_init -}; - - /** * trace_wake_up - wake up tasks waiting for trace input * @@ -167,23 +171,21 @@ void trace_wake_up(void) wake_up(&trace_wait); } -#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) - -static int __init set_nr_entries(char *str) +static int __init set_buf_size(char *str) { - unsigned long nr_entries; + unsigned long buf_size; int ret; if (!str) return 0; - ret = strict_strtoul(str, 0, &nr_entries); + ret = strict_strtoul(str, 0, &buf_size); /* nr_entries can not be zero */ - if (ret < 0 || nr_entries == 0) + if (ret < 0 || buf_size == 0) return 0; - trace_nr_entries = nr_entries; + trace_buf_size = buf_size; return 1; } -__setup("trace_entries=", set_nr_entries); +__setup("trace_buf_size=", set_buf_size); unsigned long nsecs_to_usecs(unsigned long nsecs) { @@ -191,21 +193,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs) } /* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * NEED_RESCED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - */ -enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_NEED_RESCHED = 0x02, - TRACE_FLAG_HARDIRQ = 0x04, - TRACE_FLAG_SOFTIRQ = 0x08, -}; - -/* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ @@ -224,6 +211,7 @@ static const char *trace_options[] = { "block", "stacktrace", "sched-tree", + "ftrace_printk", NULL }; @@ -266,54 +254,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) tracing_record_cmdline(current); } -#define CHECK_COND(cond) \ - if (unlikely(cond)) { \ - tracing_disabled = 1; \ - WARN_ON(1); \ - return -1; \ - } - -/** - * check_pages - integrity check of trace buffers - * - * As a safty measure we check to make sure the data pages have not - * been corrupted. - */ -int check_pages(struct trace_array_cpu *data) -{ - struct page *page, *tmp; - - CHECK_COND(data->trace_pages.next->prev != &data->trace_pages); - CHECK_COND(data->trace_pages.prev->next != &data->trace_pages); - - list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) { - CHECK_COND(page->lru.next->prev != &page->lru); - CHECK_COND(page->lru.prev->next != &page->lru); - } - - return 0; -} - -/** - * head_page - page address of the first page in per_cpu buffer. - * - * head_page returns the page address of the first page in - * a per_cpu buffer. This also preforms various consistency - * checks to make sure the buffer has not been corrupted. - */ -void *head_page(struct trace_array_cpu *data) -{ - struct page *page; - - if (list_empty(&data->trace_pages)) - return NULL; - - page = list_entry(data->trace_pages.next, struct page, lru); - BUG_ON(&page->lru == &data->trace_pages); - - return page_address(page); -} - /** * trace_seq_printf - sequence printing of trace information * @s: trace sequence descriptor @@ -395,28 +335,23 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len) return len; } -#define HEX_CHARS 17 -static const char hex2asc[] = "0123456789abcdef"; +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) static int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) { unsigned char hex[HEX_CHARS]; unsigned char *data = mem; - unsigned char byte; int i, j; - BUG_ON(len >= HEX_CHARS); - #ifdef __BIG_ENDIAN for (i = 0, j = 0; i < len; i++) { #else for (i = len-1, j = 0; i >= 0; i--) { #endif - byte = data[i]; - - hex[j++] = hex2asc[byte & 0x0f]; - hex[j++] = hex2asc[byte >> 4]; + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); } hex[j++] = ' '; @@ -460,34 +395,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s) trace_seq_reset(s); } -/* - * flip the trace buffers between two trace descriptors. - * This usually is the buffers between the global_trace and - * the max_tr to record a snapshot of a current trace. - * - * The ftrace_max_lock must be held. - */ -static void -flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) -{ - struct list_head flip_pages; - - INIT_LIST_HEAD(&flip_pages); - - memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx, - sizeof(struct trace_array_cpu) - - offsetof(struct trace_array_cpu, trace_head_idx)); - - check_pages(tr1); - check_pages(tr2); - list_splice_init(&tr1->trace_pages, &flip_pages); - list_splice_init(&tr2->trace_pages, &tr1->trace_pages); - list_splice_init(&flip_pages, &tr2->trace_pages); - BUG_ON(!list_empty(&flip_pages)); - check_pages(tr1); - check_pages(tr2); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer @@ -500,17 +407,17 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data; - int i; + struct ring_buffer *buf = tr->buffer; WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - /* clear out all the previous traces */ - for_each_tracing_cpu(i) { - data = tr->data[i]; - flip_trace(max_tr.data[i], data); - tracing_reset(data); - } + + tr->buffer = max_tr.buffer; + max_tr.buffer = buf; + + ftrace_disable_cpu(); + ring_buffer_reset(tr->buffer); + ftrace_enable_cpu(); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -527,16 +434,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; - int i; + int ret; WARN_ON_ONCE(!irqs_disabled()); __raw_spin_lock(&ftrace_max_lock); - for_each_tracing_cpu(i) - tracing_reset(max_tr.data[i]); - flip_trace(max_tr.data[cpu], data); - tracing_reset(data); + ftrace_disable_cpu(); + + ring_buffer_reset(max_tr.buffer); + ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + + ftrace_enable_cpu(); + + WARN_ON_ONCE(ret); __update_max_tr(tr, tsk, cpu); __raw_spin_unlock(&ftrace_max_lock); @@ -573,7 +483,6 @@ int register_tracer(struct tracer *type) #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest) { struct tracer *saved_tracer = current_trace; - struct trace_array_cpu *data; struct trace_array *tr = &global_trace; int saved_ctrl = tr->ctrl; int i; @@ -585,10 +494,7 @@ int register_tracer(struct tracer *type) * If we fail, we do not register this tracer. */ for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); + tracing_reset(tr, i); } current_trace = type; tr->ctrl = 0; @@ -604,10 +510,7 @@ int register_tracer(struct tracer *type) } /* Only reset on passing, to avoid touching corrupted buffers */ for_each_tracing_cpu(i) { - data = tr->data[i]; - if (!head_page(data)) - continue; - tracing_reset(data); + tracing_reset(tr, i); } printk(KERN_CONT "PASSED\n"); } @@ -653,13 +556,11 @@ void unregister_tracer(struct tracer *type) mutex_unlock(&trace_types_lock); } -void tracing_reset(struct trace_array_cpu *data) +void tracing_reset(struct trace_array *tr, int cpu) { - data->trace_idx = 0; - data->overrun = 0; - data->trace_head = data->trace_tail = head_page(data); - data->trace_head_idx = 0; - data->trace_tail_idx = 0; + ftrace_disable_cpu(); + ring_buffer_reset_cpu(tr->buffer, cpu); + ftrace_enable_cpu(); } #define SAVED_CMDLINES 128 @@ -745,82 +646,16 @@ void tracing_record_cmdline(struct task_struct *tsk) trace_save_cmdline(tsk); } -static inline struct list_head * -trace_next_list(struct trace_array_cpu *data, struct list_head *next) -{ - /* - * Roundrobin - but skip the head (which is not a real page): - */ - next = next->next; - if (unlikely(next == &data->trace_pages)) - next = next->next; - BUG_ON(next == &data->trace_pages); - - return next; -} - -static inline void * -trace_next_page(struct trace_array_cpu *data, void *addr) -{ - struct list_head *next; - struct page *page; - - page = virt_to_page(addr); - - next = trace_next_list(data, &page->lru); - page = list_entry(next, struct page, lru); - - return page_address(page); -} - -static inline struct trace_entry * -tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data) -{ - unsigned long idx, idx_next; - struct trace_entry *entry; - - data->trace_idx++; - idx = data->trace_head_idx; - idx_next = idx + 1; - - BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE); - - entry = data->trace_head + idx * TRACE_ENTRY_SIZE; - - if (unlikely(idx_next >= ENTRIES_PER_PAGE)) { - data->trace_head = trace_next_page(data, data->trace_head); - idx_next = 0; - } - - if (data->trace_head == data->trace_tail && - idx_next == data->trace_tail_idx) { - /* overrun */ - data->overrun++; - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = - trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } - } - - data->trace_head_idx = idx_next; - - return entry; -} - -static inline void -tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) +void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, + int pc) { struct task_struct *tsk = current; - unsigned long pc; - - pc = preempt_count(); - entry->preempt_count = pc & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; - entry->t = ftrace_now(raw_smp_processor_id()); - entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->flags = + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); @@ -828,145 +663,139 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags) void trace_function(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct ftrace_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_FN; - entry->fn.ip = ip; - entry->fn.parent_ip = parent_ip; - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + /* If we are reading the ring buffer, don't trace */ + if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) + return; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_FN; + entry->ip = ip; + entry->parent_ip = parent_ip; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } void ftrace(struct trace_array *tr, struct trace_array_cpu *data, - unsigned long ip, unsigned long parent_ip, unsigned long flags) + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) { if (likely(!atomic_read(&data->disabled))) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); } -#ifdef CONFIG_MMIOTRACE -void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_rw *rw) +static void ftrace_trace_stack(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long flags, + int skip, int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct stack_entry *entry; + struct stack_trace trace; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_RW; - entry->mmiorw = *rw; - - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); - - trace_wake_up(); -} - -void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, - struct mmiotrace_map *map) -{ - struct trace_entry *entry; - unsigned long irq_flags; + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_STACK; - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_MMIO_MAP; - entry->mmiomap = *map; + memset(&entry->caller, 0, sizeof(entry->caller)); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->caller; - trace_wake_up(); + save_stack_trace(&trace); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -#endif void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, int skip) { - struct trace_entry *entry; - struct stack_trace trace; - - if (!(trace_flags & TRACE_ITER_STACKTRACE)) - return; - - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_STACK; - - memset(&entry->stack, 0, sizeof(entry->stack)); - - trace.nr_entries = 0; - trace.max_entries = FTRACE_STACK_ENTRIES; - trace.skip = skip; - trace.entries = entry->stack.caller; - - save_stack_trace(&trace); + ftrace_trace_stack(tr, data, flags, skip, preempt_count()); } -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) +static void +ftrace_trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3, + int pc) { + struct ring_buffer_event *event; struct trace_array_cpu *data = __data; struct trace_array *tr = __tr; - struct trace_entry *entry; + struct special_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, 0); - entry->type = TRACE_SPECIAL; - entry->special.arg1 = arg1; - entry->special.arg2 = arg2; - entry->special.arg3 = arg3; - __trace_stack(tr, data, irq_flags, 4); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, pc); + entry->ent.type = TRACE_SPECIAL; + entry->arg1 = arg1; + entry->arg2 = arg2; + entry->arg3 = arg3; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ftrace_trace_stack(tr, data, irq_flags, 4, pc); trace_wake_up(); } void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3) +{ + ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); +} + +void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags) + unsigned long flags, int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_CTX; - entry->ctx.prev_pid = prev->pid; - entry->ctx.prev_prio = prev->prio; - entry->ctx.prev_state = prev->state; - entry->ctx.next_pid = next->pid; - entry->ctx.next_prio = next->prio; - entry->ctx.next_state = next->state; - __trace_stack(tr, data, flags, 5); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_CTX; + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ftrace_trace_stack(tr, data, flags, 5, pc); } void @@ -974,25 +803,28 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *curr, - unsigned long flags) + unsigned long flags, int pc) { - struct trace_entry *entry; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; unsigned long irq_flags; - raw_local_irq_save(irq_flags); - __raw_spin_lock(&data->lock); - entry = tracing_get_trace_entry(tr, data); - tracing_generic_entry_update(entry, flags); - entry->type = TRACE_WAKE; - entry->ctx.prev_pid = curr->pid; - entry->ctx.prev_prio = curr->prio; - entry->ctx.prev_state = curr->state; - entry->ctx.next_pid = wakee->pid; - entry->ctx.next_prio = wakee->prio; - entry->ctx.next_state = wakee->state; - __trace_stack(tr, data, flags, 6); - __raw_spin_unlock(&data->lock); - raw_local_irq_restore(irq_flags); + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_WAKE; + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + ftrace_trace_stack(tr, data, flags, 6, pc); trace_wake_up(); } @@ -1002,23 +834,21 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - unsigned long flags; - long disabled; int cpu; + int pc; - if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) + if (tracing_disabled || !tr->ctrl) return; - local_irq_save(flags); + pc = preempt_count(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (likely(disabled == 1)) - __trace_special(tr, data, arg1, arg2, arg3); + if (likely(!atomic_read(&data->disabled))) + ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); - atomic_dec(&data->disabled); - local_irq_restore(flags); + preempt_enable_notrace(); } #ifdef CONFIG_FTRACE @@ -1029,7 +859,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu; + int cpu, resched; + int pc; if (unlikely(!ftrace_function_enabled)) return; @@ -1037,16 +868,22 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) if (skip_trace(ip)) return; - local_irq_save(flags); + pc = preempt_count(); + resched = need_resched(); + preempt_disable_notrace(); + local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - local_irq_restore(flags); + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -1073,111 +910,96 @@ enum trace_file_type { TRACE_FILE_LAT_FMT = 1, }; -static struct trace_entry * -trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data, - struct trace_iterator *iter, int cpu) +static void trace_iterator_increment(struct trace_iterator *iter, int cpu) { - struct page *page; - struct trace_entry *array; + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); - if (iter->next_idx[cpu] >= tr->entries || - iter->next_idx[cpu] >= data->trace_idx || - (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx)) - return NULL; + iter->idx++; + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); - if (!iter->next_page[cpu]) { - /* Initialize the iterator for this cpu trace buffer */ - WARN_ON(!data->trace_tail); - page = virt_to_page(data->trace_tail); - iter->next_page[cpu] = &page->lru; - iter->next_page_idx[cpu] = data->trace_tail_idx; - } + ftrace_enable_cpu(); +} + +static struct trace_entry * +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) +{ + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; - page = list_entry(iter->next_page[cpu], struct page, lru); - BUG_ON(&data->trace_pages == &page->lru); + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + + if (buf_iter) + event = ring_buffer_iter_peek(buf_iter, ts); + else + event = ring_buffer_peek(iter->tr->buffer, cpu, ts); - array = page_address(page); + ftrace_enable_cpu(); - WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); - return &array[iter->next_page_idx[cpu]]; + return event ? ring_buffer_event_data(event) : NULL; } static struct trace_entry * -find_next_entry(struct trace_iterator *iter, int *ent_cpu) +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - struct trace_array *tr = iter->tr; + struct ring_buffer *buffer = iter->tr->buffer; struct trace_entry *ent, *next = NULL; + u64 next_ts = 0, ts; int next_cpu = -1; int cpu; for_each_tracing_cpu(cpu) { - if (!head_page(tr->data[cpu])) + + if (ring_buffer_empty_cpu(buffer, cpu)) continue; - ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); + + ent = peek_next_entry(iter, cpu, &ts); + /* * Pick the entry with the smallest timestamp: */ - if (ent && (!next || ent->t < next->t)) { + if (ent && (!next || ts < next_ts)) { next = ent; next_cpu = cpu; + next_ts = ts; } } if (ent_cpu) *ent_cpu = next_cpu; + if (ent_ts) + *ent_ts = next_ts; + return next; } -static void trace_iterator_increment(struct trace_iterator *iter) +/* Find the next real entry, without updating the iterator itself */ +static struct trace_entry * +find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) { - iter->idx++; - iter->next_idx[iter->cpu]++; - iter->next_page_idx[iter->cpu]++; - - if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; - - iter->next_page_idx[iter->cpu] = 0; - iter->next_page[iter->cpu] = - trace_next_list(data, iter->next_page[iter->cpu]); - } + return __find_next_entry(iter, ent_cpu, ent_ts); } -static void trace_consume(struct trace_iterator *iter) +/* Find the next real entry, and increment the iterator to the next entry */ +static void *find_next_entry_inc(struct trace_iterator *iter) { - struct trace_array_cpu *data = iter->tr->data[iter->cpu]; + iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); - data->trace_tail_idx++; - if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { - data->trace_tail = trace_next_page(data, data->trace_tail); - data->trace_tail_idx = 0; - } + if (iter->ent) + trace_iterator_increment(iter, iter->cpu); - /* Check if we empty it, then reset the index */ - if (data->trace_head == data->trace_tail && - data->trace_head_idx == data->trace_tail_idx) - data->trace_idx = 0; + return iter->ent ? iter : NULL; } -static void *find_next_entry_inc(struct trace_iterator *iter) +static void trace_consume(struct trace_iterator *iter) { - struct trace_entry *next; - int next_cpu = -1; - - next = find_next_entry(iter, &next_cpu); - - iter->prev_ent = iter->ent; - iter->prev_cpu = iter->cpu; - - iter->ent = next; - iter->cpu = next_cpu; - - if (next) - trace_iterator_increment(iter); - - return next ? iter : NULL; + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); + ftrace_enable_cpu(); } static void *s_next(struct seq_file *m, void *v, loff_t *pos) @@ -1210,7 +1032,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct trace_iterator *iter = m->private; void *p = NULL; loff_t l = 0; - int i; + int cpu; mutex_lock(&trace_types_lock); @@ -1229,14 +1051,15 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->ent = NULL; iter->cpu = 0; iter->idx = -1; - iter->prev_ent = NULL; - iter->prev_cpu = -1; - for_each_tracing_cpu(i) { - iter->next_idx[i] = 0; - iter->next_page[i] = NULL; + ftrace_disable_cpu(); + + for_each_tracing_cpu(cpu) { + ring_buffer_iter_reset(iter->buffer_iter[cpu]); } + ftrace_enable_cpu(); + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; @@ -1330,21 +1153,21 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n"); - seq_puts(m, "# / _-----=> irqs-off \n"); - seq_puts(m, "# | / _----=> need-resched \n"); - seq_puts(m, "# || / _---=> hardirq/softirq \n"); - seq_puts(m, "# ||| / _--=> preempt-depth \n"); - seq_puts(m, "# |||| / \n"); - seq_puts(m, "# ||||| delay \n"); - seq_puts(m, "# cmd pid ||||| time | caller \n"); - seq_puts(m, "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / \n"); + seq_puts(m, "# ||||| delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); } static void print_func_help_header(struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); - seq_puts(m, "# | | | | |\n"); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); } @@ -1355,23 +1178,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long total = 0; - unsigned long entries = 0; - int cpu; + unsigned long total; + unsigned long entries; const char *name = "preemption"; if (type) name = type->name; - for_each_tracing_cpu(cpu) { - if (head_page(tr->data[cpu])) { - total += tr->data[cpu]->trace_idx; - if (tr->data[cpu]->trace_idx > tr->entries) - entries += tr->entries; - else - entries += tr->data[cpu]->trace_idx; - } - } + entries = ring_buffer_entries(iter->tr->buffer); + total = entries + + ring_buffer_overruns(iter->tr->buffer); seq_printf(m, "%s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -1428,7 +1244,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) comm = trace_find_cmdline(entry->pid); trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); - trace_seq_printf(s, "%d", cpu); + trace_seq_printf(s, "%3d", cpu); trace_seq_printf(s, "%c%c", (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); @@ -1457,7 +1273,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) unsigned long preempt_mark_thresh = 100; static void -lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, unsigned long rel_usecs) { trace_seq_printf(s, " %4lldus", abs_usecs); @@ -1471,34 +1287,76 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; -static int +/* + * The message is supposed to contain an ending newline. + * If the printing stops prematurely, try to add a newline of our own. + */ +void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter) +{ + struct trace_entry *ent; + struct trace_field_cont *cont; + bool ok = true; + + ent = peek_next_entry(iter, iter->cpu, NULL); + if (!ent || ent->type != TRACE_CONT) { + trace_seq_putc(s, '\n'); + return; + } + + do { + cont = (struct trace_field_cont *)ent; + if (ok) + ok = (trace_seq_printf(s, "%s", cont->buf) > 0); + + ftrace_disable_cpu(); + + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + else + ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); + + ftrace_enable_cpu(); + + ent = peek_next_entry(iter, iter->cpu, NULL); + } while (ent && ent->type == TRACE_CONT); + + if (!ok) + trace_seq_putc(s, '\n'); +} + +static enum print_line_t print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_entry *next_entry = find_next_entry(iter, NULL); + struct trace_entry *next_entry; unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); struct trace_entry *entry = iter->ent; unsigned long abs_usecs; unsigned long rel_usecs; + u64 next_ts; char *comm; int S, T; int i; unsigned state; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + + next_entry = find_next_entry(iter, NULL, &next_ts); if (!next_entry) - next_entry = entry; - rel_usecs = ns2usecs(next_entry->t - entry->t); - abs_usecs = ns2usecs(entry->t - iter->tr->time_start); + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); if (verbose) { comm = trace_find_cmdline(entry->pid); - trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" + trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, cpu, entry->flags, entry->preempt_count, trace_idx, - ns2usecs(entry->t), + ns2usecs(iter->ts), abs_usecs/1000, abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000); @@ -1507,52 +1365,85 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) lat_print_timestamp(s, abs_usecs, rel_usecs); } switch (entry->type) { - case TRACE_FN: - seq_print_ip_sym(s, entry->fn.ip, sym_flags); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, sym_flags); trace_seq_puts(s, " ("); - if (kretprobed(entry->fn.parent_ip)) + if (kretprobed(field->parent_ip)) trace_seq_puts(s, KRETPROBE_MSG); else - seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); + seq_print_ip_sym(s, field->parent_ip, sym_flags); trace_seq_puts(s, ")\n"); break; + } case TRACE_CTX: - case TRACE_WAKE: - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; - state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; + state = field->prev_state ? + __ffs(field->prev_state) + 1 : 0; S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; - comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + comm = trace_find_cmdline(field->next_pid); + trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T, comm); break; - case TRACE_SPECIAL: + } + case TRACE_SPECIAL: { + struct special_entry *field; + + trace_assign_type(field, entry); + trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->arg1, + field->arg2, + field->arg3); break; - case TRACE_STACK: + } + case TRACE_STACK: { + struct stack_entry *field; + + trace_assign_type(field, entry); + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) trace_seq_puts(s, " <= "); - seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); + seq_print_ip_sym(s, field->caller[i], sym_flags); } trace_seq_puts(s, "\n"); break; + } + case TRACE_PRINT: { + struct print_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, sym_flags); + trace_seq_printf(s, ": %s", field->buf); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } default: trace_seq_printf(s, "Unknown type %d\n", entry->type); } - return 1; + return TRACE_TYPE_HANDLED; } -static int print_trace_fmt(struct trace_iterator *iter) +static enum print_line_t print_trace_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); @@ -1567,90 +1458,126 @@ static int print_trace_fmt(struct trace_iterator *iter) entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + comm = trace_find_cmdline(iter->ent->pid); - t = ns2usecs(entry->t); + t = ns2usecs(iter->ts); usec_rem = do_div(t, 1000000ULL); secs = (unsigned long)t; ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); if (!ret) - return 0; - ret = trace_seq_printf(s, "[%02d] ", iter->cpu); + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, "[%03d] ", iter->cpu); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; switch (entry->type) { - case TRACE_FN: - ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + ret = seq_print_ip_sym(s, field->ip, sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; if ((sym_flags & TRACE_ITER_PRINT_PARENT) && - entry->fn.parent_ip) { + field->parent_ip) { ret = trace_seq_printf(s, " <-"); if (!ret) - return 0; - if (kretprobed(entry->fn.parent_ip)) + return TRACE_TYPE_PARTIAL_LINE; + if (kretprobed(field->parent_ip)) ret = trace_seq_puts(s, KRETPROBE_MSG); else - ret = seq_print_ip_sym(s, entry->fn.parent_ip, + ret = seq_print_ip_sym(s, + field->parent_ip, sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = trace_seq_printf(s, "\n"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; + } case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; + ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n", + field->prev_pid, + field->prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", - entry->ctx.next_pid, - entry->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; - case TRACE_SPECIAL: + } + case TRACE_SPECIAL: { + struct special_entry *field; + + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->arg1, + field->arg2, + field->arg3); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; - case TRACE_STACK: + } + case TRACE_STACK: { + struct stack_entry *field; + + trace_assign_type(field, entry); + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (i) { ret = trace_seq_puts(s, " <= "); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } - ret = seq_print_ip_sym(s, entry->stack.caller[i], + ret = seq_print_ip_sym(s, field->caller[i], sym_flags); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; } ret = trace_seq_puts(s, "\n"); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } - return 1; + case TRACE_PRINT: { + struct print_entry *field; + + trace_assign_type(field, entry); + + seq_print_ip_sym(s, field->ip, sym_flags); + trace_seq_printf(s, ": %s", field->buf); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } + } + return TRACE_TYPE_HANDLED; } -static int print_raw_fmt(struct trace_iterator *iter) +static enum print_line_t print_raw_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; @@ -1659,47 +1586,77 @@ static int print_raw_fmt(struct trace_iterator *iter) entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + ret = trace_seq_printf(s, "%d %d %llu ", - entry->pid, iter->cpu, entry->t); + entry->pid, iter->cpu, iter->ts); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; switch (entry->type) { - case TRACE_FN: + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%x %x\n", - entry->fn.ip, entry->fn.parent_ip); + field->ip, + field->parent_ip); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; + } case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", - entry->ctx.prev_pid, - entry->ctx.prev_prio, + ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, S, - entry->ctx.next_pid, - entry->ctx.next_prio, + field->next_cpu, + field->next_pid, + field->next_prio, T); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; + } case TRACE_SPECIAL: - case TRACE_STACK: + case TRACE_STACK: { + struct special_entry *field; + + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "# %ld %ld %ld\n", - entry->special.arg1, - entry->special.arg2, - entry->special.arg3); + field->arg1, + field->arg2, + field->arg3); if (!ret) - return 0; + return TRACE_TYPE_PARTIAL_LINE; break; } - return 1; + case TRACE_PRINT: { + struct print_entry *field; + + trace_assign_type(field, entry); + + trace_seq_printf(s, "# %lx %s", field->ip, field->buf); + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + break; + } + } + return TRACE_TYPE_HANDLED; } #define SEQ_PUT_FIELD_RET(s, x) \ @@ -1710,11 +1667,12 @@ do { \ #define SEQ_PUT_HEX_FIELD_RET(s, x) \ do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ return 0; \ } while (0) -static int print_hex_fmt(struct trace_iterator *iter) +static enum print_line_t print_hex_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; unsigned char newline = '\n'; @@ -1723,97 +1681,139 @@ static int print_hex_fmt(struct trace_iterator *iter) entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); - SEQ_PUT_HEX_FIELD_RET(s, entry->t); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); switch (entry->type) { - case TRACE_FN: - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); break; + } case TRACE_CTX: - case TRACE_WAKE: - S = entry->ctx.prev_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.prev_state] : 'X'; - T = entry->ctx.next_state < sizeof(state_to_char) ? - state_to_char[entry->ctx.next_state] : 'X'; + case TRACE_WAKE: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + S = field->prev_state < sizeof(state_to_char) ? + state_to_char[field->prev_state] : 'X'; + T = field->next_state < sizeof(state_to_char) ? + state_to_char[field->next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); SEQ_PUT_HEX_FIELD_RET(s, S); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); SEQ_PUT_HEX_FIELD_RET(s, T); break; + } case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); + case TRACE_STACK: { + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_HEX_FIELD_RET(s, field->arg1); + SEQ_PUT_HEX_FIELD_RET(s, field->arg2); + SEQ_PUT_HEX_FIELD_RET(s, field->arg3); break; } + } SEQ_PUT_FIELD_RET(s, newline); - return 1; + return TRACE_TYPE_HANDLED; } -static int print_bin_fmt(struct trace_iterator *iter) +static enum print_line_t print_bin_fmt(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; struct trace_entry *entry; entry = iter->ent; + if (entry->type == TRACE_CONT) + return TRACE_TYPE_HANDLED; + SEQ_PUT_FIELD_RET(s, entry->pid); - SEQ_PUT_FIELD_RET(s, entry->cpu); - SEQ_PUT_FIELD_RET(s, entry->t); + SEQ_PUT_FIELD_RET(s, iter->cpu); + SEQ_PUT_FIELD_RET(s, iter->ts); switch (entry->type) { - case TRACE_FN: - SEQ_PUT_FIELD_RET(s, entry->fn.ip); - SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); + case TRACE_FN: { + struct ftrace_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); break; - case TRACE_CTX: - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); - SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); + } + case TRACE_CTX: { + struct ctx_switch_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + SEQ_PUT_FIELD_RET(s, field->next_state); break; + } case TRACE_SPECIAL: - case TRACE_STACK: - SEQ_PUT_FIELD_RET(s, entry->special.arg1); - SEQ_PUT_FIELD_RET(s, entry->special.arg2); - SEQ_PUT_FIELD_RET(s, entry->special.arg3); + case TRACE_STACK: { + struct special_entry *field; + + trace_assign_type(field, entry); + + SEQ_PUT_FIELD_RET(s, field->arg1); + SEQ_PUT_FIELD_RET(s, field->arg2); + SEQ_PUT_FIELD_RET(s, field->arg3); break; } + } return 1; } static int trace_empty(struct trace_iterator *iter) { - struct trace_array_cpu *data; int cpu; for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (head_page(data) && data->trace_idx && - (data->trace_tail != data->trace_head || - data->trace_tail_idx != data->trace_head_idx)) - return 0; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } } + return 1; } -static int print_trace_line(struct trace_iterator *iter) +static enum print_line_t print_trace_line(struct trace_iterator *iter) { - if (iter->trace && iter->trace->print_line) - return iter->trace->print_line(iter); + enum print_line_t ret; + + if (iter->trace && iter->trace->print_line) { + ret = iter->trace->print_line(iter); + if (ret != TRACE_TYPE_UNHANDLED) + return ret; + } if (trace_flags & TRACE_ITER_BIN) return print_bin_fmt(iter); @@ -1869,6 +1869,8 @@ static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, int *ret) { struct trace_iterator *iter; + struct seq_file *m; + int cpu; if (tracing_disabled) { *ret = -ENODEV; @@ -1889,28 +1891,45 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) iter->trace = current_trace; iter->pos = -1; + for_each_tracing_cpu(cpu) { + + iter->buffer_iter[cpu] = + ring_buffer_read_start(iter->tr->buffer, cpu); + + if (!iter->buffer_iter[cpu]) + goto fail_buffer; + } + /* TODO stop tracer */ *ret = seq_open(file, &tracer_seq_ops); - if (!*ret) { - struct seq_file *m = file->private_data; - m->private = iter; + if (*ret) + goto fail_buffer; - /* stop the trace while dumping */ - if (iter->tr->ctrl) { - tracer_enabled = 0; - ftrace_function_enabled = 0; - } + m = file->private_data; + m->private = iter; - if (iter->trace && iter->trace->open) - iter->trace->open(iter); - } else { - kfree(iter); - iter = NULL; + /* stop the trace while dumping */ + if (iter->tr->ctrl) { + tracer_enabled = 0; + ftrace_function_enabled = 0; } + + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + mutex_unlock(&trace_types_lock); out: return iter; + + fail_buffer: + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + mutex_unlock(&trace_types_lock); + + return ERR_PTR(-ENOMEM); } int tracing_open_generic(struct inode *inode, struct file *filp) @@ -1926,8 +1945,14 @@ int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; struct trace_iterator *iter = m->private; + int cpu; mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + if (iter->trace && iter->trace->close) iter->trace->close(iter); @@ -2352,9 +2377,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, struct tracer *t; char buf[max_tracer_type_len+1]; int i; + size_t ret; if (cnt > max_tracer_type_len) cnt = max_tracer_type_len; + ret = cnt; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; @@ -2370,7 +2397,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, if (strcmp(t->name, buf) == 0) break; } - if (!t || t == current_trace) + if (!t) { + ret = -EINVAL; + goto out; + } + if (t == current_trace) goto out; if (current_trace && current_trace->reset) @@ -2383,9 +2414,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); - filp->f_pos += cnt; + if (ret == cnt) + filp->f_pos += cnt; - return cnt; + return ret; } static ssize_t @@ -2500,20 +2532,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; - struct trace_array_cpu *data; - static cpumask_t mask; - unsigned long flags; -#ifdef CONFIG_FTRACE - int ftrace_save; -#endif - int cpu; ssize_t sret; /* return any leftover data */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (sret != -EBUSY) return sret; - sret = 0; trace_seq_reset(&iter->seq); @@ -2524,6 +2548,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, goto out; } +waitagain: + sret = 0; while (trace_empty(iter)) { if ((filp->f_flags & O_NONBLOCK)) { @@ -2588,46 +2614,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, offsetof(struct trace_iterator, seq)); iter->pos = -1; - /* - * We need to stop all tracing on all CPUS to read the - * the next buffer. This is a bit expensive, but is - * not done often. We fill all what we can read, - * and then release the locks again. - */ - - cpus_clear(mask); - local_irq_save(flags); -#ifdef CONFIG_FTRACE - ftrace_save = ftrace_enabled; - ftrace_enabled = 0; -#endif - smp_wmb(); - for_each_tracing_cpu(cpu) { - data = iter->tr->data[cpu]; - - if (!head_page(data) || !data->trace_idx) - continue; - - atomic_inc(&data->disabled); - cpu_set(cpu, mask); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_lock(&data->lock); - - if (data->overrun > iter->last_overrun[cpu]) - iter->overrun[cpu] += - data->overrun - iter->last_overrun[cpu]; - iter->last_overrun[cpu] = data->overrun; - } - while (find_next_entry_inc(iter) != NULL) { - int ret; + enum print_line_t ret; int len = iter->seq.len; ret = print_trace_line(iter); - if (!ret) { + if (ret == TRACE_TYPE_PARTIAL_LINE) { /* don't print partial lines */ iter->seq.len = len; break; @@ -2639,26 +2631,17 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, break; } - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - __raw_spin_unlock(&data->lock); - } - - for_each_cpu_mask(cpu, mask) { - data = iter->tr->data[cpu]; - atomic_dec(&data->disabled); - } -#ifdef CONFIG_FTRACE - ftrace_enabled = ftrace_save; -#endif - local_irq_restore(flags); - /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); if (iter->seq.readpos >= iter->seq.len) trace_seq_reset(&iter->seq); + + /* + * If there was nothing to send to user, inspite of consuming trace + * entries, go back to wait for more entries. + */ if (sret == -EBUSY) - sret = 0; + goto waitagain; out: mutex_unlock(&trace_types_lock); @@ -2684,7 +2667,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, { unsigned long val; char buf[64]; - int i, ret; + int ret; + struct trace_array *tr = filp->private_data; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2704,59 +2688,38 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); - if (current_trace != &no_tracer) { + if (tr->ctrl) { cnt = -EBUSY; - pr_info("ftrace: set current_tracer to none" + pr_info("ftrace: please disable tracing" " before modifying buffer size\n"); goto out; } - if (val > global_trace.entries) { - long pages_requested; - unsigned long freeable_pages; - - /* make sure we have enough memory before mapping */ - pages_requested = - (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE; - - /* account for each buffer (and max_tr) */ - pages_requested *= tracing_nr_buffers * 2; - - /* Check for overflow */ - if (pages_requested < 0) { - cnt = -ENOMEM; - goto out; - } - - freeable_pages = determine_dirtyable_memory(); - - /* we only allow to request 1/4 of useable memory */ - if (pages_requested > - ((freeable_pages + tracing_pages_allocated) / 4)) { - cnt = -ENOMEM; + if (val != global_trace.entries) { + ret = ring_buffer_resize(global_trace.buffer, val); + if (ret < 0) { + cnt = ret; goto out; } - while (global_trace.entries < val) { - if (trace_alloc_page()) { - cnt = -ENOMEM; - goto out; + ret = ring_buffer_resize(max_tr.buffer, val); + if (ret < 0) { + int r; + cnt = ret; + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* AARGH! We are left with different + * size max buffer!!!! */ + WARN_ON(1); + tracing_disabled = 1; } - /* double check that we don't go over the known pages */ - if (tracing_pages_allocated > pages_requested) - break; + goto out; } - } else { - /* include the number of entries in val (inc of page entries) */ - while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1)) - trace_free_page(); + global_trace.entries = val; } - /* check integrity */ - for_each_tracing_cpu(i) - check_pages(global_trace.data[i]); - filp->f_pos += cnt; /* If check pages failed, return ENOMEM */ @@ -2769,6 +2732,52 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } +static int mark_printk(const char *fmt, ...) +{ + int ret; + va_list args; + va_start(args, fmt); + ret = trace_vprintk(0, fmt, args); + va_end(args); + return ret; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char *buf; + char *end; + struct trace_array *tr = &global_trace; + + if (!tr->ctrl || tracing_disabled) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + buf = kmalloc(cnt + 1, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + kfree(buf); + return -EFAULT; + } + + /* Cut from the first nil or newline. */ + buf[cnt] = '\0'; + end = strchr(buf, '\n'); + if (end) + *end = '\0'; + + cnt = mark_printk("%s\n", buf); + kfree(buf); + *fpos += cnt; + + return cnt; +} + static struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, @@ -2800,6 +2809,11 @@ static struct file_operations tracing_entries_fops = { .write = tracing_entries_write, }; +static struct file_operations tracing_mark_fops = { + .open = tracing_open_generic, + .write = tracing_mark_write, +}; + #ifdef CONFIG_DYNAMIC_FTRACE static ssize_t @@ -2846,7 +2860,7 @@ struct dentry *tracing_init_dentry(void) #include "trace_selftest.c" #endif -static __init void tracer_init_debugfs(void) +static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; struct dentry *entry; @@ -2881,12 +2895,12 @@ static __init void tracer_init_debugfs(void) entry = debugfs_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + pr_warning("Could not create debugfs 'available_tracers' entry\n"); entry = debugfs_create_file("current_tracer", 0444, d_tracer, &global_trace, &set_tracer_fops); if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + pr_warning("Could not create debugfs 'current_tracer' entry\n"); entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, @@ -2899,7 +2913,7 @@ static __init void tracer_init_debugfs(void) &tracing_thresh, &tracing_max_lat_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'tracing_thresh' entry\n"); entry = debugfs_create_file("README", 0644, d_tracer, NULL, &tracing_readme_fops); if (!entry) @@ -2909,13 +2923,19 @@ static __init void tracer_init_debugfs(void) NULL, &tracing_pipe_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'trace_pipe' entry\n"); entry = debugfs_create_file("trace_entries", 0644, d_tracer, &global_trace, &tracing_entries_fops); if (!entry) pr_warning("Could not create debugfs " - "'tracing_threash' entry\n"); + "'trace_entries' entry\n"); + + entry = debugfs_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'trace_marker' entry\n"); #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, @@ -2928,230 +2948,263 @@ static __init void tracer_init_debugfs(void) #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + return 0; } -static int trace_alloc_page(void) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { + static DEFINE_SPINLOCK(trace_buf_lock); + static char trace_buf[TRACE_BUF_SIZE]; + + struct ring_buffer_event *event; + struct trace_array *tr = &global_trace; struct trace_array_cpu *data; - struct page *page, *tmp; - LIST_HEAD(pages); - void *array; - unsigned pages_allocated = 0; - int i; + struct print_entry *entry; + unsigned long flags, irq_flags; + int cpu, len = 0, size, pc; - /* first allocate a page for each CPU */ - for_each_tracing_cpu(i) { - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } + if (!tr->ctrl || tracing_disabled) + return 0; - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_pages; - } - pages_allocated++; - page = virt_to_page(array); - list_add(&page->lru, &pages); -#endif - } + if (unlikely(atomic_read(&data->disabled))) + goto out; - /* Now that we successfully allocate a page per CPU, add them */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - ClearPageLRU(page); + spin_lock_irqsave(&trace_buf_lock, flags); + len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - page = list_entry(pages.next, struct page, lru); - list_del_init(&page->lru); - list_add_tail(&page->lru, &data->trace_pages); - SetPageLRU(page); -#endif - } - tracing_pages_allocated += pages_allocated; - global_trace.entries += ENTRIES_PER_PAGE; + len = min(len, TRACE_BUF_SIZE-1); + trace_buf[len] = 0; - return 0; + size = sizeof(*entry) + len + 1; + event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, flags, pc); + entry->ent.type = TRACE_PRINT; + entry->ip = ip; - free_pages: - list_for_each_entry_safe(page, tmp, &pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - return -ENOMEM; + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = 0; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + out_unlock: + spin_unlock_irqrestore(&trace_buf_lock, flags); + + out: + preempt_enable_notrace(); + + return len; } +EXPORT_SYMBOL_GPL(trace_vprintk); -static int trace_free_page(void) +int __ftrace_printk(unsigned long ip, const char *fmt, ...) { - struct trace_array_cpu *data; - struct page *page; - struct list_head *p; - int i; - int ret = 0; + int ret; + va_list ap; - /* free one page from each buffer */ - for_each_tracing_cpu(i) { - data = global_trace.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - tracing_pages_allocated--; - tracing_pages_allocated--; - __free_page(page); + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; - tracing_reset(data); + va_start(ap, fmt); + ret = trace_vprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__ftrace_printk); -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - p = data->trace_pages.next; - if (p == &data->trace_pages) { - /* should never happen */ - WARN_ON(1); - tracing_disabled = 1; - ret = -1; - break; - } - page = list_entry(p, struct page, lru); - ClearPageLRU(page); - list_del(&page->lru); - __free_page(page); +static int trace_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + ftrace_dump(); + return NOTIFY_OK; +} - tracing_reset(data); -#endif - } - global_trace.entries -= ENTRIES_PER_PAGE; +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; - return ret; +static int trace_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + switch (val) { + case DIE_OOPS: + ftrace_dump(); + break; + default: + break; + } + return NOTIFY_OK; } -__init static int tracer_alloc_buffers(void) +static struct notifier_block trace_die_notifier = { + .notifier_call = trace_die_handler, + .priority = 200 +}; + +/* + * printk is set to max of 1024, we really don't need it that big. + * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT 1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. + */ +#define KERN_TRACE KERN_INFO + +static void +trace_printk_seq(struct trace_seq *s) { - struct trace_array_cpu *data; - void *array; - struct page *page; - int pages = 0; - int ret = -ENOMEM; - int i; + /* Probably should print a warning here. */ + if (s->len >= 1000) + s->len = 1000; - /* TODO: make the number of buffers hot pluggable with CPUS */ - tracing_nr_buffers = num_possible_cpus(); - tracing_buffer_mask = cpu_possible_map; + /* should be zero ended, but we are paranoid. */ + s->buffer[s->len] = 0; - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_data, i); + printk(KERN_TRACE "%s", s->buffer); - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } + trace_seq_reset(s); +} + + +void ftrace_dump(void) +{ + static DEFINE_SPINLOCK(ftrace_dump_lock); + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + static cpumask_t mask; + static int dump_ran; + unsigned long flags; + int cnt = 0, cpu; - /* set the array to the list */ - INIT_LIST_HEAD(&data->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &data->trace_pages); - /* use the LRU flag to differentiate the two buffers */ - ClearPageLRU(page); + /* only one dump */ + spin_lock_irqsave(&ftrace_dump_lock, flags); + if (dump_ran) + goto out; - data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + dump_ran = 1; -/* Only allocate if we are actually using the max trace */ -#ifdef CONFIG_TRACER_MAX_TRACE - array = (void *)__get_free_page(GFP_KERNEL); - if (array == NULL) { - printk(KERN_ERR "tracer: failed to allocate page" - "for trace buffer!\n"); - goto free_buffers; - } + /* No turning back! */ + ftrace_kill_atomic(); - INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); - page = virt_to_page(array); - list_add(&page->lru, &max_tr.data[i]->trace_pages); - SetPageLRU(page); -#endif + for_each_tracing_cpu(cpu) { + atomic_inc(&global_trace.data[cpu]->disabled); } + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + + iter.tr = &global_trace; + iter.trace = current_trace; + /* - * Since we allocate by orders of pages, we may be able to - * round up a bit. + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. */ - global_trace.entries = ENTRIES_PER_PAGE; - pages++; - while (global_trace.entries < trace_nr_entries) { - if (trace_alloc_page()) - break; - pages++; + cpus_clear(mask); + + while (!trace_empty(&iter)) { + + if (!cnt) + printk(KERN_TRACE "---------------------------------\n"); + + cnt++; + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (find_next_entry_inc(&iter) != NULL) { + print_trace_line(&iter); + trace_consume(&iter); + } + + trace_printk_seq(&iter.seq); } - max_tr.entries = global_trace.entries; - pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", - pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); - pr_info(" actual entries %ld\n", global_trace.entries); + if (!cnt) + printk(KERN_TRACE " (ftrace buffer empty)\n"); + else + printk(KERN_TRACE "---------------------------------\n"); + + out: + spin_unlock_irqrestore(&ftrace_dump_lock, flags); +} + +__init static int tracer_alloc_buffers(void) +{ + struct trace_array_cpu *data; + int i; + + /* TODO: make the number of buffers hot pluggable with CPUS */ + tracing_buffer_mask = cpu_possible_map; + + global_trace.buffer = ring_buffer_alloc(trace_buf_size, + TRACE_BUFFER_FLAGS); + if (!global_trace.buffer) { + printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); + WARN_ON(1); + return 0; + } + global_trace.entries = ring_buffer_size(global_trace.buffer); - tracer_init_debugfs(); +#ifdef CONFIG_TRACER_MAX_TRACE + max_tr.buffer = ring_buffer_alloc(trace_buf_size, + TRACE_BUFFER_FLAGS); + if (!max_tr.buffer) { + printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); + WARN_ON(1); + ring_buffer_free(global_trace.buffer); + return 0; + } + max_tr.entries = ring_buffer_size(max_tr.buffer); + WARN_ON(max_tr.entries != global_trace.entries); +#endif + + /* Allocate the first page for all buffers */ + for_each_tracing_cpu(i) { + data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_data, i); + } trace_init_cmdlines(); - register_tracer(&no_tracer); - current_trace = &no_tracer; + register_tracer(&nop_trace); +#ifdef CONFIG_BOOT_TRACER + register_tracer(&boot_tracer); + current_trace = &boot_tracer; + current_trace->init(&global_trace); +#else + current_trace = &nop_trace; +#endif /* All seems OK, enable tracing */ global_trace.ctrl = tracer_enabled; tracing_disabled = 0; - return 0; + atomic_notifier_chain_register(&panic_notifier_list, + &trace_panic_notifier); - free_buffers: - for (i-- ; i >= 0; i--) { - struct page *page, *tmp; - struct trace_array_cpu *data = global_trace.data[i]; + register_die_notifier(&trace_die_notifier); - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } - -#ifdef CONFIG_TRACER_MAX_TRACE - data = max_tr.data[i]; - if (data) { - list_for_each_entry_safe(page, tmp, - &data->trace_pages, lru) { - list_del_init(&page->lru); - __free_page(page); - } - } -#endif - } - return ret; + return 0; } -fs_initcall(tracer_alloc_buffers); +early_initcall(tracer_alloc_buffers); +fs_initcall(tracer_init_debugfs); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69f867..f1f9957 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,7 +5,9 @@ #include <asm/atomic.h> #include <linux/sched.h> #include <linux/clocksource.h> +#include <linux/ring_buffer.h> #include <linux/mmiotrace.h> +#include <linux/ftrace.h> enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -13,38 +15,60 @@ enum trace_type { TRACE_FN, TRACE_CTX, TRACE_WAKE, + TRACE_CONT, TRACE_STACK, + TRACE_PRINT, TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, + TRACE_BOOT, __TRACE_LAST_TYPE }; /* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char cpu; + unsigned char flags; + unsigned char preempt_count; + int pid; +}; + +/* * Function trace entry - function address and parent function addres: */ struct ftrace_entry { + struct trace_entry ent; unsigned long ip; unsigned long parent_ip; }; +extern struct tracer boot_tracer; /* * Context switch trace entry - which task (and prio) we switched from/to: */ struct ctx_switch_entry { + struct trace_entry ent; unsigned int prev_pid; unsigned char prev_prio; unsigned char prev_state; unsigned int next_pid; unsigned char next_prio; unsigned char next_state; + unsigned int next_cpu; }; /* * Special (free-form) trace entry: */ struct special_entry { + struct trace_entry ent; unsigned long arg1; unsigned long arg2; unsigned long arg3; @@ -57,33 +81,60 @@ struct special_entry { #define FTRACE_STACK_ENTRIES 8 struct stack_entry { + struct trace_entry ent; unsigned long caller[FTRACE_STACK_ENTRIES]; }; /* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + * ftrace_printk entry: */ -struct trace_entry { - char type; - char cpu; - char flags; - char preempt_count; - int pid; - cycle_t t; - union { - struct ftrace_entry fn; - struct ctx_switch_entry ctx; - struct special_entry special; - struct stack_entry stack; - struct mmiotrace_rw mmiorw; - struct mmiotrace_map mmiomap; - }; +struct print_entry { + struct trace_entry ent; + unsigned long ip; + char buf[]; +}; + +#define TRACE_OLD_SIZE 88 + +struct trace_field_cont { + unsigned char type; + /* Temporary till we get rid of this completely */ + char buf[TRACE_OLD_SIZE - 1]; +}; + +struct trace_mmiotrace_rw { + struct trace_entry ent; + struct mmiotrace_rw rw; }; -#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) +struct trace_mmiotrace_map { + struct trace_entry ent; + struct mmiotrace_map map; +}; + +struct trace_boot { + struct trace_entry ent; + struct boot_trace initcall; +}; + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * NEED_RESCED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + * CONT - multiple entries hold the trace item + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_NEED_RESCHED = 0x02, + TRACE_FLAG_HARDIRQ = 0x04, + TRACE_FLAG_SOFTIRQ = 0x08, + TRACE_FLAG_CONT = 0x10, +}; + +#define TRACE_BUF_SIZE 1024 /* * The CPU trace array - it consists of thousands of trace entries @@ -91,16 +142,9 @@ struct trace_entry { * the trace, etc.) */ struct trace_array_cpu { - struct list_head trace_pages; atomic_t disabled; - raw_spinlock_t lock; - struct lock_class_key lock_key; /* these fields get copied into max-trace: */ - unsigned trace_head_idx; - unsigned trace_tail_idx; - void *trace_head; /* producer */ - void *trace_tail; /* consumer */ unsigned long trace_idx; unsigned long overrun; unsigned long saved_latency; @@ -124,6 +168,7 @@ struct trace_iterator; * They have on/off state as well: */ struct trace_array { + struct ring_buffer *buffer; unsigned long entries; long ctrl; int cpu; @@ -132,6 +177,56 @@ struct trace_array { struct trace_array_cpu *data[NR_CPUS]; }; +#define FTRACE_CMP_TYPE(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id && (entry)->type != id); \ + break; \ + } + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + * Where "type" is the trace type that includes the trace_entry + * as the "ent" item. And "id" is the trace identifier that is + * used in the trace_type enum. + * + * If the type can have more than one id, then use zero. + */ +#define trace_assign_type(var, ent) \ + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct special_entry, 0); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ + TRACE_MMIO_MAP); \ + IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ + __ftrace_bad_type(); \ + } while (0) + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ +}; + /* * A specific tracer, represented by methods that operate on a trace array: */ @@ -152,7 +247,7 @@ struct tracer { int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif - int (*print_line)(struct trace_iterator *iter); + enum print_line_t (*print_line)(struct trace_iterator *iter); struct tracer *next; int print_max; }; @@ -171,57 +266,58 @@ struct trace_iterator { struct trace_array *tr; struct tracer *trace; void *private; - long last_overrun[NR_CPUS]; - long overrun[NR_CPUS]; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; /* The below is zeroed out in pipe_read */ struct trace_seq seq; struct trace_entry *ent; int cpu; - - struct trace_entry *prev_ent; - int prev_cpu; + u64 ts; unsigned long iter_flags; loff_t pos; - unsigned long next_idx[NR_CPUS]; - struct list_head *next_page[NR_CPUS]; - unsigned next_page_idx[NR_CPUS]; long idx; }; -void tracing_reset(struct trace_array_cpu *data); +void trace_wake_up(void); +void tracing_reset(struct trace_array *tr, int cpu); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); + void ftrace(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, - unsigned long flags); + unsigned long flags, int pc); void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags); + unsigned long flags, int pc); void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *cur, - unsigned long flags); + unsigned long flags, int pc); void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, unsigned long arg2, - unsigned long arg3); + unsigned long arg3, int pc); void trace_function(struct trace_array *tr, struct trace_array_cpu *data, unsigned long ip, unsigned long parent_ip, - unsigned long flags); + unsigned long flags, int pc); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); @@ -268,51 +364,33 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif -#ifdef CONFIG_MMIOTRACE -extern void __trace_mmiotrace_rw(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_rw *rw); -extern void __trace_mmiotrace_map(struct trace_array *tr, - struct trace_array_cpu *data, - struct mmiotrace_map *map); -#endif - #ifdef CONFIG_FTRACE_STARTUP_TEST -#ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_IRQSOFF_TRACER extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_PREEMPT_TRACER extern int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr); -#endif -#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_SCHED_TRACER extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_CONTEXT_SWITCH_TRACER +extern int trace_selftest_startup_nop(struct tracer *trace, + struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -#endif -#ifdef CONFIG_SYSPROF_TRACER extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); -#endif #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); +extern void trace_seq_print_cont(struct trace_seq *s, + struct trace_iterator *iter); extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt); extern long ns2usecs(cycle_t nsec); +extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; @@ -334,6 +412,9 @@ enum trace_iterator_flags { TRACE_ITER_BLOCK = 0x80, TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, + TRACE_ITER_PRINTK = 0x400, }; +extern struct tracer nop_trace; + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c new file mode 100644 index 0000000..d0a5e50 --- /dev/null +++ b/kernel/trace/trace_boot.c @@ -0,0 +1,126 @@ +/* + * ring buffer based initcalls tracer + * + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> + * + */ + +#include <linux/init.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/kallsyms.h> + +#include "trace.h" + +static struct trace_array *boot_trace; +static int trace_boot_enabled; + + +/* Should be started after do_pre_smp_initcalls() in init/main.c */ +void start_boot_trace(void) +{ + trace_boot_enabled = 1; +} + +void stop_boot_trace(void) +{ + trace_boot_enabled = 0; +} + +void reset_boot_trace(struct trace_array *tr) +{ + stop_boot_trace(); +} + +static void boot_trace_init(struct trace_array *tr) +{ + int cpu; + boot_trace = tr; + + trace_boot_enabled = 0; + + for_each_cpu_mask(cpu, cpu_possible_map) + tracing_reset(tr, cpu); +} + +static void boot_trace_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_boot_trace(); + else + stop_boot_trace(); +} + +static enum print_line_t initcall_print_line(struct trace_iterator *iter) +{ + int ret; + struct trace_entry *entry = iter->ent; + struct trace_boot *field = (struct trace_boot *)entry; + struct boot_trace *it = &field->initcall; + struct trace_seq *s = &iter->seq; + struct timespec calltime = ktime_to_timespec(it->calltime); + struct timespec rettime = ktime_to_timespec(it->rettime); + + if (entry->type == TRACE_BOOT) { + ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", + calltime.tv_sec, + calltime.tv_nsec, + it->func, it->caller); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " + "returned %d after %lld msecs\n", + rettime.tv_sec, + rettime.tv_nsec, + it->func, it->result, it->duration); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +struct tracer boot_tracer __read_mostly = +{ + .name = "initcall", + .init = boot_trace_init, + .reset = reset_boot_trace, + .ctrl_update = boot_trace_ctrl_update, + .print_line = initcall_print_line, +}; + +void trace_boot(struct boot_trace *it, initcall_t fn) +{ + struct ring_buffer_event *event; + struct trace_boot *entry; + struct trace_array_cpu *data; + unsigned long irq_flags; + struct trace_array *tr = boot_trace; + + if (!trace_boot_enabled) + return; + + /* Get its name now since this function could + * disappear because it is in the .init section. + */ + sprint_symbol(it->func, (unsigned long)fn); + preempt_disable(); + data = tr->data[smp_processor_id()]; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + entry->ent.type = TRACE_BOOT; + entry->initcall = *it; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); + + out: + preempt_enable(); +} diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3121448..e90eb0c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,7 +23,7 @@ static void function_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void start_function_trace(struct trace_array *tr) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index ece6cfb..a7db7f0 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr, unsigned long latency, t0, t1; cycle_t T0, T1, delta; unsigned long flags; + int pc; /* * usecs conversion is slow so we try to delay the conversion @@ -141,6 +142,8 @@ check_critical_timing(struct trace_array *tr, local_save_flags(flags); + pc = preempt_count(); + if (!report_latency(delta)) goto out; @@ -150,7 +153,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); latency = nsecs_to_usecs(delta); @@ -173,8 +176,8 @@ out_unlock: out: data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); - tracing_reset(data); - trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); + tracing_reset(tr, cpu); + trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); } static inline void @@ -203,11 +206,11 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) data->critical_sequence = max_sequence; data->preempt_timestamp = ftrace_now(cpu); data->critical_start = parent_ip ? : ip; - tracing_reset(data); + tracing_reset(tr, cpu); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); per_cpu(tracing_cpu, cpu) = 1; @@ -234,14 +237,14 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) data = tr->data[cpu]; - if (unlikely(!data) || unlikely(!head_page(data)) || + if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); local_save_flags(flags); - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, preempt_count()); check_critical_timing(tr, data, parent_ip ? : ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index b13dc19..f284846 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void mmio_trace_init(struct trace_array *tr) @@ -130,10 +130,14 @@ static unsigned long count_overruns(struct trace_iterator *iter) { int cpu; unsigned long cnt = 0; +/* FIXME: */ +#if 0 for_each_online_cpu(cpu) { cnt += iter->overrun[cpu]; iter->overrun[cpu] = 0; } +#endif + (void)cpu; return cnt; } @@ -171,17 +175,21 @@ print_out: return (ret == -EBUSY) ? 0 : ret; } -static int mmio_print_rw(struct trace_iterator *iter) +static enum print_line_t mmio_print_rw(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_mmiotrace_rw *field; + struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; int ret = 1; - switch (entry->mmiorw.opcode) { + trace_assign_type(field, entry); + rw = &field->rw; + + switch (rw->opcode) { case MMIO_READ: ret = trace_seq_printf(s, "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", @@ -209,21 +217,25 @@ static int mmio_print_rw(struct trace_iterator *iter) break; } if (ret) - return 1; - return 0; + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; } -static int mmio_print_map(struct trace_iterator *iter) +static enum print_line_t mmio_print_map(struct trace_iterator *iter) { struct trace_entry *entry = iter->ent; - struct mmiotrace_map *m = &entry->mmiomap; + struct trace_mmiotrace_map *field; + struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; - unsigned long long t = ns2usecs(entry->t); + unsigned long long t = ns2usecs(iter->ts); unsigned long usec_rem = do_div(t, 1000000ULL); unsigned secs = (unsigned long)t; - int ret = 1; + int ret; - switch (entry->mmiorw.opcode) { + trace_assign_type(field, entry); + m = &field->map; + + switch (m->opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", @@ -241,20 +253,43 @@ static int mmio_print_map(struct trace_iterator *iter) break; } if (ret) - return 1; - return 0; + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret; + + /* The trailing newline must be in the message. */ + ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (entry->flags & TRACE_FLAG_CONT) + trace_seq_print_cont(s, iter); + + return TRACE_TYPE_HANDLED; } -/* return 0 to abort printing without consuming current entry in pipe mode */ -static int mmio_print_line(struct trace_iterator *iter) +static enum print_line_t mmio_print_line(struct trace_iterator *iter) { switch (iter->ent->type) { case TRACE_MMIO_RW: return mmio_print_rw(iter); case TRACE_MMIO_MAP: return mmio_print_map(iter); + case TRACE_PRINT: + return mmio_print_mark(iter); default: - return 1; /* ignore unknown entries */ + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ } } @@ -276,6 +311,27 @@ __init static int init_mmio_trace(void) } device_initcall(init_mmio_trace); +static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; + unsigned long irq_flags; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); + entry->ent.type = TRACE_MMIO_RW; + entry->rw = *rw; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; @@ -283,6 +339,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw) __trace_mmiotrace_rw(tr, data, rw); } +static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct ring_buffer_event *event; + struct trace_mmiotrace_map *entry; + unsigned long irq_flags; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), + &irq_flags); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, preempt_count()); + entry->ent.type = TRACE_MMIO_MAP; + entry->map = *map; + ring_buffer_unlock_commit(tr->buffer, event, irq_flags); + + trace_wake_up(); +} + void mmio_trace_mapping(struct mmiotrace_map *map) { struct trace_array *tr = mmio_trace_array; @@ -293,3 +370,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map) __trace_mmiotrace_map(tr, data, map); preempt_enable(); } + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 0000000..4592b48 --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,64 @@ +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net> + * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> + +#include "trace.h" + +static struct trace_array *ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void nop_trace_init(struct trace_array *tr) +{ + int cpu; + ctx_trace = tr; + + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + if (tr->ctrl) + start_nop_trace(tr); +} + +static void nop_trace_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_nop_trace(tr); +} + +static void nop_trace_ctrl_update(struct trace_array *tr) +{ + /* When starting a new trace, reset the buffers */ + if (tr->ctrl) + start_nop_trace(tr); + else + stop_nop_trace(tr); +} + +struct tracer nop_trace __read_mostly = +{ + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, + .ctrl_update = nop_trace_ctrl_update, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, +#endif +}; + diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb817a2..b8f56be 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -9,8 +9,8 @@ #include <linux/debugfs.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> -#include <linux/marker.h> #include <linux/ftrace.h> +#include <trace/sched.h> #include "trace.h" @@ -19,15 +19,16 @@ static int __read_mostly tracer_enabled; static atomic_t sched_ref; static void -sched_switch_func(void *private, void *__rq, struct task_struct *prev, +probe_sched_switch(struct rq *__rq, struct task_struct *prev, struct task_struct *next) { - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; - long disabled; int cpu; + int pc; + + if (!atomic_read(&sched_ref)) + return; tracing_record_cmdline(prev); tracing_record_cmdline(next); @@ -35,97 +36,41 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev, if (!tracer_enabled) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + data = ctx_trace->data[cpu]; - if (likely(disabled == 1)) - tracing_sched_switch_trace(tr, data, prev, next, flags); + if (likely(!atomic_read(&data->disabled))) + tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); - atomic_dec(&data->disabled); local_irq_restore(flags); } -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - if (!atomic_read(&sched_ref)) - return; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - sched_switch_func(probe_data, __rq, prev, next); -} - static void -wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct - task_struct *curr) +probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee) { - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; + int cpu, pc; - if (!tracer_enabled) + if (!likely(tracer_enabled)) return; - tracing_record_cmdline(curr); + pc = preempt_count(); + tracing_record_cmdline(current); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + data = ctx_trace->data[cpu]; - if (likely(disabled == 1)) - tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + if (likely(!atomic_read(&data->disabled))) + tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, + flags, pc); - atomic_dec(&data->disabled); local_irq_restore(flags); } -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_func(probe_data, __rq, task, curr); -} - static void sched_switch_reset(struct trace_array *tr) { int cpu; @@ -133,67 +78,47 @@ static void sched_switch_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static int tracing_sched_register(void) { int ret; - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); + ret = register_trace_sched_wakeup(probe_sched_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &ctx_trace); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &ctx_trace); + ret = register_trace_sched_switch(probe_sched_switch); if (ret) { - pr_info("sched trace: Couldn't add marker" + pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_schedule\n"); goto fail_deprobe_wake_new; } return ret; fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_wakeup_new(probe_sched_wakeup); fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_wakeup(probe_sched_wakeup); return ret; } static void tracing_sched_unregister(void) { - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &ctx_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &ctx_trace); + unregister_trace_sched_switch(probe_sched_switch); + unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e303ccb..fe4a252 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include <linux/kallsyms.h> #include <linux/uaccess.h> #include <linux/ftrace.h> -#include <linux/marker.h> +#include <trace/sched.h> #include "trace.h" @@ -44,10 +44,12 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int resched; int cpu; + int pc; if (likely(!wakeup_task)) return; + pc = preempt_count(); resched = need_resched(); preempt_disable_notrace(); @@ -70,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) if (task_cpu(wakeup_task) != cpu) goto unlock; - trace_function(tr, data, ip, parent_ip, flags); + trace_function(tr, data, ip, parent_ip, flags, pc); unlock: __raw_spin_unlock(&wakeup_lock); @@ -112,17 +114,18 @@ static int report_latency(cycle_t delta) } static void notrace -wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, +probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { unsigned long latency = 0, t0 = 0, t1 = 0; - struct trace_array **ptr = private; - struct trace_array *tr = *ptr; struct trace_array_cpu *data; cycle_t T0, T1, delta; unsigned long flags; long disabled; int cpu; + int pc; + + tracing_record_cmdline(prev); if (unlikely(!tracer_enabled)) return; @@ -139,12 +142,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (next != wakeup_task) return; + pc = preempt_count(); + /* The task we are waiting for is waking up */ - data = tr->data[wakeup_cpu]; + data = wakeup_trace->data[wakeup_cpu]; /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&tr->data[cpu]->disabled); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (likely(disabled != 1)) goto out; @@ -155,7 +160,7 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); + trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); /* * usecs conversion is slow so we try to delay the conversion @@ -174,39 +179,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, t0 = nsecs_to_usecs(T0); t1 = nsecs_to_usecs(T1); - update_max_tr(tr, wakeup_task, wakeup_cpu); + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); out_unlock: - __wakeup_reset(tr); + __wakeup_reset(wakeup_trace); __raw_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -sched_switch_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct task_struct *prev; - struct task_struct *next; - struct rq *__rq; - - /* skip prev_pid %d next_pid %d prev_state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, int); - (void)va_arg(*args, long); - __rq = va_arg(*args, typeof(__rq)); - prev = va_arg(*args, typeof(prev)); - next = va_arg(*args, typeof(next)); - - tracing_record_cmdline(prev); - - /* - * If tracer_switch_func only points to the local - * switch func, it still needs the ptr passed to it. - */ - wakeup_sched_switch(probe_data, __rq, prev, next); + atomic_dec(&wakeup_trace->data[cpu]->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -216,7 +196,7 @@ static void __wakeup_reset(struct trace_array *tr) for_each_possible_cpu(cpu) { data = tr->data[cpu]; - tracing_reset(data); + tracing_reset(tr, cpu); } wakeup_cpu = -1; @@ -240,19 +220,26 @@ static void wakeup_reset(struct trace_array *tr) } static void -wakeup_check_start(struct trace_array *tr, struct task_struct *p, - struct task_struct *curr) +probe_wakeup(struct rq *rq, struct task_struct *p) { int cpu = smp_processor_id(); unsigned long flags; long disabled; + int pc; + + if (likely(!tracer_enabled)) + return; + + tracing_record_cmdline(p); + tracing_record_cmdline(current); if (likely(!rt_task(p)) || p->prio >= wakeup_prio || - p->prio >= curr->prio) + p->prio >= current->prio) return; - disabled = atomic_inc_return(&tr->data[cpu]->disabled); + pc = preempt_count(); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); if (unlikely(disabled != 1)) goto out; @@ -264,7 +251,7 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, goto out_locked; /* reset the trace */ - __wakeup_reset(tr); + __wakeup_reset(wakeup_trace); wakeup_cpu = task_cpu(p); wakeup_prio = p->prio; @@ -274,74 +261,37 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p, local_save_flags(flags); - tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); - trace_function(tr, tr->data[wakeup_cpu], - CALLER_ADDR1, CALLER_ADDR2, flags); + wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); + trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], + CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: __raw_spin_unlock(&wakeup_lock); out: - atomic_dec(&tr->data[cpu]->disabled); -} - -static notrace void -wake_up_callback(void *probe_data, void *call_data, - const char *format, va_list *args) -{ - struct trace_array **ptr = probe_data; - struct trace_array *tr = *ptr; - struct task_struct *curr; - struct task_struct *task; - struct rq *__rq; - - if (likely(!tracer_enabled)) - return; - - /* Skip pid %d state %ld */ - (void)va_arg(*args, int); - (void)va_arg(*args, long); - /* now get the meat: "rq %p task %p rq->curr %p" */ - __rq = va_arg(*args, typeof(__rq)); - task = va_arg(*args, typeof(task)); - curr = va_arg(*args, typeof(curr)); - - tracing_record_cmdline(task); - tracing_record_cmdline(curr); - - wakeup_check_start(tr, task, curr); + atomic_dec(&wakeup_trace->data[cpu]->disabled); } static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); + ret = register_trace_sched_wakeup(probe_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p rq->curr %p", - wake_up_callback, - &wakeup_trace); + ret = register_trace_sched_wakeup_new(probe_wakeup); if (ret) { - pr_info("wakeup trace: Couldn't add marker" + pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = marker_probe_register("kernel_sched_schedule", - "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", - sched_switch_callback, - &wakeup_trace); + ret = register_trace_sched_switch(probe_wakeup_sched_switch); if (ret) { - pr_info("sched trace: Couldn't add marker" + pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_schedule\n"); goto fail_deprobe_wake_new; } @@ -363,28 +313,18 @@ static void start_wakeup_tracer(struct trace_array *tr) return; fail_deprobe_wake_new: - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_wakeup_new(probe_wakeup); fail_deprobe: - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_wakeup(probe_wakeup); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; unregister_ftrace_function(&trace_ops); - marker_probe_unregister("kernel_sched_schedule", - sched_switch_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup_new", - wake_up_callback, - &wakeup_trace); - marker_probe_unregister("kernel_sched_wakeup", - wake_up_callback, - &wakeup_trace); + unregister_trace_sched_switch(probe_wakeup_sched_switch); + unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup); } static void wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 0911b7e..09cf230 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -9,65 +9,29 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_FN: case TRACE_CTX: case TRACE_WAKE: + case TRACE_CONT: case TRACE_STACK: + case TRACE_PRINT: case TRACE_SPECIAL: return 1; } return 0; } -static int -trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data) +static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) { - struct trace_entry *entries; - struct page *page; - int idx = 0; - int i; + struct ring_buffer_event *event; + struct trace_entry *entry; - BUG_ON(list_empty(&data->trace_pages)); - page = list_entry(data->trace_pages.next, struct page, lru); - entries = page_address(page); + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { + entry = ring_buffer_event_data(event); - check_pages(data); - if (head_page(data) != entries) - goto failed; - - /* - * The starting trace buffer always has valid elements, - * if any element exists. - */ - entries = head_page(data); - - for (i = 0; i < tr->entries; i++) { - - if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) { + if (!trace_valid_entry(entry)) { printk(KERN_CONT ".. invalid entry %d ", - entries[idx].type); + entry->type); goto failed; } - - idx++; - if (idx >= ENTRIES_PER_PAGE) { - page = virt_to_page(entries); - if (page->lru.next == &data->trace_pages) { - if (i != tr->entries - 1) { - printk(KERN_CONT ".. entries buffer mismatch"); - goto failed; - } - } else { - page = list_entry(page->lru.next, struct page, lru); - entries = page_address(page); - } - idx = 0; - } } - - page = virt_to_page(entries); - if (page->lru.next != &data->trace_pages) { - printk(KERN_CONT ".. too many entries"); - goto failed; - } - return 0; failed: @@ -89,13 +53,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) /* Don't allow flipping of max traces now */ raw_local_irq_save(flags); __raw_spin_lock(&ftrace_max_lock); - for_each_possible_cpu(cpu) { - if (!head_page(tr->data[cpu])) - continue; - cnt += tr->data[cpu]->trace_idx; + cnt = ring_buffer_entries(tr->buffer); - ret = trace_test_buffer_cpu(tr, tr->data[cpu]); + for_each_possible_cpu(cpu) { + ret = trace_test_buffer_cpu(tr, cpu); if (ret) break; } @@ -120,11 +82,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, struct trace_array *tr, int (*func)(void)) { - unsigned long count; - int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + unsigned long count; char *func_name; + int ret; /* The ftrace test PASSED */ printk(KERN_CONT "PASSED\n"); @@ -157,6 +119,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, /* enable tracing */ tr->ctrl = 1; trace->init(tr); + /* Sleep for a 1/10 of a second */ msleep(100); @@ -212,10 +175,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; int save_ftrace_enabled = ftrace_enabled; int save_tracer_enabled = tracer_enabled; + unsigned long count; + int ret; /* make sure msleep has been recorded */ msleep(1); @@ -415,6 +378,15 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * } #endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ + /* What could possibly go wrong? */ + return 0; +} +#endif + #ifdef CONFIG_SCHED_TRACER static int trace_wakeup_test_thread(void *data) { @@ -486,6 +458,9 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) wake_up_process(p); + /* give a little time to let the thread wake up */ + msleep(100); + /* stop the tracing. */ tr->ctrl = 0; trace->ctrl_update(tr); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 0000000..74c5d9a --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,310 @@ +/* + * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> + * + */ +#include <linux/stacktrace.h> +#include <linux/kallsyms.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> +#include <linux/ftrace.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/fs.h> +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = + { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; + +static struct stack_trace max_stack_trace = { + .max_entries = STACK_TRACE_ENTRIES, + .entries = stack_dump_trace, +}; + +static unsigned long max_stack_size; +static raw_spinlock_t max_stack_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static int stack_trace_disabled __read_mostly; +static DEFINE_PER_CPU(int, trace_active); + +static inline void check_stack(void) +{ + unsigned long this_size, flags; + unsigned long *p, *top, *start; + int i; + + this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = THREAD_SIZE - this_size; + + if (this_size <= max_stack_size) + return; + + raw_local_irq_save(flags); + __raw_spin_lock(&max_stack_lock); + + /* a race could have already updated it */ + if (this_size <= max_stack_size) + goto out; + + max_stack_size = this_size; + + max_stack_trace.nr_entries = 0; + max_stack_trace.skip = 3; + + save_stack_trace(&max_stack_trace); + + /* + * Now find where in the stack these are. + */ + i = 0; + start = &this_size; + top = (unsigned long *) + (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + + /* + * Loop through all the entries. One of the entries may + * for some reason be missed on the stack, so we may + * have to account for them. If they are all there, this + * loop will only happen once. This code only takes place + * on a new max, so it is far from a fast path. + */ + while (i < max_stack_trace.nr_entries) { + + stack_dump_index[i] = this_size; + p = start; + + for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (*p == stack_dump_trace[i]) { + this_size = stack_dump_index[i++] = + (top - p) * sizeof(unsigned long); + /* Start the search from here */ + start = p + 1; + } + } + + i++; + } + + out: + __raw_spin_unlock(&max_stack_lock); + raw_local_irq_restore(flags); +} + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + int cpu, resched; + + if (unlikely(!ftrace_enabled || stack_trace_disabled)) + return; + + resched = need_resched(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + /* no atomic needed, we only modify this variable by this cpu */ + if (per_cpu(trace_active, cpu)++ != 0) + goto out; + + check_stack(); + + out: + per_cpu(trace_active, cpu)--; + /* prevent recursion in schedule */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = stack_trace_call, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, count, ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + long *ptr = filp->private_data; + unsigned long val, flags; + char buf[64]; + int ret; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + raw_local_irq_save(flags); + __raw_spin_lock(&max_stack_lock); + *ptr = val; + __raw_spin_unlock(&max_stack_lock); + raw_local_irq_restore(flags); + + return count; +} + +static struct file_operations stack_max_size_fops = { + .open = tracing_open_generic, + .read = stack_max_size_read, + .write = stack_max_size_write, +}; + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + long i = (long)m->private; + + (*pos)++; + + i++; + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) + return NULL; + + m->private = (void *)i; + + return &m->private; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + void *t = &m->private; + loff_t l = 0; + + local_irq_disable(); + __raw_spin_lock(&max_stack_lock); + + for (; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + __raw_spin_unlock(&max_stack_lock); + local_irq_enable(); +} + +static int trace_lookup_stack(struct seq_file *m, long i) +{ + unsigned long addr = stack_dump_trace[i]; +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + + sprint_symbol(str, addr); + + return seq_printf(m, "%s\n", str); +#else + return seq_printf(m, "%p\n", (void*)addr); +#endif +} + +static int t_show(struct seq_file *m, void *v) +{ + long i = *(long *)v; + int size; + + if (i < 0) { + seq_printf(m, " Depth Size Location" + " (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries); + return 0; + } + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) + return 0; + + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + + trace_lookup_stack(m, i); + + return 0; +} + +static struct seq_operations stack_trace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &stack_trace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = (void *)-1; + } + + return ret; +} + +static struct file_operations stack_trace_fops = { + .open = stack_trace_open, + .read = seq_read, + .llseek = seq_lseek, +}; + +static __init int stack_trace_init(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); + if (!entry) + pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + + entry = debugfs_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); + if (!entry) + pr_warning("Could not create debugfs 'stack_trace' entry\n"); + + register_ftrace_function(&trace_ops); + + return 0; +} + +device_initcall(stack_trace_init); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index db58fb6..9587d3b 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -241,7 +241,7 @@ static void stack_reset(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - tracing_reset(tr->data[cpu]); + tracing_reset(tr, cpu); } static void start_stack_trace(struct trace_array *tr) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c new file mode 100644 index 0000000..f2b7c28 --- /dev/null +++ b/kernel/tracepoint.c @@ -0,0 +1,477 @@ +/* + * Copyright (C) 2008 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/tracepoint.h> +#include <linux/err.h> +#include <linux/slab.h> + +extern struct tracepoint __start___tracepoints[]; +extern struct tracepoint __stop___tracepoints[]; + +/* Set to 1 to enable tracepoint debug output */ +static const int tracepoint_debug; + +/* + * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the + * builtin and module tracepoints and the hash table. + */ +static DEFINE_MUTEX(tracepoints_mutex); + +/* + * Tracepoint hash table, containing the active tracepoints. + * Protected by tracepoints_mutex. + */ +#define TRACEPOINT_HASH_BITS 6 +#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) + +/* + * Note about RCU : + * It is used to to delay the free of multiple probes array until a quiescent + * state is reached. + * Tracepoint entries modifications are protected by the tracepoints_mutex. + */ +struct tracepoint_entry { + struct hlist_node hlist; + void **funcs; + int refcount; /* Number of times armed. 0 if disarmed. */ + struct rcu_head rcu; + void *oldptr; + unsigned char rcu_pending:1; + char name[0]; +}; + +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; + +static void free_old_closure(struct rcu_head *head) +{ + struct tracepoint_entry *entry = container_of(head, + struct tracepoint_entry, rcu); + kfree(entry->oldptr); + /* Make sure we free the data before setting the pending flag to 0 */ + smp_wmb(); + entry->rcu_pending = 0; +} + +static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +{ + if (!old) + return; + entry->oldptr = old; + entry->rcu_pending = 1; + /* write rcu_pending before calling the RCU callback */ + smp_wmb(); + call_rcu_sched(&entry->rcu, free_old_closure); +} + +static void debug_print_probes(struct tracepoint_entry *entry) +{ + int i; + + if (!tracepoint_debug) + return; + + for (i = 0; entry->funcs[i]; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); +} + +static void * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0; + void **old, **new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->funcs; + if (old) { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) + if (old[nr_probes] == probe) + return ERR_PTR(-EEXIST); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (old) + memcpy(new, old, nr_probes * sizeof(void *)); + new[nr_probes] = probe; + entry->refcount = nr_probes + 1; + entry->funcs = new; + debug_print_probes(entry); + return old; +} + +static void * +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +{ + int nr_probes = 0, nr_del = 0, i; + void **old, **new; + + old = entry->funcs; + + debug_print_probes(entry); + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old[nr_probes]; nr_probes++) { + if ((!probe || old[nr_probes] == probe)) + nr_del++; + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->funcs = NULL; + entry->refcount = 0; + debug_print_probes(entry); + return old; + } else { + int j = 0; + /* N -> M, (N > 1, M > 0) */ + /* + 1 for NULL */ + new = kzalloc((nr_probes - nr_del + 1) + * sizeof(void *), GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old[i]; i++) + if ((probe && old[i] != probe)) + new[j++] = old[i]; + entry->refcount = nr_probes - nr_del; + entry->funcs = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get tracepoint if the tracepoint is present in the tracepoint hash table. + * Must be called with tracepoints_mutex held. + * Returns NULL if not present. + */ +static struct tracepoint_entry *get_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + u32 hash = jhash(name, strlen(name), 0); + + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the tracepoint to the tracepoint hash table. Must be called with + * tracepoints_mutex held. + */ +static struct tracepoint_entry *add_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + size_t name_len = strlen(name) + 1; + u32 hash = jhash(name, name_len-1, 0); + + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + printk(KERN_NOTICE + "tracepoint %s busy\n", name); + return ERR_PTR(-EEXIST); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. + */ + e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(&e->name[0], name, name_len); + e->funcs = NULL; + e->refcount = 0; + e->rcu_pending = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the tracepoint from the tracepoint hash table. Must be called with + * mutex_lock held. + */ +static int remove_tracepoint(const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct tracepoint_entry *e; + int found = 0; + size_t len = strlen(name) + 1; + u32 hash = jhash(name, len-1, 0); + + head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->refcount) + return -EBUSY; + hlist_del(&e->hlist); + /* Make sure the call_rcu_sched has been executed */ + if (e->rcu_pending) + rcu_barrier_sched(); + kfree(e); + return 0; +} + +/* + * Sets the probe callback corresponding to one tracepoint. + */ +static void set_tracepoint(struct tracepoint_entry **entry, + struct tracepoint *elem, int active) +{ + WARN_ON(strcmp((*entry)->name, elem->name) != 0); + + /* + * rcu_assign_pointer has a smp_wmb() which makes sure that the new + * probe callbacks array is consistent before setting a pointer to it. + * This array is referenced by __DO_TRACE from + * include/linux/tracepoints.h. A matching smp_read_barrier_depends() + * is used. + */ + rcu_assign_pointer(elem->funcs, (*entry)->funcs); + elem->state = active; +} + +/* + * Disable a tracepoint and its probe callback. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by preempt_disable around the call site. + */ +static void disable_tracepoint(struct tracepoint *elem) +{ + elem->state = 0; +} + +/** + * tracepoint_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of tracepoints. + */ +void tracepoint_update_probe_range(struct tracepoint *begin, + struct tracepoint *end) +{ + struct tracepoint *iter; + struct tracepoint_entry *mark_entry; + + mutex_lock(&tracepoints_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_tracepoint(iter->name); + if (mark_entry) { + set_tracepoint(&mark_entry, iter, + !!mark_entry->refcount); + } else { + disable_tracepoint(iter); + } + } + mutex_unlock(&tracepoints_mutex); +} + +/* + * Update probes, removing the faulty probes. + */ +static void tracepoint_update_probes(void) +{ + /* Core kernel tracepoints */ + tracepoint_update_probe_range(__start___tracepoints, + __stop___tracepoints); + /* tracepoints in modules. */ + module_update_tracepoints(); +} + +/** + * tracepoint_probe_register - Connect a probe to a tracepoint + * @name: tracepoint name + * @probe: probe handler + * + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. + */ +int tracepoint_probe_register(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + int ret = 0; + void *old; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto end; + } + } + /* + * If we detect that a call_rcu_sched is pending for this tracepoint, + * make sure it's executed now. + */ + if (entry->rcu_pending) + rcu_barrier_sched(); + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + goto end; + } + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + WARN_ON(!entry); + if (entry->rcu_pending) + rcu_barrier_sched(); + tracepoint_entry_free_old(entry, old); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register); + +/** + * tracepoint_probe_unregister - Disconnect a probe from a tracepoint + * @name: tracepoint name + * @probe: probe function pointer + * + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int tracepoint_probe_unregister(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + int ret = -ENOENT; + + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); + old = tracepoint_entry_remove_probe(entry, probe); + mutex_unlock(&tracepoints_mutex); + tracepoint_update_probes(); /* may update entry */ + mutex_lock(&tracepoints_mutex); + entry = get_tracepoint(name); + if (!entry) + goto end; + if (entry->rcu_pending) + rcu_barrier_sched(); + tracepoint_entry_free_old(entry, old); + remove_tracepoint(name); /* Ignore busy error message */ + ret = 0; +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); + +/** + * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. + * @tracepoint: current tracepoints (in), next tracepoint (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next tracepoint has been found (1) or not (0). + * Will return the first tracepoint in the range if the input tracepoint is + * NULL. + */ +int tracepoint_get_iter_range(struct tracepoint **tracepoint, + struct tracepoint *begin, struct tracepoint *end) +{ + if (!*tracepoint && begin != end) { + *tracepoint = begin; + return 1; + } + if (*tracepoint >= begin && *tracepoint < end) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); + +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + if (!iter->module) { + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints, __stop___tracepoints); + if (found) + goto end; + } + /* tracepoints in modules. */ + found = module_get_iter_tracepoints(iter); +end: + if (!found) + tracepoint_iter_reset(iter); +} + +void tracepoint_iter_start(struct tracepoint_iter *iter) +{ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_start); + +void tracepoint_iter_next(struct tracepoint_iter *iter) +{ + iter->tracepoint++; + /* + * iter->tracepoint may be invalid because we blindly incremented it. + * Make sure it is valid by marshalling on the tracepoints, getting the + * tracepoints from following modules if necessary. + */ + tracepoint_get_iter(iter); +} +EXPORT_SYMBOL_GPL(tracepoint_iter_next); + +void tracepoint_iter_stop(struct tracepoint_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(tracepoint_iter_stop); + +void tracepoint_iter_reset(struct tracepoint_iter *iter) +{ + iter->module = NULL; + iter->tracepoint = NULL; +} +EXPORT_SYMBOL_GPL(tracepoint_iter_reset); |