diff options
Diffstat (limited to 'kernel')
149 files changed, 14722 insertions, 10237 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 057472f..0b5ff08 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,8 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o range.o -obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o + async.o range.o jump_label.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER @@ -23,6 +22,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_perf_event.o = -pg +CFLAGS_REMOVE_irq_work.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -70,14 +70,15 @@ obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o obj-$(CONFIG_SMP) += stop_machine.o obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o -obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o -obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ -obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o @@ -85,6 +86,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o @@ -99,8 +101,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/acct.c b/kernel/acct.c index 385b884..fa7eb3d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -122,7 +122,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(file->f_path.dentry, &sbuf)) + if (vfs_statfs(&file->f_path, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; diff --git a/kernel/async.c b/kernel/async.c index 15319d6..cd9dbb9 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -49,40 +49,33 @@ asynchronous and synchronous parts of the kernel. */ #include <linux/async.h> -#include <linux/bug.h> #include <linux/module.h> #include <linux/wait.h> #include <linux/sched.h> -#include <linux/init.h> -#include <linux/kthread.h> -#include <linux/delay.h> #include <linux/slab.h> +#include <linux/workqueue.h> #include <asm/atomic.h> static async_cookie_t next_cookie = 1; -#define MAX_THREADS 256 #define MAX_WORK 32768 static LIST_HEAD(async_pending); static LIST_HEAD(async_running); static DEFINE_SPINLOCK(async_lock); -static int async_enabled = 0; - struct async_entry { - struct list_head list; - async_cookie_t cookie; - async_func_ptr *func; - void *data; - struct list_head *running; + struct list_head list; + struct work_struct work; + async_cookie_t cookie; + async_func_ptr *func; + void *data; + struct list_head *running; }; static DECLARE_WAIT_QUEUE_HEAD(async_done); -static DECLARE_WAIT_QUEUE_HEAD(async_new); static atomic_t entry_count; -static atomic_t thread_count; extern int initcall_debug; @@ -117,27 +110,23 @@ static async_cookie_t lowest_in_progress(struct list_head *running) spin_unlock_irqrestore(&async_lock, flags); return ret; } + /* * pick the first pending entry and run it */ -static void run_one_entry(void) +static void async_run_entry_fn(struct work_struct *work) { + struct async_entry *entry = + container_of(work, struct async_entry, work); unsigned long flags; - struct async_entry *entry; ktime_t calltime, delta, rettime; - /* 1) pick one task from the pending queue */ - + /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); - if (list_empty(&async_pending)) - goto out; - entry = list_first_entry(&async_pending, struct async_entry, list); - - /* 2) move it to the running queue */ list_move_tail(&entry->list, entry->running); spin_unlock_irqrestore(&async_lock, flags); - /* 3) run it (and print duration)*/ + /* 2) run (and print duration) */ if (initcall_debug && system_state == SYSTEM_BOOTING) { printk("calling %lli_%pF @ %i\n", (long long)entry->cookie, entry->func, task_pid_nr(current)); @@ -153,31 +142,25 @@ static void run_one_entry(void) (long long)ktime_to_ns(delta) >> 10); } - /* 4) remove it from the running queue */ + /* 3) remove self from the running queue */ spin_lock_irqsave(&async_lock, flags); list_del(&entry->list); - /* 5) free the entry */ + /* 4) free the entry */ kfree(entry); atomic_dec(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* 6) wake up any waiters. */ + /* 5) wake up any waiters */ wake_up(&async_done); - return; - -out: - spin_unlock_irqrestore(&async_lock, flags); } - static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running) { struct async_entry *entry; unsigned long flags; async_cookie_t newcookie; - /* allow irq-off callers */ entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC); @@ -186,7 +169,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l * If we're out of memory or if there's too much work * pending already, we execute synchronously. */ - if (!async_enabled || !entry || atomic_read(&entry_count) > MAX_WORK) { + if (!entry || atomic_read(&entry_count) > MAX_WORK) { kfree(entry); spin_lock_irqsave(&async_lock, flags); newcookie = next_cookie++; @@ -196,6 +179,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l ptr(data, newcookie); return newcookie; } + INIT_WORK(&entry->work, async_run_entry_fn); entry->func = ptr; entry->data = data; entry->running = running; @@ -205,7 +189,10 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l list_add_tail(&entry->list, &async_pending); atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - wake_up(&async_new); + + /* schedule for execution */ + queue_work(system_unbound_wq, &entry->work); + return newcookie; } @@ -312,87 +299,3 @@ void async_synchronize_cookie(async_cookie_t cookie) async_synchronize_cookie_domain(cookie, &async_running); } EXPORT_SYMBOL_GPL(async_synchronize_cookie); - - -static int async_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int ret = HZ; - set_current_state(TASK_INTERRUPTIBLE); - /* - * check the list head without lock.. false positives - * are dealt with inside run_one_entry() while holding - * the lock. - */ - rmb(); - if (!list_empty(&async_pending)) - run_one_entry(); - else - ret = schedule_timeout(HZ); - - if (ret == 0) { - /* - * we timed out, this means we as thread are redundant. - * we sign off and die, but we to avoid any races there - * is a last-straw check to see if work snuck in. - */ - atomic_dec(&thread_count); - wmb(); /* manager must see our departure first */ - if (list_empty(&async_pending)) - break; - /* - * woops work came in between us timing out and us - * signing off; we need to stay alive and keep working. - */ - atomic_inc(&thread_count); - } - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int async_manager_thread(void *unused) -{ - DECLARE_WAITQUEUE(wq, current); - add_wait_queue(&async_new, &wq); - - while (!kthread_should_stop()) { - int tc, ec; - - set_current_state(TASK_INTERRUPTIBLE); - - tc = atomic_read(&thread_count); - rmb(); - ec = atomic_read(&entry_count); - - while (tc < ec && tc < MAX_THREADS) { - if (IS_ERR(kthread_run(async_thread, NULL, "async/%i", - tc))) { - msleep(100); - continue; - } - atomic_inc(&thread_count); - tc++; - } - - schedule(); - } - remove_wait_queue(&async_new, &wq); - - return 0; -} - -static int __init async_init(void) -{ - async_enabled = - !IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr")); - - WARN_ON(!async_enabled); - return 0; -} - -core_initcall(async_init); diff --git a/kernel/audit.c b/kernel/audit.c index c71bd26..d960457 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,7 +56,6 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> -#include <linux/inotify.h> #include <linux/freezer.h> #include <linux/tty.h> @@ -407,7 +406,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/audit.h b/kernel/audit.h index 208687b..f7206db 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -103,21 +103,27 @@ extern struct mutex audit_filter_mutex; extern void audit_free_rule_rcu(struct rcu_head *); extern struct list_head audit_filter_list[]; +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); + /* audit watch functions */ -extern unsigned long audit_watch_inode(struct audit_watch *watch); -extern dev_t audit_watch_dev(struct audit_watch *watch); +#ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); extern void audit_get_watch(struct audit_watch *watch); extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); -extern int audit_add_watch(struct audit_krule *krule); -extern void audit_remove_watch(struct audit_watch *watch); -extern void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list); -extern void audit_inotify_unregister(struct list_head *in_list); +extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); +extern void audit_remove_watch_rule(struct audit_krule *krule); extern char *audit_watch_path(struct audit_watch *watch); -extern struct list_head *audit_watch_rules(struct audit_watch *watch); - -extern struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +#else +#define audit_put_watch(w) {} +#define audit_get_watch(w) {} +#define audit_to_watch(k, p, l, o) (-EINVAL) +#define audit_add_watch(k, l) (-EINVAL) +#define audit_remove_watch_rule(k) BUG() +#define audit_watch_path(w) "" +#define audit_watch_compare(w, i, d) 0 + +#endif /* CONFIG_AUDIT_WATCH */ #ifdef CONFIG_AUDIT_TREE extern struct audit_chunk *audit_tree_lookup(const struct inode *); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 46a57b5..7f18d3a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1,5 +1,5 @@ #include "audit.h" -#include <linux/inotify.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/kthread.h> @@ -22,7 +22,7 @@ struct audit_tree { struct audit_chunk { struct list_head hash; - struct inotify_watch watch; + struct fsnotify_mark mark; struct list_head trees; /* with root here */ int dead; int count; @@ -59,7 +59,7 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. @@ -68,7 +68,7 @@ static LIST_HEAD(prune_list); * that makes a difference. Some. */ -static struct inotify_handle *rtree_ih; +static struct fsnotify_group *audit_tree_group; static struct audit_tree *alloc_tree(const char *s) { @@ -111,29 +111,6 @@ const char *audit_tree_path(struct audit_tree *tree) return tree->pathname; } -static struct audit_chunk *alloc_chunk(int count) -{ - struct audit_chunk *chunk; - size_t size; - int i; - - size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); - chunk = kzalloc(size, GFP_KERNEL); - if (!chunk) - return NULL; - - INIT_LIST_HEAD(&chunk->hash); - INIT_LIST_HEAD(&chunk->trees); - chunk->count = count; - atomic_long_set(&chunk->refs, 1); - for (i = 0; i < count; i++) { - INIT_LIST_HEAD(&chunk->owners[i].list); - chunk->owners[i].index = i; - } - inotify_init_watch(&chunk->watch); - return chunk; -} - static void free_chunk(struct audit_chunk *chunk) { int i; @@ -157,6 +134,35 @@ static void __put_chunk(struct rcu_head *rcu) audit_put_chunk(chunk); } +static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +{ + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + call_rcu(&chunk->head, __put_chunk); +} + +static struct audit_chunk *alloc_chunk(int count) +{ + struct audit_chunk *chunk; + size_t size; + int i; + + size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); + chunk = kzalloc(size, GFP_KERNEL); + if (!chunk) + return NULL; + + INIT_LIST_HEAD(&chunk->hash); + INIT_LIST_HEAD(&chunk->trees); + chunk->count = count; + atomic_long_set(&chunk->refs, 1); + for (i = 0; i < count; i++) { + INIT_LIST_HEAD(&chunk->owners[i].list); + chunk->owners[i].index = i; + } + fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + return chunk; +} + enum {HASH_SIZE = 128}; static struct list_head chunk_hash_heads[HASH_SIZE]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); @@ -167,10 +173,15 @@ static inline struct list_head *chunk_hash(const struct inode *inode) return chunk_hash_heads + n % HASH_SIZE; } -/* hash_lock is held by caller */ +/* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - struct list_head *list = chunk_hash(chunk->watch.inode); + struct fsnotify_mark *entry = &chunk->mark; + struct list_head *list; + + if (!entry->i.inode) + return; + list = chunk_hash(entry->i.inode); list_add_rcu(&chunk->hash, list); } @@ -181,7 +192,8 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - if (p->watch.inode == inode) { + /* mark.inode may have gone NULL, but who cares? */ + if (p->mark.i.inode == inode) { atomic_long_inc(&p->refs); return p; } @@ -210,38 +222,19 @@ static struct audit_chunk *find_chunk(struct node *p) static void untag_chunk(struct node *p) { struct audit_chunk *chunk = find_chunk(p); + struct fsnotify_mark *entry = &chunk->mark; struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; - if (!pin_inotify_watch(&chunk->watch)) { - /* - * Filesystem is shutting down; all watches are getting - * evicted, just take it off the node list for this - * tree and let the eviction logics take care of the - * rest. - */ - owner = p->owner; - if (owner->root == chunk) { - list_del_init(&owner->same_root); - owner->root = NULL; - } - list_del_init(&p->list); - p->owner = NULL; - put_tree(owner); - return; - } + fsnotify_get_mark(entry); spin_unlock(&hash_lock); - /* - * pin_inotify_watch() succeeded, so the watch won't go away - * from under us. - */ - mutex_lock(&chunk->watch.inode->inotify_mutex); - if (chunk->dead) { - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_lock(&entry->lock); + if (chunk->dead || !entry->i.inode) { + spin_unlock(&entry->lock); goto out; } @@ -256,16 +249,17 @@ static void untag_chunk(struct node *p) list_del_init(&p->list); list_del_rcu(&chunk->hash); spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; } new = alloc_chunk(size); if (!new) goto Fallback; - if (inotify_clone_watch(&chunk->watch, &new->watch) < 0) { + fsnotify_duplicate_mark(&new->mark, entry); + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { free_chunk(new); goto Fallback; } @@ -298,9 +292,9 @@ static void untag_chunk(struct node *p) list_for_each_entry(owner, &new->trees, same_root) owner->root = new; spin_unlock(&hash_lock); - inotify_evict_watch(&chunk->watch); - mutex_unlock(&chunk->watch.inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); goto out; Fallback: @@ -314,31 +308,33 @@ Fallback: p->owner = NULL; put_tree(owner); spin_unlock(&hash_lock); - mutex_unlock(&chunk->watch.inode->inotify_mutex); + spin_unlock(&entry->lock); out: - unpin_inotify_watch(&chunk->watch); + fsnotify_put_mark(entry); spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) { + struct fsnotify_mark *entry; struct audit_chunk *chunk = alloc_chunk(1); if (!chunk) return -ENOMEM; - if (inotify_add_watch(rtree_ih, &chunk->watch, inode, IN_IGNORED | IN_DELETE_SELF) < 0) { + entry = &chunk->mark; + if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { free_chunk(chunk); return -ENOSPC; } - mutex_lock(&inode->inotify_mutex); + spin_lock(&entry->lock); spin_lock(&hash_lock); if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&chunk->watch); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); return 0; } chunk->owners[0].index = (1U << 31); @@ -351,30 +347,31 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) } insert_hash(chunk); spin_unlock(&hash_lock); - mutex_unlock(&inode->inotify_mutex); + spin_unlock(&entry->lock); return 0; } /* the first tagged inode becomes root of tree */ static int tag_chunk(struct inode *inode, struct audit_tree *tree) { - struct inotify_watch *watch; + struct fsnotify_mark *old_entry, *chunk_entry; struct audit_tree *owner; struct audit_chunk *chunk, *old; struct node *p; int n; - if (inotify_find_watch(rtree_ih, inode, &watch) < 0) + old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + if (!old_entry) return create_chunk(inode, tree); - old = container_of(watch, struct audit_chunk, watch); + old = container_of(old_entry, struct audit_chunk, mark); /* are we already there? */ spin_lock(&hash_lock); for (n = 0; n < old->count; n++) { if (old->owners[n].owner == tree) { spin_unlock(&hash_lock); - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return 0; } } @@ -382,25 +379,44 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) chunk = alloc_chunk(old->count + 1); if (!chunk) { - put_inotify_watch(&old->watch); + fsnotify_put_mark(old_entry); return -ENOMEM; } - mutex_lock(&inode->inotify_mutex); - if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); + chunk_entry = &chunk->mark; + + spin_lock(&old_entry->lock); + if (!old_entry->i.inode) { + /* old_entry is being shot, lets just lie */ + spin_unlock(&old_entry->lock); + fsnotify_put_mark(old_entry); free_chunk(chunk); + return -ENOENT; + } + + fsnotify_duplicate_mark(chunk_entry, old_entry); + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + spin_unlock(&old_entry->lock); + free_chunk(chunk); + fsnotify_put_mark(old_entry); return -ENOSPC; } + + /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ + spin_lock(&chunk_entry->lock); spin_lock(&hash_lock); + + /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ if (tree->goner) { spin_unlock(&hash_lock); chunk->dead = 1; - inotify_evict_watch(&chunk->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); - put_inotify_watch(&chunk->watch); + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + + fsnotify_destroy_mark(chunk_entry); + + fsnotify_put_mark(chunk_entry); + fsnotify_put_mark(old_entry); return 0; } list_replace_init(&old->trees, &chunk->trees); @@ -426,10 +442,11 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) list_add(&tree->same_root, &chunk->trees); } spin_unlock(&hash_lock); - inotify_evict_watch(&old->watch); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(&old->watch); /* pair to inotify_find_watch */ - put_inotify_watch(&old->watch); /* and kill it */ + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(old_entry); /* and kill it */ return 0; } @@ -584,7 +601,9 @@ void audit_trim_trees(void) spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { - struct inode *inode = find_chunk(node)->watch.inode; + struct audit_chunk *chunk = find_chunk(node); + /* this could be NULL if the watch is dieing else where... */ + struct inode *inode = chunk->mark.i.inode; node->index |= 1U<<31; if (iterate_mounts(compare_root, inode, root_mnt)) node->index &= ~(1U<<31); @@ -846,7 +865,6 @@ void audit_kill_trees(struct list_head *list) * Here comes the stuff asynchronous to auditctl operations */ -/* inode->inotify_mutex is locked */ static void evict_chunk(struct audit_chunk *chunk) { struct audit_tree *owner; @@ -885,35 +903,46 @@ static void evict_chunk(struct audit_chunk *chunk) mutex_unlock(&audit_filter_mutex); } -static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static int audit_tree_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmonut_mark, + struct fsnotify_event *event) +{ + BUG(); + return -EOPNOTSUPP; +} + +static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); - if (mask & IN_IGNORED) { - evict_chunk(chunk); - put_inotify_watch(watch); - } + evict_chunk(chunk); + fsnotify_put_mark(entry); } -static void destroy_watch(struct inotify_watch *watch) +static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) { - struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - call_rcu(&chunk->head, __put_chunk); + return false; } -static const struct inotify_operations rtree_inotify_ops = { - .handle_event = handle_event, - .destroy_watch = destroy_watch, +static const struct fsnotify_ops audit_tree_ops = { + .handle_event = audit_tree_handle_event, + .should_send_event = audit_tree_send_event, + .free_group_priv = NULL, + .free_event_priv = NULL, + .freeing_mark = audit_tree_freeing_mark, }; static int __init audit_tree_init(void) { int i; - rtree_ih = inotify_init(&rtree_inotify_ops); - if (IS_ERR(rtree_ih)) - audit_panic("cannot initialize inotify handle for rectree watches"); + audit_tree_group = fsnotify_alloc_group(&audit_tree_ops); + if (IS_ERR(audit_tree_group)) + audit_panic("cannot initialize fsnotify group for rectree watches"); for (i = 0; i < HASH_SIZE; i++) INIT_LIST_HEAD(&chunk_hash_heads[i]); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 8df4369..f0c9b2e 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -24,18 +24,18 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/fs.h> +#include <linux/fsnotify_backend.h> #include <linux/namei.h> #include <linux/netlink.h> #include <linux/sched.h> #include <linux/slab.h> -#include <linux/inotify.h> #include <linux/security.h> #include "audit.h" /* * Reference counting: * - * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED * event. Each audit_watch holds a reference to its associated parent. * * audit_watch: if added to lists, lifetime is from audit_init_watch() to @@ -51,40 +51,61 @@ struct audit_watch { unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ - struct list_head rules; /* associated rules */ + struct list_head rules; /* anchor for krule->rlist */ }; struct audit_parent { - struct list_head ilist; /* entry in inotify registration list */ - struct list_head watches; /* associated watches */ - struct inotify_watch wdata; /* inotify watch data */ - unsigned flags; /* status flags */ + struct list_head watches; /* anchor for audit_watch->wlist */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ }; -/* Inotify handle. */ -struct inotify_handle *audit_ih; +/* fsnotify handle. */ +struct fsnotify_group *audit_watch_group; -/* - * audit_parent status flags: - * - * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to - * a filesystem event to ensure we're adding audit watches to a valid parent. - * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot - * receive them while we have nameidata, but must be used for IN_MOVE_SELF which - * we can receive while holding nameidata. - */ -#define AUDIT_PARENT_INVALID 0x001 +/* fsnotify events we care about. */ +#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ + FS_MOVE_SELF | FS_EVENT_ON_CHILD) -/* Inotify events we care about. */ -#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF +static void audit_free_parent(struct audit_parent *parent) +{ + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} -static void audit_free_parent(struct inotify_watch *i_watch) +static void audit_watch_free_mark(struct fsnotify_mark *entry) { struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); - WARN_ON(!list_empty(&parent->watches)); - kfree(parent); + parent = container_of(entry, struct audit_parent, mark); + audit_free_parent(parent); +} + +static void audit_get_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_get_mark(&parent->mark); +} + +static void audit_put_parent(struct audit_parent *parent) +{ + if (likely(parent)) + fsnotify_put_mark(&parent->mark); +} + +/* + * Find and return the audit_parent on the given inode. If found a reference + * is taken on this parent. + */ +static inline struct audit_parent *audit_find_parent(struct inode *inode) +{ + struct audit_parent *parent = NULL; + struct fsnotify_mark *entry; + + entry = fsnotify_find_inode_mark(audit_watch_group, inode); + if (entry) + parent = container_of(entry, struct audit_parent, mark); + + return parent; } void audit_get_watch(struct audit_watch *watch) @@ -105,7 +126,7 @@ void audit_put_watch(struct audit_watch *watch) void audit_remove_watch(struct audit_watch *watch) { list_del(&watch->wlist); - put_inotify_watch(&watch->parent->wdata); + audit_put_parent(watch->parent); watch->parent = NULL; audit_put_watch(watch); /* match initial get */ } @@ -115,42 +136,32 @@ char *audit_watch_path(struct audit_watch *watch) return watch->path; } -struct list_head *audit_watch_rules(struct audit_watch *watch) -{ - return &watch->rules; -} - -unsigned long audit_watch_inode(struct audit_watch *watch) +int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) { - return watch->ino; -} - -dev_t audit_watch_dev(struct audit_watch *watch) -{ - return watch->dev; + return (watch->ino != (unsigned long)-1) && + (watch->ino == ino) && + (watch->dev == dev); } /* Initialize a parent watch entry. */ static struct audit_parent *audit_init_parent(struct nameidata *ndp) { + struct inode *inode = ndp->path.dentry->d_inode; struct audit_parent *parent; - s32 wd; + int ret; parent = kzalloc(sizeof(*parent), GFP_KERNEL); if (unlikely(!parent)) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&parent->watches); - parent->flags = 0; - - inotify_init_watch(&parent->wdata); - /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ - get_inotify_watch(&parent->wdata); - wd = inotify_add_watch(audit_ih, &parent->wdata, - ndp->path.dentry->d_inode, AUDIT_IN_WATCH); - if (wd < 0) { - audit_free_parent(&parent->wdata); - return ERR_PTR(wd); + + fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + parent->mark.mask = AUDIT_FS_WATCH; + ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + if (ret < 0) { + audit_free_parent(parent); + return ERR_PTR(ret); } return parent; @@ -179,7 +190,7 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; - if (!audit_ih) + if (!audit_watch_group) return -EOPNOTSUPP; if (path[0] != '/' || path[len-1] == '/' || @@ -217,7 +228,7 @@ static struct audit_watch *audit_dupe_watch(struct audit_watch *old) new->dev = old->dev; new->ino = old->ino; - get_inotify_watch(&old->parent->wdata); + audit_get_parent(old->parent); new->parent = old->parent; out: @@ -251,15 +262,19 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_entry *oentry, *nentry; mutex_lock(&audit_filter_mutex); + /* Run all of the watches on this parent looking for the one that + * matches the given dname */ list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { if (audit_compare_dname_path(dname, owatch->path, NULL)) continue; /* If the update involves invalidating rules, do the inode-based * filtering now, so we don't omit records. */ - if (invalidating && current->audit_context) + if (invalidating && !audit_dummy_context()) audit_filter_inodes(current, current->audit_context); + /* updating ino will likely change which audit_hash_list we + * are on so we need a new watch for the new list */ nwatch = audit_dupe_watch(owatch); if (IS_ERR(nwatch)) { mutex_unlock(&audit_filter_mutex); @@ -275,12 +290,21 @@ static void audit_update_watch(struct audit_parent *parent, list_del(&oentry->rule.rlist); list_del_rcu(&oentry->list); - nentry = audit_dupe_rule(&oentry->rule, nwatch); + nentry = audit_dupe_rule(&oentry->rule); if (IS_ERR(nentry)) { list_del(&oentry->rule.list); audit_panic("error updating watch, removing"); } else { int h = audit_hash_ino((u32)ino); + + /* + * nentry->rule.watch == oentry->rule.watch so + * we must drop that reference and set it to our + * new watch. + */ + audit_put_watch(nentry->rule.watch); + audit_get_watch(nwatch); + nentry->rule.watch = nwatch; list_add(&nentry->rule.rlist, &nwatch->rules); list_add_rcu(&nentry->list, &audit_inode_hash[h]); list_replace(&oentry->rule.list, @@ -312,7 +336,6 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_entry *e; mutex_lock(&audit_filter_mutex); - parent->flags |= AUDIT_PARENT_INVALID; list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { list_for_each_entry_safe(r, nextr, &w->rules, rlist) { e = container_of(r, struct audit_entry, rule); @@ -325,20 +348,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent) audit_remove_watch(w); } mutex_unlock(&audit_filter_mutex); -} - -/* Unregister inotify watches for parents on in_list. - * Generates an IN_IGNORED event. */ -void audit_inotify_unregister(struct list_head *in_list) -{ - struct audit_parent *p, *n; - list_for_each_entry_safe(p, n, in_list, ilist) { - list_del(&p->ilist); - inotify_rm_watch(audit_ih, &p->wdata); - /* the unpin matching the pin in audit_do_del_rule() */ - unpin_inotify_watch(&p->wdata); - } + fsnotify_destroy_mark(&parent->mark); } /* Get path information necessary for adding watches. */ @@ -389,7 +400,7 @@ static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) } } -/* Associate the given rule with an existing parent inotify_watch. +/* Associate the given rule with an existing parent. * Caller must hold audit_filter_mutex. */ static void audit_add_to_parent(struct audit_krule *krule, struct audit_parent *parent) @@ -397,6 +408,8 @@ static void audit_add_to_parent(struct audit_krule *krule, struct audit_watch *w, *watch = krule->watch; int watch_found = 0; + BUG_ON(!mutex_is_locked(&audit_filter_mutex)); + list_for_each_entry(w, &parent->watches, wlist) { if (strcmp(watch->path, w->path)) continue; @@ -413,7 +426,7 @@ static void audit_add_to_parent(struct audit_krule *krule, } if (!watch_found) { - get_inotify_watch(&parent->wdata); + audit_get_parent(parent); watch->parent = parent; list_add(&watch->wlist, &parent->watches); @@ -423,13 +436,12 @@ static void audit_add_to_parent(struct audit_krule *krule, /* Find a matching watch entry, or add this one. * Caller must hold audit_filter_mutex. */ -int audit_add_watch(struct audit_krule *krule) +int audit_add_watch(struct audit_krule *krule, struct list_head **list) { struct audit_watch *watch = krule->watch; - struct inotify_watch *i_watch; struct audit_parent *parent; struct nameidata *ndp = NULL, *ndw = NULL; - int ret = 0; + int h, ret = 0; mutex_unlock(&audit_filter_mutex); @@ -441,47 +453,38 @@ int audit_add_watch(struct audit_krule *krule) goto error; } + mutex_lock(&audit_filter_mutex); + /* update watch filter fields */ if (ndw) { watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev; watch->ino = ndw->path.dentry->d_inode->i_ino; } - /* The audit_filter_mutex must not be held during inotify calls because - * we hold it during inotify event callback processing. If an existing - * inotify watch is found, inotify_find_watch() grabs a reference before - * returning. - */ - if (inotify_find_watch(audit_ih, ndp->path.dentry->d_inode, - &i_watch) < 0) { + /* either find an old parent or attach a new one */ + parent = audit_find_parent(ndp->path.dentry->d_inode); + if (!parent) { parent = audit_init_parent(ndp); if (IS_ERR(parent)) { - /* caller expects mutex locked */ - mutex_lock(&audit_filter_mutex); ret = PTR_ERR(parent); goto error; } - } else - parent = container_of(i_watch, struct audit_parent, wdata); - - mutex_lock(&audit_filter_mutex); + } - /* parent was moved before we took audit_filter_mutex */ - if (parent->flags & AUDIT_PARENT_INVALID) - ret = -ENOENT; - else - audit_add_to_parent(krule, parent); + audit_add_to_parent(krule, parent); - /* match get in audit_init_parent or inotify_find_watch */ - put_inotify_watch(&parent->wdata); + /* match get in audit_find_parent or audit_init_parent */ + audit_put_parent(parent); + h = audit_hash_ino((u32)watch->ino); + *list = &audit_inode_hash[h]; error: audit_put_nd(ndp, ndw); /* NULL args OK */ return ret; } -void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) +void audit_remove_watch_rule(struct audit_krule *krule) { struct audit_watch *watch = krule->watch; struct audit_parent *parent = watch->parent; @@ -492,53 +495,74 @@ void audit_remove_watch_rule(struct audit_krule *krule, struct list_head *list) audit_remove_watch(watch); if (list_empty(&parent->watches)) { - /* Put parent on the inotify un-registration - * list. Grab a reference before releasing - * audit_filter_mutex, to be released in - * audit_inotify_unregister(). - * If filesystem is going away, just leave - * the sucker alone, eviction will take - * care of it. */ - if (pin_inotify_watch(&parent->wdata)) - list_add(&parent->ilist, list); + audit_get_parent(parent); + fsnotify_destroy_mark(&parent->mark); + audit_put_parent(parent); } } } -/* Update watch data in audit rules based on inotify events. */ -static void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, - u32 cookie, const char *dname, struct inode *inode) +static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + return true; +} + +/* Update watch data in audit rules based on fsnotify events. */ +static int audit_watch_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) { + struct inode *inode; + __u32 mask = event->mask; + const char *dname = event->file_name; struct audit_parent *parent; - parent = container_of(i_watch, struct audit_parent, wdata); + parent = container_of(inode_mark, struct audit_parent, mark); - if (mask & (IN_CREATE|IN_MOVED_TO) && inode) - audit_update_watch(parent, dname, inode->i_sb->s_dev, - inode->i_ino, 0); - else if (mask & (IN_DELETE|IN_MOVED_FROM)) + BUG_ON(group != audit_watch_group); + + switch (event->data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = event->path.dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = event->inode; + break; + default: + BUG(); + inode = NULL; + break; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + else if (mask & (FS_DELETE|FS_MOVED_FROM)) audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); - /* inotify automatically removes the watch and sends IN_IGNORED */ - else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) - audit_remove_parent_watches(parent); - /* inotify does not remove the watch, so remove it manually */ - else if(mask & IN_MOVE_SELF) { + else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) audit_remove_parent_watches(parent); - inotify_remove_watch_locked(audit_ih, i_watch); - } else if (mask & IN_IGNORED) - put_inotify_watch(i_watch); + + return 0; } -static const struct inotify_operations audit_inotify_ops = { - .handle_event = audit_handle_ievent, - .destroy_watch = audit_free_parent, +static const struct fsnotify_ops audit_watch_fsnotify_ops = { + .should_send_event = audit_watch_should_send_event, + .handle_event = audit_watch_handle_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, }; static int __init audit_watch_init(void) { - audit_ih = inotify_init(&audit_inotify_ops); - if (IS_ERR(audit_ih)) - audit_panic("cannot initialize inotify handle"); + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + if (IS_ERR(audit_watch_group)) { + audit_watch_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } return 0; } -subsys_initcall(audit_watch_init); +device_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ce08041..eb76754 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -71,6 +71,7 @@ static inline void audit_free_rule(struct audit_entry *e) { int i; struct audit_krule *erule = &e->rule; + /* some rules don't have associated watches */ if (erule->watch) audit_put_watch(erule->watch); @@ -746,8 +747,7 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, * rule with the new rule in the filterlist, then free the old rule. * The rlist element is undefined; list manipulations are handled apart from * the initial copy. */ -struct audit_entry *audit_dupe_rule(struct audit_krule *old, - struct audit_watch *watch) +struct audit_entry *audit_dupe_rule(struct audit_krule *old) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -769,8 +769,8 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, new->prio = old->prio; new->buflen = old->buflen; new->inode_f = old->inode_f; - new->watch = NULL; new->field_count = old->field_count; + /* * note that we are OK with not refcounting here; audit_match_tree() * never dereferences tree and we can't get false positives there @@ -811,9 +811,9 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old, } } - if (watch) { - audit_get_watch(watch); - new->watch = watch; + if (old->watch) { + audit_get_watch(old->watch); + new->watch = old->watch; } return entry; @@ -866,7 +866,7 @@ static inline int audit_add_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - int h, err; + int err; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -889,15 +889,11 @@ static inline int audit_add_rule(struct audit_entry *entry) if (watch) { /* audit_filter_mutex is dropped and re-taken during this call */ - err = audit_add_watch(&entry->rule); + err = audit_add_watch(&entry->rule, &list); if (err) { mutex_unlock(&audit_filter_mutex); goto error; } - /* entry->rule.watch may have changed during audit_add_watch() */ - watch = entry->rule.watch; - h = audit_hash_ino((u32)audit_watch_inode(watch)); - list = &audit_inode_hash[h]; } if (tree) { err = audit_add_tree_rule(&entry->rule); @@ -949,7 +945,6 @@ static inline int audit_del_rule(struct audit_entry *entry) struct audit_watch *watch = entry->rule.watch; struct audit_tree *tree = entry->rule.tree; struct list_head *list; - LIST_HEAD(inotify_list); int ret = 0; #ifdef CONFIG_AUDITSYSCALL int dont_count = 0; @@ -969,7 +964,7 @@ static inline int audit_del_rule(struct audit_entry *entry) } if (e->rule.watch) - audit_remove_watch_rule(&e->rule, &inotify_list); + audit_remove_watch_rule(&e->rule); if (e->rule.tree) audit_remove_tree_rule(&e->rule); @@ -987,9 +982,6 @@ static inline int audit_del_rule(struct audit_entry *entry) #endif mutex_unlock(&audit_filter_mutex); - if (!list_empty(&inotify_list)) - audit_inotify_unregister(&inotify_list); - out: if (watch) audit_put_watch(watch); /* match initial get */ @@ -1323,30 +1315,23 @@ static int update_lsm_rule(struct audit_krule *r) { struct audit_entry *entry = container_of(r, struct audit_entry, rule); struct audit_entry *nentry; - struct audit_watch *watch; - struct audit_tree *tree; int err = 0; if (!security_audit_rule_known(r)) return 0; - watch = r->watch; - tree = r->tree; - nentry = audit_dupe_rule(r, watch); + nentry = audit_dupe_rule(r); if (IS_ERR(nentry)) { /* save the first error encountered for the * return value */ err = PTR_ERR(nentry); audit_panic("error updating LSM filters"); - if (watch) + if (r->watch) list_del(&r->rlist); list_del_rcu(&entry->list); list_del(&r->list); } else { - if (watch) { - list_add(&nentry->rule.rlist, audit_watch_rules(watch)); - list_del(&r->rlist); - } else if (tree) + if (r->watch || r->tree) list_replace_init(&r->rlist, &nentry->rule.rlist); list_replace_rcu(&entry->list, &nentry->list); list_replace(&r->list, &nentry->rule.list); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3828ad5..1b31c13 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -65,7 +65,6 @@ #include <linux/binfmts.h> #include <linux/highmem.h> #include <linux/syscalls.h> -#include <linux/inotify.h> #include <linux/capability.h> #include <linux/fs_struct.h> @@ -549,9 +548,8 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_WATCH: - if (name && audit_watch_inode(rule->watch) != (unsigned long)-1) - result = (name->dev == audit_watch_dev(rule->watch) && - name->ino == audit_watch_inode(rule->watch)); + if (name) + result = audit_watch_compare(rule->watch, name->ino, name->dev); break; case AUDIT_DIR: if (ctx) @@ -1726,7 +1724,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(list_empty(&inode->inotify_watches))) + if (likely(hlist_empty(&inode->i_fsnotify_marks))) return; context = current->audit_context; p = context->trees; @@ -1769,7 +1767,7 @@ retry: seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d->d_inode; - if (inode && unlikely(!list_empty(&inode->inotify_watches))) { + if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { @@ -1837,13 +1835,8 @@ void __audit_getname(const char *name) context->names[context->name_count].ino = (unsigned long)-1; context->names[context->name_count].osid = 0; ++context->name_count; - if (!context->pwd.dentry) { - read_lock(¤t->fs->lock); - context->pwd = current->fs->pwd; - path_get(¤t->fs->pwd); - read_unlock(¤t->fs->lock); - } - + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); } /* audit_putname - intercept a putname request diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3ac6f5b..291ba3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. */ @@ -1102,7 +1102,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->release_agent) return -EINVAL; opts->release_agent = - kstrndup(token + 14, PATH_MAX, GFP_KERNEL); + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; } else if (!strncmp(token, "name=", 5)) { @@ -1123,7 +1123,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->name) return -EINVAL; opts->name = kstrndup(name, - MAX_CGROUP_ROOT_NAMELEN, + MAX_CGROUP_ROOT_NAMELEN - 1, GFP_KERNEL); if (!opts->name) return -ENOMEM; @@ -1623,6 +1623,8 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct kobject *cgroup_kobj; + static inline struct cgroup *__d_cgrp(struct dentry *dentry) { return dentry->d_fsdata; @@ -1788,6 +1790,30 @@ out: return retval; } +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroupfs_root *root; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task @@ -3871,9 +3897,18 @@ int __init cgroup_init(void) hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); BUG_ON(!init_root_id(&rootnode)); + + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) { + err = -ENOMEM; + goto out; + } + err = register_filesystem(&cgroup_fs_type); - if (err < 0) + if (err < 0) { + kobject_put(cgroup_kobj); goto out; + } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); diff --git a/kernel/compat.c b/kernel/compat.c index 5adab05..c9e2ec0 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -279,11 +279,6 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; - int ret; - mm_segment_t old_fs = get_fs (); - - if (resource >= RLIM_NLIMITS) - return -EINVAL; if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || __get_user(r.rlim_cur, &rlim->rlim_cur) || @@ -294,10 +289,7 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, r.rlim_cur = RLIM_INFINITY; if (r.rlim_max == COMPAT_RLIM_INFINITY) r.rlim_max = RLIM_INFINITY; - set_fs(KERNEL_DS); - ret = sys_setrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); - return ret; + return do_prlimit(current, resource, &r, NULL); } #ifdef COMPAT_RLIM_OLD_INFINITY @@ -329,16 +321,13 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit (unsigned int resource, +asmlinkage long compat_sys_getrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { struct rlimit r; int ret; - mm_segment_t old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_getrlimit(resource, (struct rlimit __user *) &r); - set_fs(old_fs); + ret = do_prlimit(current, resource, NULL, &r); if (!ret) { if (r.rlim_cur > COMPAT_RLIM_INFINITY) r.rlim_cur = COMPAT_RLIM_INFINITY; @@ -1137,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } + +/* + * Allocate user-space memory for the duration of a single system call, + * in order to marshall parameters inside a compat thunk. + */ +void __user *compat_alloc_user_space(unsigned long len) +{ + void __user *ptr; + + /* If len would occupy more than half of the entire compat space... */ + if (unlikely(len > (((compat_uptr_t)~0) >> 1))) + return NULL; + + ptr = arch_compat_alloc_user_space(len); + + if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) + return NULL; + + return ptr; +} +EXPORT_SYMBOL_GPL(compat_alloc_user_space); diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b42..f6e726f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. */ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611..51b143e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -105,7 +105,7 @@ struct cpuset { /* for custom sched domain */ int relax_domain_level; - /* used for walking a cpuset heirarchy */ + /* used for walking a cpuset hierarchy */ struct list_head stack_list; }; @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk); if (ret) return ret; if (threadgroup) { @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); + ret = security_task_setscheduler(c); if (ret) { rcu_read_unlock(); return ret; @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/cred.c b/kernel/cred.c index a2d5504..9a3e226 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -22,10 +22,6 @@ #define kdebug(FMT, ...) \ printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) #else -static inline __attribute__((format(printf, 1, 2))) -void no_printk(const char *fmt, ...) -{ -} #define kdebug(FMT, ...) \ no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) #endif @@ -209,6 +205,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cb7cd1..de407c7 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -605,13 +605,15 @@ cpu_master_loop: if (dbg_kdb_mode) { kgdb_connected = 1; error = kdb_stub(ks); + if (error == -1) + continue; + kgdb_connected = 0; } else { error = gdb_serial_stub(ks); } if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; - kgdb_connected = 0; } else if (error == DBG_SWITCH_CPU_EVENT) { dbg_cpu_switch(cpu, dbg_switch_cpu); goto cpu_loop; @@ -739,7 +741,7 @@ static struct console kgdbcons = { }; #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_dbg(int key, struct tty_struct *tty) +static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b17b32..481a7bd 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -6,7 +6,7 @@ * Copyright (C) 2000-2001 VERITAS Software Corporation. * Copyright (C) 2002-2004 Timesys Corporation * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com> - * Copyright (C) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org> * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. * Copyright (C) 2005-2009 Wind River Systems, Inc. @@ -52,17 +52,6 @@ static unsigned long gdb_regs[(NUMREGBYTES + * GDB remote protocol parser: */ -static int hex(char ch) -{ - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - #ifdef CONFIG_KGDB_KDB static int gdbstub_read_wait(void) { @@ -123,8 +112,8 @@ static void get_packet(char *buffer) buffer[count] = 0; if (ch == '#') { - xmitcsum = hex(gdbstub_read_wait()) << 4; - xmitcsum += hex(gdbstub_read_wait()); + xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; + xmitcsum += hex_to_bin(gdbstub_read_wait()); if (checksum != xmitcsum) /* failed checksum */ @@ -236,7 +225,7 @@ void gdbstub_msg_write(const char *s, int len) * buf. Return a pointer to the last char put in buf (null). May * return an error. */ -int kgdb_mem2hex(char *mem, char *buf, int count) +char *kgdb_mem2hex(char *mem, char *buf, int count) { char *tmp; int err; @@ -248,17 +237,16 @@ int kgdb_mem2hex(char *mem, char *buf, int count) tmp = buf + count; err = probe_kernel_read(tmp, mem, count); - if (!err) { - while (count > 0) { - buf = pack_hex_byte(buf, *tmp); - tmp++; - count--; - } - - *buf = 0; + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; } + *buf = 0; - return err; + return buf; } /* @@ -280,8 +268,8 @@ int kgdb_hex2mem(char *buf, char *mem, int count) tmp_hex = tmp_raw - 1; while (tmp_hex >= buf) { tmp_raw--; - *tmp_raw = hex(*tmp_hex--); - *tmp_raw |= hex(*tmp_hex--) << 4; + *tmp_raw = hex_to_bin(*tmp_hex--); + *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; } return probe_kernel_write(mem, tmp_raw, count); @@ -304,7 +292,7 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } while (**ptr) { - hex_val = hex(**ptr); + hex_val = hex_to_bin(**ptr); if (hex_val < 0) break; @@ -339,6 +327,32 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count) return probe_kernel_write(mem, c, size); } +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Write memory due to an 'M' or 'X' packet. */ static int write_mem_msg(int binary) { @@ -378,28 +392,31 @@ static void error_packet(char *pkt, int error) * remapped to negative TIDs. */ -#define BUF_THREAD_ID_SIZE 16 +#define BUF_THREAD_ID_SIZE 8 static char *pack_threadid(char *pkt, unsigned char *id) { - char *limit; + unsigned char *limit; + int lzero = 1; + + limit = id + (BUF_THREAD_ID_SIZE / 2); + while (id < limit) { + if (!lzero || *id != 0) { + pkt = pack_hex_byte(pkt, *id); + lzero = 0; + } + id++; + } - limit = pkt + BUF_THREAD_ID_SIZE; - while (pkt < limit) - pkt = pack_hex_byte(pkt, *id++); + if (lzero) + pkt = pack_hex_byte(pkt, 0); return pkt; } static void int_to_threadref(unsigned char *id, int value) { - unsigned char *scan; - int i = 4; - - scan = (unsigned char *)id; - while (i--) - *scan++ = 0; - put_unaligned_be32(value, scan); + put_unaligned_be32(value, id); } static struct task_struct *getthread(struct pt_regs *regs, int tid) @@ -463,8 +480,7 @@ static void gdb_cmd_status(struct kgdb_state *ks) pack_hex_byte(&remcom_out_buffer[1], ks->signo); } -/* Handle the 'g' get registers request */ -static void gdb_cmd_getregs(struct kgdb_state *ks) +static void gdb_get_regs_helper(struct kgdb_state *ks) { struct task_struct *thread; void *local_debuggerinfo; @@ -505,6 +521,12 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) */ sleeping_thread_to_gdb_regs(gdb_regs, thread); } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); } @@ -527,13 +549,13 @@ static void gdb_cmd_memread(struct kgdb_state *ks) char *ptr = &remcom_in_buffer[1]; unsigned long length; unsigned long addr; - int err; + char *err; if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && kgdb_hex2long(&ptr, &length) > 0) { err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); - if (err) - error_packet(remcom_out_buffer, err); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); } else { error_packet(remcom_out_buffer, -EINVAL); } @@ -550,6 +572,60 @@ static void gdb_cmd_memwrite(struct kgdb_state *ks) strcpy(remcom_out_buffer, "OK"); } +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ +static void gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + int i = 0; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + memset(gdb_regs, 0, sizeof(gdb_regs)); + while (i < sizeof(gdb_regs) * 2) + if (hex_to_bin(ptr[i]) >= 0) + i++; + else + break; + i = i / 2; + kgdb_hex2mem(ptr, (char *)gdb_regs, i); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + /* Handle the 'X' memory binary write bytes */ static void gdb_cmd_binwrite(struct kgdb_state *ks) { @@ -612,7 +688,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) { struct task_struct *g; struct task_struct *p; - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; int i; int cpu; @@ -621,10 +697,8 @@ static void gdb_cmd_query(struct kgdb_state *ks) switch (remcom_in_buffer[1]) { case 's': case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) break; - } i = 0; remcom_out_buffer[0] = 'm'; @@ -634,8 +708,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) for_each_online_cpu(cpu) { ks->thr_query = 0; int_to_threadref(thref, -cpu - 2); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; i++; } @@ -644,8 +717,7 @@ static void gdb_cmd_query(struct kgdb_state *ks) do_each_thread(g, p) { if (i >= ks->thr_query && !finished) { int_to_threadref(thref, p->pid); - pack_threadid(ptr, thref); - ptr += BUF_THREAD_ID_SIZE; + ptr = pack_threadid(ptr, thref); *(ptr++) = ','; ks->thr_query++; if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) @@ -665,10 +737,9 @@ static void gdb_cmd_query(struct kgdb_state *ks) pack_threadid(remcom_out_buffer + 2, thref); break; case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) break; - } + ks->threadid = 0; ptr = remcom_in_buffer + 17; kgdb_hex2long(&ptr, &ks->threadid); @@ -861,11 +932,14 @@ int gdb_serial_stub(struct kgdb_state *ks) int error = 0; int tmp; - /* Clear the out buffer. */ + /* Initialize comm buffer and globals. */ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; if (kgdb_connected) { - unsigned char thref[8]; + unsigned char thref[BUF_THREAD_ID_SIZE]; char *ptr; /* Reply to host that an exception has occurred */ @@ -879,10 +953,6 @@ int gdb_serial_stub(struct kgdb_state *ks) put_packet(remcom_out_buffer); } - kgdb_usethread = kgdb_info[ks->cpu].task; - ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); - ks->pass_exception = 0; - while (1) { error = 0; @@ -907,6 +977,14 @@ int gdb_serial_stub(struct kgdb_state *ks) case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_memwrite(ks); break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ gdb_cmd_binwrite(ks); break; diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3..20059ef 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) int i, bpno; kdb_bp_t *bp, *bp_check; int diag; - int free; char *symname = NULL; long offset = 0ul; int nextarg; @@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) /* * Find an empty bp structure to allocate */ - free = KDB_MAXBPT; for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { if (bp->bp_free) break; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 184cd82..caf057a 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -312,7 +312,7 @@ int kdbgetularg(const char *arg, unsigned long *value) if (endp == arg) { /* - * Try base 16, for us folks too lazy to type the + * Also try base 16, for us folks too lazy to type the * leading 0x... */ val = simple_strtoul(arg, &endp, 16); @@ -325,6 +325,25 @@ int kdbgetularg(const char *arg, unsigned long *value) return 0; } +int kdbgetu64arg(const char *arg, u64 *value) +{ + char *endp; + u64 val; + + val = simple_strtoull(arg, &endp, 0); + + if (endp == arg) { + + val = simple_strtoull(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + /* * kdb_set - This function implements the 'set' command. Alter an * existing environment variable or create a new one. @@ -1770,11 +1789,65 @@ static int kdb_go(int argc, const char **argv) */ static int kdb_rd(int argc, const char **argv) { - int diag = kdb_check_regs(); - if (diag) - return diag; + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; kdb_dumpregs(kdb_current_regs); +#endif return 0; } @@ -1782,32 +1855,67 @@ static int kdb_rd(int argc, const char **argv) * kdb_rm - This function implements the 'rm' (register modify) command. * rm register-name new-contents * Remarks: - * Currently doesn't allow modification of control or - * debug registers. + * Allows register modification with the same restrictions as gdb */ static int kdb_rm(int argc, const char **argv) { +#if DBG_MAX_REG_NUM > 0 int diag; - int ind = 0; - unsigned long contents; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; if (argc != 2) return KDB_ARGCOUNT; /* * Allow presence or absence of leading '%' symbol. */ - if (argv[1][0] == '%') - ind = 1; + rname = argv[1]; + if (*rname == '%') + rname++; - diag = kdbgetularg(argv[2], &contents); + diag = kdbgetu64arg(argv[2], ®64); if (diag) return diag; diag = kdb_check_regs(); if (diag) return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else kdb_printf("ERROR: Register set currently not implemented\n"); - return 0; + return 0; +#endif } #if defined(CONFIG_MAGIC_SYSRQ) @@ -1820,9 +1928,8 @@ static int kdb_sr(int argc, const char **argv) { if (argc != 1) return KDB_ARGCOUNT; - sysrq_toggle_support(1); kdb_trap_printk++; - handle_sysrq(*argv[1], NULL); + __handle_sysrq(*argv[1], false); kdb_trap_printk--; return 0; @@ -1883,6 +1990,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); #ifdef CONFIG_MODULE_UNLOAD { @@ -2291,6 +2399,9 @@ static int kdb_ll(int argc, const char **argv) while (va) { char buf[80]; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); diag = kdb_parse(buf); if (diag) @@ -2437,6 +2548,7 @@ static void kdb_sysinfo(struct sysinfo *val) */ static int kdb_summary(int argc, const char **argv) { + struct timespec now; struct kdb_tm tm; struct sysinfo val; @@ -2451,7 +2563,8 @@ static int kdb_summary(int argc, const char **argv) kdb_printf("domainname %s\n", init_uts_ns.name.domainname); kdb_printf("ccversion %s\n", __stringify(CCVERSION)); - kdb_gmtime(&xtime, &tm); + now = __current_kernel_time(); + kdb_gmtime(&now, &tm); kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " "tz_minuteswest %d\n", 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 97d3ba6..be775f7 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -144,9 +144,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t); extern int kdb_putword(unsigned long, unsigned long, size_t); extern int kdbgetularg(const char *, unsigned long *); -extern int kdb_set(int, const char **); extern char *kdbgetenv(const char *); -extern int kdbgetintenv(const char *, int *); extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, long *, char **); extern int kdbgetsymval(const char *, kdb_symtab_t *); @@ -257,7 +255,14 @@ extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); extern void kdb_meminfo_proc_show(void); +#ifdef CONFIG_KALLSYMS extern const char *kdb_walk_kallsyms(loff_t *pos); +#else /* ! CONFIG_KALLSYMS */ +static inline const char *kdb_walk_kallsyms(loff_t *pos) +{ + return NULL; +} +#endif /* ! CONFIG_KALLSYMS */ extern char *kdb_getstr(char *, size_t, char *); /* Defines for kdb_symbol_print */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 45344d5..6b2485d 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -82,8 +82,8 @@ static char *kdb_name_table[100]; /* arbitrary size */ int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) { int ret = 0; - unsigned long symbolsize; - unsigned long offset; + unsigned long symbolsize = 0; + unsigned long offset = 0; #define knt1_size 128 /* must be >= kallsyms table size */ char *knt1 = NULL; diff --git a/kernel/early_res.c b/kernel/early_res.c deleted file mode 100644 index 31aa933..0000000 --- a/kernel/early_res.c +++ /dev/null @@ -1,584 +0,0 @@ -/* - * early_res, could be used to replace bootmem - */ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/mm.h> -#include <linux/early_res.h> - -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_fw_memmap_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -static void __init drop_range_partial(int i, u64 start, u64 end) -{ - u64 common_start, common_end; - u64 old_start, old_end; - - old_start = early_res[i].start; - old_end = early_res[i].end; - common_start = max(old_start, start); - common_end = min(old_end, end); - - /* no overlap ? */ - if (common_start >= common_end) - return; - - if (old_start < common_start) { - /* make head segment */ - early_res[i].end = common_start; - if (old_end > common_end) { - char name[15]; - - /* - * Save a local copy of the name, since the - * early_res array could get resized inside - * reserve_early_without_check() -> - * __check_and_double_early_res(), which would - * make the current name pointer invalid. - */ - strncpy(name, early_res[i].name, - sizeof(early_res[i].name) - 1); - /* add another for left over on tail */ - reserve_early_without_check(common_end, old_end, name); - } - return; - } else { - if (old_end > common_end) { - /* reuse the entry for tail left */ - early_res[i].start = common_end; - return; - } - /* all covered */ - drop_range(i); - } -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name ? name : "", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) -{ - u64 start, end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - mem = -1ULL; - size = sizeof(struct early_res) * max_early_res * 2; - if (early_res == early_res_x) - start = 0; - else - start = early_res[0].end; - end = ex_start; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - if (mem == -1ULL) { - start = ex_end; - end = get_max_mapped(); - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - } - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -void __init free_early_partial(u64 start, u64 end) -{ - struct early_res *r; - int i; - - if (start == end) - return; - - if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) - return; - -try_next: - i = find_overlapped_early(start, end); - if (i >= max_early_res) - return; - - r = &early_res[i]; - /* hole ? */ - if (r->end >= end && r->start <= start) { - drop_range_partial(i, start, end); - return; - } - - drop_range_partial(i, start, end); - goto try_next; -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#define DEBUG_PRINT_EARLY_RES 1 - -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; - end = get_max_mapped(); -#ifdef MAX_DMA32_PFN - if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -#ifdef CONFIG_X86_32 - subtract_range(range, count, max_low_pfn, -1ULL); -#endif - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? */ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - goto out; - - return addr; - -out: - return -1ULL; -} diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index dd62f8e..0dbeae3 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -134,23 +134,14 @@ unregister: return 0; } -int -__set_personality(unsigned int personality) +int __set_personality(unsigned int personality) { - struct exec_domain *ep, *oep; - - ep = lookup_exec_domain(personality); - if (ep == current_thread_info()->exec_domain) { - current->personality = personality; - module_put(ep->module); - return 0; - } + struct exec_domain *oep = current_thread_info()->exec_domain; + current_thread_info()->exec_domain = lookup_exec_domain(personality); current->personality = personality; - oep = current_thread_info()->exec_domain; - current_thread_info()->exec_domain = ep; - module_put(oep->module); + return 0; } @@ -192,11 +183,8 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) { unsigned int old = current->personality; - if (personality != 0xffffffff) { + if (personality != 0xffffffff) set_personality(personality); - if (current->personality != personality) - return -EINVAL; - } return old; } diff --git a/kernel/exit.c b/kernel/exit.c index ceffc67..e2bdf37 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_EVENTS - WARN_ON_ONCE(tsk->perf_event_ctxp); -#endif + perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -771,9 +769,12 @@ static void forget_original_parent(struct task_struct *father) struct task_struct *p, *n, *reaper; LIST_HEAD(dead_children); - exit_ptrace(father); - write_lock_irq(&tasklist_lock); + /* + * Note that exit_ptrace() and find_new_reaper() might + * drop tasklist_lock and reacquire it. + */ + exit_ptrace(father); reaper = find_new_reaper(father); list_for_each_entry_safe(p, n, &father->children, sibling) { @@ -1383,8 +1384,7 @@ static int wait_task_stopped(struct wait_opts *wo, if (!unlikely(wo->wo_flags & WNOWAIT)) *p_code = 0; - /* don't need the RCU readlock here as we're holding a spinlock */ - uid = __task_cred(p)->uid; + uid = task_uid(p); unlock_sig: spin_unlock_irq(&p->sighand->siglock); if (!exit_code) @@ -1457,7 +1457,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) } if (!unlikely(wo->wo_flags & WNOWAIT)) p->signal->flags &= ~SIGNAL_STOP_CONTINUED; - uid = __task_cred(p)->uid; + uid = task_uid(p); spin_unlock_irq(&p->sighand->siglock); pid = task_pid_vnr(p); diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14..c445f8c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,7 +300,7 @@ out: #ifdef CONFIG_MMU static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; @@ -328,6 +328,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (retval) goto out; + prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -355,11 +356,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; - tmp->vm_mm = mm; - tmp->vm_next = NULL; + tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -392,6 +393,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ *pprev = tmp; pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; __vma_link_rb(mm, tmp, rb_link, rb_parent); rb_link = &tmp->vm_rb.rb_right; @@ -752,13 +755,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - write_lock(&fs->lock); + spin_lock(&fs->lock); if (fs->in_exec) { - write_unlock(&fs->lock); + spin_unlock(&fs->lock); return -EAGAIN; } fs->users++; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); return 0; } tsk->fs = copy_fs_struct(fs); @@ -899,6 +902,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; + sig->oom_score_adj = current->signal->oom_score_adj; return 0; } @@ -907,7 +911,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; @@ -1675,13 +1679,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; - write_lock(&fs->lock); + spin_lock(&fs->lock); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); } if (new_mm) { diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f1..a118bf1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -91,6 +91,7 @@ struct futex_pi_state { /** * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @key: the key the futex is hashed on @@ -104,7 +105,7 @@ struct futex_pi_state { * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakup is always to make the first condition true, then + * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via @@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * - * We have no generic implementation of a non destructive write to the + * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. @@ -429,20 +430,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -524,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ pi_state = this->pi_state; /* - * Userspace might have messed up non PI and PI futexes + * Userspace might have messed up non-PI and PI futexes */ if (unlikely(!pi_state)) return -EINVAL; @@ -564,8 +556,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, @@ -745,8 +737,8 @@ static void wake_futex(struct futex_q *q) /* * We set q->lock_ptr = NULL _before_ we wake up the task. If - * a non futex wake up happens on another CPU then the task - * might exit and p would dereference a non existing task + * a non-futex wake up happens on another CPU then the task + * might exit and p would dereference a non-existing task * struct. Prevent this by holding a reference on p across the * wake up. */ @@ -1140,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * uaddr1: source futex user address - * uaddr2: target futex user address - * nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * nr_requeue: number of waiters to requeue (0-INT_MAX) - * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * @uaddr1: source futex user address + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire @@ -1369,10 +1363,10 @@ out: /* The key must be already stored in q->key. */ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) + __acquires(&hb->lock) { struct futex_hash_bucket *hb; - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1382,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); } /** @@ -1400,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * an example). */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { int prio; @@ -1480,6 +1475,7 @@ retry: * and dropped here. */ static void unqueue_me_pi(struct futex_q *q) + __releases(q->lock_ptr) { WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); @@ -1489,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q) q->pi_state = NULL; spin_unlock(q->lock_ptr); - - drop_futex_key_refs(&q->key); } /* @@ -1821,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, } retry: - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out; @@ -1831,28 +1828,27 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; + /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) - goto out_put_key; + goto out; ret = -ETIMEDOUT; if (to && !to->task) - goto out_put_key; + goto out; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ - if (!signal_pending(current)) { - put_futex_key(fshared, &q.key); + if (!signal_pending(current)) goto retry; - } ret = -ERESTARTSYS; if (!abs_time) - goto out_put_key; + goto out; restart = ¤t_thread_info()->restart_block; restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; + restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; @@ -1865,8 +1861,6 @@ retry: ret = -ERESTART_RESTARTBLOCK; -out_put_key: - put_futex_key(fshared, &q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -1878,7 +1872,7 @@ out: static long futex_wait_restart(struct restart_block *restart) { - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + u32 __user *uaddr = restart->futex.uaddr; int fshared = 0; ktime_t t, *tp = NULL; @@ -2245,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out_key2; @@ -2263,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * In order for us to be here, we know our q.key == key2, and since * we took the hb->lock above, we also know that futex_requeue() has * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquition by the requeue code. + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. */ /* Check if the requeue code acquired the second futex for us. */ @@ -2467,7 +2466,7 @@ retry: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, - int *pi) + unsigned int *pi) { unsigned long uentry; @@ -2656,7 +2655,7 @@ static int __init futex_init(void) * of the complex code paths. Also we want to prevent * registration of robust lists in that case. NULL is * guaranteed to fault and we get -EFAULT on functional - * implementation, the non functional ones will return + * implementation, the non-functional ones will return * -ENOSYS. */ curval = cmpxchg_futex_value_locked(NULL, 0, 0); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2..06da4df 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -19,7 +19,7 @@ */ static inline int fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, int *pi) + compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f8..f83972b 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -33,10 +33,11 @@ * @children: child nodes * @all: list head for list of all nodes * @parent: parent node - * @info: associated profiling data structure if not a directory - * @ghost: when an object file containing profiling data is unloaded we keep a - * copy of the profiling data here to allow collecting coverage data - * for cleanup code. Such a node is called a "ghost". + * @loaded_info: array of pointers to profiling data sets for loaded object + * files. + * @num_loaded: number of profiling data sets for loaded object files. + * @unloaded_info: accumulated copy of profiling data sets for unloaded + * object files. Used only when gcov_persist=1. * @dentry: main debugfs entry, either a directory or data file * @links: associated symbolic links * @name: data file basename @@ -51,10 +52,11 @@ struct gcov_node { struct list_head children; struct list_head all; struct gcov_node *parent; - struct gcov_info *info; - struct gcov_info *ghost; + struct gcov_info **loaded_info; + struct gcov_info *unloaded_info; struct dentry *dentry; struct dentry **links; + int num_loaded; char name[0]; }; @@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { }; /* - * Return the profiling data set for a given node. This can either be the - * original profiling data structure or a duplicate (also called "ghost") - * in case the associated object file has been unloaded. + * Return a profiling data set associated with the given node. This is + * either a data set for a loaded object file or a data set copy in case + * all associated object files have been unloaded. */ static struct gcov_info *get_node_info(struct gcov_node *node) { - if (node->info) - return node->info; + if (node->num_loaded > 0) + return node->loaded_info[0]; - return node->ghost; + return node->unloaded_info; +} + +/* + * Return a newly allocated profiling data set which contains the sum of + * all profiling data associated with the given node. + */ +static struct gcov_info *get_accumulated_info(struct gcov_node *node) +{ + struct gcov_info *info; + int i = 0; + + if (node->unloaded_info) + info = gcov_info_dup(node->unloaded_info); + else + info = gcov_info_dup(node->loaded_info[i++]); + if (!info) + return NULL; + for (; i < node->num_loaded; i++) + gcov_info_add(info, node->loaded_info[i]); + + return info; } /* @@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) mutex_lock(&node_lock); /* * Read from a profiling data copy to minimize reference tracking - * complexity and concurrent access. + * complexity and concurrent access and to keep accumulating multiple + * profiling data sets associated with one node simple. */ - info = gcov_info_dup(get_node_info(node)); + info = get_accumulated_info(node); if (!info) goto out_unlock; iter = gcov_iter_new(info); @@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) return NULL; } +/* + * Reset all profiling data associated with the specified node. + */ +static void reset_node(struct gcov_node *node) +{ + int i; + + if (node->unloaded_info) + gcov_info_reset(node->unloaded_info); + for (i = 0; i < node->num_loaded; i++) + gcov_info_reset(node->loaded_info[i]); +} + static void remove_node(struct gcov_node *node); /* * write() implementation for gcov data files. Reset profiling data for the - * associated file. If the object file has been unloaded (i.e. this is - * a "ghost" node), remove the debug fs node as well. + * corresponding file. If all associated object files have been unloaded, + * remove the debug fs node as well. */ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, node = get_node_by_name(info->filename); if (node) { /* Reset counts or remove node for unloaded modules. */ - if (node->ghost) + if (node->num_loaded == 0) remove_node(node); else - gcov_info_reset(node->info); + reset_node(node); } /* Reset counts for open file. */ gcov_info_reset(info); @@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, INIT_LIST_HEAD(&node->list); INIT_LIST_HEAD(&node->children); INIT_LIST_HEAD(&node->all); - node->info = info; + if (node->loaded_info) { + node->loaded_info[0] = info; + node->num_loaded = 1; + } node->parent = parent; if (name) strcpy(node->name, name); @@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, struct gcov_node *node; node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); - if (!node) { - pr_warning("out of memory\n"); - return NULL; + if (!node) + goto err_nomem; + if (info) { + node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), + GFP_KERNEL); + if (!node->loaded_info) + goto err_nomem; } init_node(node, info, name, parent); /* Differentiate between gcov data file nodes and directory nodes. */ @@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, list_add(&node->all, &all_head); return node; + +err_nomem: + kfree(node); + pr_warning("out of memory\n"); + return NULL; } /* Remove symbolic links associated with node. */ @@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) list_del(&node->all); debugfs_remove(node->dentry); remove_links(node); - if (node->ghost) - gcov_info_free(node->ghost); + kfree(node->loaded_info); + if (node->unloaded_info) + gcov_info_free(node->unloaded_info); kfree(node); } @@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, /* * write() implementation for reset file. Reset all profiling data to zero - * and remove ghost nodes. + * and remove nodes for which all associated object files are unloaded. */ static ssize_t reset_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, mutex_lock(&node_lock); restart: list_for_each_entry(node, &all_head, all) { - if (node->info) - gcov_info_reset(node->info); + if (node->num_loaded > 0) + reset_node(node); else if (list_empty(&node->children)) { remove_node(node); /* Several nodes may have gone - restart loop. */ @@ -564,37 +614,115 @@ err_remove: } /* - * The profiling data set associated with this node is being unloaded. Store a - * copy of the profiling data and turn this node into a "ghost". + * Associate a profiling data set with an existing node. Needs to be called + * with node_lock held. */ -static int ghost_node(struct gcov_node *node) +static void add_info(struct gcov_node *node, struct gcov_info *info) { - node->ghost = gcov_info_dup(node->info); - if (!node->ghost) { - pr_warning("could not save data for '%s' (out of memory)\n", - node->info->filename); - return -ENOMEM; + struct gcov_info **loaded_info; + int num = node->num_loaded; + + /* + * Prepare new array. This is done first to simplify cleanup in + * case the new data set is incompatible, the node only contains + * unloaded data sets and there's not enough memory for the array. + */ + loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); + if (!loaded_info) { + pr_warning("could not add '%s' (out of memory)\n", + info->filename); + return; + } + memcpy(loaded_info, node->loaded_info, + num * sizeof(struct gcov_info *)); + loaded_info[num] = info; + /* Check if the new data set is compatible. */ + if (num == 0) { + /* + * A module was unloaded, modified and reloaded. The new + * data set replaces the copy of the last one. + */ + if (!gcov_info_is_compatible(node->unloaded_info, info)) { + pr_warning("discarding saved data for %s " + "(incompatible version)\n", info->filename); + gcov_info_free(node->unloaded_info); + node->unloaded_info = NULL; + } + } else { + /* + * Two different versions of the same object file are loaded. + * The initial one takes precedence. + */ + if (!gcov_info_is_compatible(node->loaded_info[0], info)) { + pr_warning("could not add '%s' (incompatible " + "version)\n", info->filename); + kfree(loaded_info); + return; + } } - node->info = NULL; + /* Overwrite previous array. */ + kfree(node->loaded_info); + node->loaded_info = loaded_info; + node->num_loaded = num + 1; +} - return 0; +/* + * Return the index of a profiling data set associated with a node. + */ +static int get_info_index(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + for (i = 0; i < node->num_loaded; i++) { + if (node->loaded_info[i] == info) + return i; + } + return -ENOENT; } /* - * Profiling data for this node has been loaded again. Add profiling data - * from previous instantiation and turn this node into a regular node. + * Save the data of a profiling data set which is being unloaded. */ -static void revive_node(struct gcov_node *node, struct gcov_info *info) +static void save_info(struct gcov_node *node, struct gcov_info *info) { - if (gcov_info_is_compatible(node->ghost, info)) - gcov_info_add(info, node->ghost); + if (node->unloaded_info) + gcov_info_add(node->unloaded_info, info); else { - pr_warning("discarding saved data for '%s' (version changed)\n", + node->unloaded_info = gcov_info_dup(info); + if (!node->unloaded_info) { + pr_warning("could not save data for '%s' " + "(out of memory)\n", info->filename); + } + } +} + +/* + * Disassociate a profiling data set from a node. Needs to be called with + * node_lock held. + */ +static void remove_info(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + i = get_info_index(node, info); + if (i < 0) { + pr_warning("could not remove '%s' (not found)\n", info->filename); + return; } - gcov_info_free(node->ghost); - node->ghost = NULL; - node->info = info; + if (gcov_persist) + save_info(node, info); + /* Shrink array. */ + node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; + node->num_loaded--; + if (node->num_loaded > 0) + return; + /* Last loaded data set was removed. */ + kfree(node->loaded_info); + node->loaded_info = NULL; + node->num_loaded = 0; + if (!node->unloaded_info) + remove_node(node); } /* @@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) node = get_node_by_name(info->filename); switch (action) { case GCOV_ADD: - /* Add new node or revive ghost. */ - if (!node) { + if (node) + add_info(node, info); + else add_node(info); - break; - } - if (gcov_persist) - revive_node(node, info); - else { - pr_warning("could not add '%s' (already exists)\n", - info->filename); - } break; case GCOV_REMOVE: - /* Remove node or turn into ghost. */ - if (!node) { + if (node) + remove_info(node, info); + else { pr_warning("could not remove '%s' (not found)\n", info->filename); - break; } - if (gcov_persist) { - if (!ghost_node(node)) - break; - } - remove_node(node); break; } mutex_unlock(&node_lock); diff --git a/kernel/groups.c b/kernel/groups.c index 53b1916..253dc0f 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) + if (grp > GROUP_AT(group_info, mid)) left = mid + 1; - else if (cmp < 0) + else if (grp < GROUP_AT(group_info, mid)) right = mid; else return 1; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e99..72206cf 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -90,7 +90,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); xts = __current_kernel_time(); - tom = wall_to_monotonic; + tom = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); xtim = timespec_to_ktime(xts); @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } @@ -612,7 +608,7 @@ static int hrtimer_reprogram(struct hrtimer *timer, static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base; - struct timespec realtime_offset; + struct timespec realtime_offset, wtm; unsigned long seq; if (!hrtimer_hres_active()) @@ -620,10 +616,9 @@ static void retrigger_next_event(void *arg) do { seq = read_seqbegin(&xtime_lock); - set_normalized_timespec(&realtime_offset, - -wall_to_monotonic.tv_sec, - -wall_to_monotonic.tv_nsec); + wtm = __get_wall_to_monotonic(); } while (read_seqretry(&xtime_lock, seq)); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); base = &__get_cpu_var(hrtimer_bases); @@ -936,6 +931,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { if (hrtimer_is_queued(timer)) { + unsigned long state; int reprogram; /* @@ -949,8 +945,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, - reprogram); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base. + */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; @@ -1096,11 +1097,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; - base = lock_hrtimer_base(timer, &flags); + lock_hrtimer_base(timer, &flags); rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); @@ -1237,6 +1237,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); enqueue_hrtimer(timer, base); } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + timer->state &= ~HRTIMER_STATE_CALLBACK; } diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d5..53ead17 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - __debug_show_held_locks(t); + debug_show_held_locks(t); touch_nmi_watchdog(); @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * exit the grace period. For classic RCU, a reschedule is required. + * to exit the grace period. For classic RCU, a reschedule is required. */ static void rcu_lock_break(struct task_struct *g, struct task_struct *t) { diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 7a56b22..2c9120f 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -41,6 +41,7 @@ #include <linux/sched.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/list.h> #include <linux/cpu.h> #include <linux/smp.h> @@ -62,6 +63,9 @@ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); static int nr_slots[TYPE_MAX]; +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + static int constraints_initialized; /* Gather the number of total pinned and un-pinned bp in a cpuset */ @@ -103,33 +107,21 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) return 0; } -static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type) +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) { - struct perf_event_context *ctx = tsk->perf_event_ctxp; - struct list_head *list; - struct perf_event *bp; - unsigned long flags; + struct task_struct *tsk = bp->hw.bp_target; + struct perf_event *iter; int count = 0; - if (WARN_ONCE(!ctx, "No perf context for this task")) - return 0; - - list = &ctx->event_list; - - raw_spin_lock_irqsave(&ctx->lock, flags); - - /* - * The current breakpoint counter is not included in the list - * at the open() callback time - */ - list_for_each_entry(bp, list, event_entry) { - if (bp->attr.type == PERF_TYPE_BREAKPOINT) - if (find_slot_idx(bp) == type) - count += hw_breakpoint_weight(bp); + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); } - raw_spin_unlock_irqrestore(&ctx->lock, flags); - return count; } @@ -142,14 +134,14 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, enum bp_type_idx type) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; if (cpu >= 0) { slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); if (!tsk) slots->pinned += max_task_bp_pinned(cpu, type); else - slots->pinned += task_bp_pinned(tsk, type); + slots->pinned += task_bp_pinned(bp, type); slots->flexible = per_cpu(nr_bp_flexible[type], cpu); return; @@ -162,7 +154,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, if (!tsk) nr += max_task_bp_pinned(cpu, type); else - nr += task_bp_pinned(tsk, type); + nr += task_bp_pinned(bp, type); if (nr > slots->pinned) slots->pinned = nr; @@ -188,7 +180,7 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight) /* * Add a pinned breakpoint for the given task in our constraint table */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, enum bp_type_idx type, int weight) { unsigned int *tsk_pinned; @@ -196,10 +188,11 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable, int old_idx = 0; int idx = 0; - old_count = task_bp_pinned(tsk, type); + old_count = task_bp_pinned(bp, type); old_idx = old_count - 1; idx = old_idx + weight; + /* tsk_pinned[n] is the number of tasks having n breakpoints */ tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); if (enable) { tsk_pinned[idx]++; @@ -220,25 +213,43 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { int cpu = bp->cpu; - struct task_struct *tsk = bp->ctx->task; + struct task_struct *tsk = bp->hw.bp_target; + + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } /* Pinned counter task profiling */ - if (tsk) { - if (cpu >= 0) { - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; - } + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { for_each_online_cpu(cpu) - toggle_bp_task_slot(tsk, cpu, enable, type, weight); - return; + toggle_bp_task_slot(bp, cpu, enable, type, weight); } - /* Pinned counter cpu profiling */ if (enable) - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; - else - per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ } /* @@ -301,6 +312,10 @@ static int __reserve_bp_slot(struct perf_event *bp) weight = hw_breakpoint_weight(bp); fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. + */ fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ @@ -339,6 +354,7 @@ void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); + arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); @@ -417,7 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -499,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); *pevent = bp; @@ -549,6 +565,61 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { .priority = 0x7fffffff }; +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + +static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + + .event_init = hw_breakpoint_event_init, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, + .read = hw_breakpoint_pmu_read, +}; + static int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; @@ -570,6 +641,8 @@ static int __init init_hw_breakpoint(void) constraints_initialized = 1; + perf_pmu_register(&perf_breakpoint); + return register_die_notifier(&hw_breakpoint_exceptions_nb); err_alloc: @@ -585,8 +658,3 @@ static int __init init_hw_breakpoint(void) core_initcall(init_hw_breakpoint); -struct pmu perf_ops_bp = { - .enable = arch_install_hw_breakpoint, - .disable = arch_uninstall_hw_breakpoint, - .read = hw_breakpoint_pmu_read, -}; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 0000000..31d766b --- /dev/null +++ b/kernel/irq/Kconfig @@ -0,0 +1,53 @@ +config HAVE_GENERIC_HARDIRQS + def_bool n + +if HAVE_GENERIC_HARDIRQS +menu "IRQ subsystem" +# +# Interrupt subsystem related configuration options +# +config GENERIC_HARDIRQS + def_bool y + +config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + +# Select this to disable the deprecated stuff +config GENERIC_HARDIRQS_NO_DEPRECATED + def_bool n + +# Options selectable by the architecture code +config HAVE_SPARSE_IRQ + def_bool n + +config GENERIC_IRQ_PROBE + def_bool n + +config GENERIC_PENDING_IRQ + def_bool n + +config AUTO_IRQ_AFFINITY + def_bool n + +config IRQ_PER_CPU + def_bool n + +config HARDIRQS_SW_RESEND + def_bool n + +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on HAVE_SPARSE_IRQ + ---help--- + + Sparse irq numbering is useful for distro kernels that want + to define a high CONFIG_NR_CPUS value but still want to have + low kernel memory footprint on smaller machines. + + ( Sparse irqs can also be beneficial on NUMA boxes, as they spread + out the interrupt descriptors in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. + +endmenu +endif diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d04780..54329cd 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,7 +1,6 @@ -obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31..505798f 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -57,9 +57,10 @@ unsigned long probe_irq_on(void) * Some chips need to know about probing in * progress: */ - if (desc->chip->set_type) - desc->chip->set_type(i, IRQ_TYPE_PROBE); - desc->chip->startup(i); + if (desc->irq_data.chip->irq_set_type) + desc->irq_data.chip->irq_set_type(&desc->irq_data, + IRQ_TYPE_PROBE); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -76,7 +77,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->chip->startup(i)) + if (desc->irq_data.chip->irq_startup(&desc->irq_data)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -98,7 +99,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. */ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } else if (i < 32) mask |= 1 << i; @@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5..baa5c4a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,108 +18,6 @@ #include "internals.h" -static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc; - unsigned long flags; - - desc = irq_to_desc(irq); - if (!desc) { - WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - return; - } - - /* Ensure we don't have left over values from a previous use of this irq */ - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status = IRQ_DISABLED; - desc->chip = &no_irq_chip; - desc->handle_irq = handle_bad_irq; - desc->depth = 1; - desc->msi_desc = NULL; - desc->handler_data = NULL; - if (!keep_chip_data) - desc->chip_data = NULL; - desc->action = NULL; - desc->irq_count = 0; - desc->irqs_unhandled = 0; -#ifdef CONFIG_SMP - cpumask_setall(desc->affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -#endif - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) -{ - dynamic_irq_init_x(irq, false); -} - -/** - * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->chip_data to NULL - */ -void dynamic_irq_init_keep_chip_data(unsigned int irq) -{ - dynamic_irq_init_x(irq, true); -} - -static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) { - WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->action) { - raw_spin_unlock_irqrestore(&desc->lock, flags); - WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", - irq); - return; - } - desc->msi_desc = NULL; - desc->handler_data = NULL; - if (!keep_chip_data) - desc->chip_data = NULL; - desc->handle_irq = handle_bad_irq; - desc->chip = &no_irq_chip; - desc->name = NULL; - clear_kstat_irqs(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, false); -} - -/** - * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->chip_data to NULL - */ -void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, true); -} - - /** * set_irq_chip - set the irq chip for an irq * @irq: irq number @@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) raw_spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); - desc->chip = chip; + desc->irq_data.chip = chip; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->handler_data = data; + desc->irq_data.handler_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->msi_desc = entry; + desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - if (!desc->chip) { + if (!desc->irq_data.chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } raw_spin_lock_irqsave(&desc->lock, flags); - desc->chip_data = data; + desc->irq_data.chip_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_chip_data); +struct irq_data *irq_get_irq_data(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? &desc->irq_data : NULL; +} +EXPORT_SYMBOL_GPL(irq_get_irq_data); + /** * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq * @@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); /* * default enable function */ -static void default_enable(unsigned int irq) +static void default_enable(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } /* * default disable function */ -static void default_disable(unsigned int irq) +static void default_disable(struct irq_data *data) { } /* * default startup function */ -static unsigned int default_startup(unsigned int irq) +static unsigned int default_startup(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->enable(irq); + desc->irq_data.chip->irq_enable(data); return 0; } /* * default shutdown function */ -static void default_shutdown(unsigned int irq) +static void default_shutdown(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +/* Temporary migration helpers */ +static void compat_irq_mask(struct irq_data *data) +{ + data->chip->mask(data->irq); +} + +static void compat_irq_unmask(struct irq_data *data) +{ + data->chip->unmask(data->irq); +} + +static void compat_irq_ack(struct irq_data *data) +{ + data->chip->ack(data->irq); +} + +static void compat_irq_mask_ack(struct irq_data *data) +{ + data->chip->mask_ack(data->irq); +} + +static void compat_irq_eoi(struct irq_data *data) +{ + data->chip->eoi(data->irq); +} + +static void compat_irq_enable(struct irq_data *data) +{ + data->chip->enable(data->irq); +} + +static void compat_irq_disable(struct irq_data *data) +{ + data->chip->disable(data->irq); +} + +static void compat_irq_shutdown(struct irq_data *data) +{ + data->chip->shutdown(data->irq); +} + +static unsigned int compat_irq_startup(struct irq_data *data) +{ + return data->chip->startup(data->irq); +} + +static int compat_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + return data->chip->set_affinity(data->irq, dest); +} + +static int compat_irq_set_type(struct irq_data *data, unsigned int type) +{ + return data->chip->set_type(data->irq, type); +} + +static int compat_irq_set_wake(struct irq_data *data, unsigned int on) +{ + return data->chip->set_wake(data->irq, on); +} + +static int compat_irq_retrigger(struct irq_data *data) +{ + return data->chip->retrigger(data->irq); +} + +static void compat_bus_lock(struct irq_data *data) +{ + data->chip->bus_lock(data->irq); +} + +static void compat_bus_sync_unlock(struct irq_data *data) +{ + data->chip->bus_sync_unlock(data->irq); +} +#endif + /* * Fixup enable/disable function pointers */ void irq_chip_set_defaults(struct irq_chip *chip) { - if (!chip->enable) - chip->enable = default_enable; - if (!chip->disable) - chip->disable = default_disable; - if (!chip->startup) - chip->startup = default_startup; +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED /* - * We use chip->disable, when the user provided its own. When - * we have default_disable set for chip->disable, then we need + * Compat fixup functions need to be before we set the + * defaults for enable/disable/startup/shutdown + */ + if (chip->enable) + chip->irq_enable = compat_irq_enable; + if (chip->disable) + chip->irq_disable = compat_irq_disable; + if (chip->shutdown) + chip->irq_shutdown = compat_irq_shutdown; + if (chip->startup) + chip->irq_startup = compat_irq_startup; +#endif + /* + * The real defaults + */ + if (!chip->irq_enable) + chip->irq_enable = default_enable; + if (!chip->irq_disable) + chip->irq_disable = default_disable; + if (!chip->irq_startup) + chip->irq_startup = default_startup; + /* + * We use chip->irq_disable, when the user provided its own. When + * we have default_disable set for chip->irq_disable, then we need * to use default_shutdown, otherwise the irq line is not * disabled on free_irq(): */ - if (!chip->shutdown) - chip->shutdown = chip->disable != default_disable ? - chip->disable : default_shutdown; - if (!chip->name) - chip->name = chip->typename; + if (!chip->irq_shutdown) + chip->irq_shutdown = chip->irq_disable != default_disable ? + chip->irq_disable : default_shutdown; + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; + + /* + * Now fix up the remaining compat handlers + */ + if (chip->bus_lock) + chip->irq_bus_lock = compat_bus_lock; + if (chip->bus_sync_unlock) + chip->irq_bus_sync_unlock = compat_bus_sync_unlock; + if (chip->mask) + chip->irq_mask = compat_irq_mask; + if (chip->unmask) + chip->irq_unmask = compat_irq_unmask; + if (chip->ack) + chip->irq_ack = compat_irq_ack; + if (chip->mask_ack) + chip->irq_mask_ack = compat_irq_mask_ack; + if (chip->eoi) + chip->irq_eoi = compat_irq_eoi; + if (chip->set_affinity) + chip->irq_set_affinity = compat_irq_set_affinity; + if (chip->set_type) + chip->irq_set_type = compat_irq_set_type; + if (chip->set_wake) + chip->irq_set_wake = compat_irq_set_wake; + if (chip->retrigger) + chip->irq_retrigger = compat_irq_retrigger; +#endif } -static inline void mask_ack_irq(struct irq_desc *desc, int irq) +static inline void mask_ack_irq(struct irq_desc *desc) { - if (desc->chip->mask_ack) - desc->chip->mask_ack(irq); + if (desc->irq_data.chip->irq_mask_ack) + desc->irq_data.chip->irq_mask_ack(&desc->irq_data); else { - desc->chip->mask(irq); - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); } desc->status |= IRQ_MASKED; } -static inline void mask_irq(struct irq_desc *desc, int irq) +static inline void mask_irq(struct irq_desc *desc) { - if (desc->chip->mask) { - desc->chip->mask(irq); + if (desc->irq_data.chip->irq_mask) { + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } } -static inline void unmask_irq(struct irq_desc *desc, int irq) +static inline void unmask_irq(struct irq_desc *desc) { - if (desc->chip->unmask) { - desc->chip->unmask(irq); + if (desc->irq_data.chip->irq_unmask) { + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } } @@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; raw_spin_lock(&desc->lock); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) - unmask_irq(desc, irq); + unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } @@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - mask_irq(desc, irq); + mask_irq(desc); goto out; } @@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: - desc->chip->eoi(irq); + desc->irq_data.chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } @@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->irq_ack(&desc->irq_data); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - mask_irq(desc, irq); + mask_irq(desc); goto out_unlock; } @@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc, irq); + unmask_irq(desc); } desc->status &= ~IRQ_PENDING; @@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) - desc->chip->eoi(irq); + if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(&desc->irq_data); } void @@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) handle = handle_bad_irq; - else if (desc->chip == &no_irq_chip) { + else if (desc->irq_data.chip == &no_irq_chip) { printk(KERN_WARNING "Trying to install %sinterrupt handler " "for IRQ%d\n", is_chained ? "chained " : "", irq); /* @@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, * prevent us to setup the interrupt at all. Switch it to * dummy_irq_chip for easy transition. */ - desc->chip = &dummy_irq_chip; + desc->irq_data.chip = &dummy_irq_chip; } - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) - mask_ack_irq(desc, irq); + if (desc->irq_data.chip != &no_irq_chip) + mask_ack_irq(desc); desc->status |= IRQ_DISABLED; desc->depth = 1; } @@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void set_irq_noprobe(unsigned int irq) +void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); + if (!desc) return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_NOPROBE; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -void set_irq_probe(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; - } + /* Sanitize flags */ + set &= IRQF_MODIFY_MASK; + clr &= IRQF_MODIFY_MASK; raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~IRQ_NOPROBE; + desc->status &= ~clr; + desc->status |= set; raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 0000000..20dc547 --- /dev/null +++ b/kernel/irq/dummychip.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the dummy interrupt chip implementation + */ +#include <linux/interrupt.h> +#include <linux/irq.h> + +#include "internals.h" + +/* + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. + */ +static void ack_bad(struct irq_data *data) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + print_irq_desc(data->irq, desc); + ack_bad_irq(data->irq); +} + +/* + * NOP functions + */ +static void noop(struct irq_data *data) { } + +static unsigned int noop_ret(struct irq_data *data) +{ + return 0; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static void compat_noop(unsigned int irq) { } +#define END_INIT .end = compat_noop +#else +#define END_INIT +#endif + +/* + * Generic no controller implementation + */ +struct irq_chip no_irq_chip = { + .name = "none", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = ack_bad, + END_INIT +}; + +/* + * Generic dummy implementation which can be used for + * real dumb interrupt sources + */ +struct irq_chip dummy_irq_chip = { + .name = "dummy", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + END_INIT +}; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c69..e2347eb 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,24 +11,15 @@ */ #include <linux/irq.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> #include <linux/random.h> +#include <linux/sched.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> -#include <linux/rculist.h> -#include <linux/hash.h> -#include <linux/radix-tree.h> + #include <trace/events/irq.h> #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) ack_bad_irq(irq); } -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -static void __init init_irq_default_affinity(void) -{ - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); -} -#else -static void __init init_irq_default_affinity(void) -{ -} -#endif - -/* - * Linux has a controller-independent interrupt architecture. - * Every controller has a 'controller-template', that is used - * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the appropriate - * controller. Thus drivers need not be aware of the - * interrupt-controller. - * - * The code is designed to be easily extended with new/different - * interrupt controllers, without having to do assembly magic or - * having to touch the generic code. - * - * Controller mappings for all interrupt sources: - */ -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); - -#ifdef CONFIG_SPARSE_IRQ - -static struct irq_desc irq_desc_init = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -}; - -void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) -{ - void *ptr; - - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - - /* - * don't overwite if can not get new one - * init_copy_kstat_irqs() could still use old one - */ - if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); - desc->kstat_irqs = ptr; - } -} - -static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - - raw_spin_lock_init(&desc->lock); - desc->irq = irq; -#ifdef CONFIG_SMP - desc->node = node; -#endif - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, node, nr_cpu_ids); - if (!desc->kstat_irqs) { - printk(KERN_ERR "can not alloc kstat_irqs\n"); - BUG_ON(1); - } - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } - init_desc_masks(desc); - arch_init_chip_data(desc, node); -} - -/* - * Protect the sparse_irqs: - */ -DEFINE_RAW_SPINLOCK(sparse_irq_lock); - -static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); - -static void set_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - radix_tree_insert(&irq_desc_tree, irq, desc); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return radix_tree_lookup(&irq_desc_tree, irq); -} - -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - void **ptr; - - ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); - if (ptr) - radix_tree_replace_slot(ptr, desc); -} - -static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS_LEGACY-1] = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), - } -}; - -static unsigned int *kstat_irqs_legacy; - -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int legacy_count; - int node; - int i; - - init_irq_default_affinity(); - - /* initialize nr_irqs based on nr_cpu_ids */ - arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); - - desc = irq_desc_legacy; - legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; - - /* allocate based on nr_cpu_ids */ - kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int), GFP_NOWAIT, node); - - for (i = 0; i < legacy_count; i++) { - desc[i].irq = i; -#ifdef CONFIG_SMP - desc[i].node = node; -#endif - desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - alloc_desc_masks(&desc[i], node, true); - init_desc_masks(&desc[i]); - set_irq_desc(i, &desc[i]); - } - - return arch_early_irq_init(); -} - -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= nr_irqs) { - WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", - irq, nr_irqs); - return NULL; - } - - desc = irq_to_desc(irq); - if (desc) - return desc; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - if (desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - - printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); - if (!desc) { - printk(KERN_ERR "can not alloc irq_desc\n"); - BUG_ON(1); - } - init_one_irq_desc(irq, desc, node); - - set_irq_desc(irq, desc); - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -#else /* !CONFIG_SPARSE_IRQ */ - -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS-1] = { - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), - } -}; - -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int count; - int i; - - init_irq_default_affinity(); - - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); - - desc = irq_desc; - count = ARRAY_SIZE(irq_desc); - - for (i = 0; i < count; i++) { - desc[i].irq = i; - alloc_desc_masks(&desc[i], 0, true); - init_desc_masks(&desc[i]); - desc[i].kstat_irqs = kstat_irqs_all[i]; - } - return arch_early_irq_init(); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? irq_desc + irq : NULL; -} - -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} -#endif /* !CONFIG_SPARSE_IRQ */ - -void clear_kstat_irqs(struct irq_desc *desc) -{ - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); -} - -/* - * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. - */ -static void ack_bad(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - print_irq_desc(irq, desc); - ack_bad_irq(irq); -} - -/* - * NOP functions - */ -static void noop(unsigned int irq) -{ -} - -static unsigned int noop_ret(unsigned int irq) -{ - return 0; -} - -/* - * Generic no controller implementation - */ -struct irq_chip no_irq_chip = { - .name = "none", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = ack_bad, - .end = noop, -}; - -/* - * Generic dummy implementation which can be used for - * real dumb interrupt sources - */ -struct irq_chip dummy_irq_chip = { - .name = "dummy", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = noop, - .mask = noop, - .unmask = noop, - .end = noop, -}; - /* * Special, empty irq handler: */ @@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); } - desc->chip->end(irq); + desc->irq_data.chip->end(irq); return 1; } raw_spin_lock(&desc->lock); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested @@ -530,27 +223,9 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. */ - desc->chip->end(irq); + desc->irq_data.chip->end(irq); raw_spin_unlock(&desc->lock); return 1; } #endif - -void early_init_irq_lock_class(void) -{ - struct irq_desc *desc; - int i; - - for_each_irq_desc(i, desc) { - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - } -} - -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; -} -EXPORT_SYMBOL(kstat_irqs_cpu); - diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc..4571ae7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,9 +1,12 @@ /* * IRQ subsystem internal functions and variables: */ +#include <linux/irqdesc.h> extern int noirqdebug; +#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) + /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); @@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -extern void clear_kstat_irqs(struct irq_desc *desc); -extern raw_spinlock_t sparse_irq_lock; -#ifdef CONFIG_SPARSE_IRQ -void replace_irq_desc(unsigned int irq, struct irq_desc *desc); -#endif +/* Resending of interrupts :*/ +void check_irq_resend(struct irq_desc *desc, unsigned int irq); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); +extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } +static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, @@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static inline void irq_end(unsigned int irq, struct irq_desc *desc) +{ + if (desc->irq_data.chip && desc->irq_data.chip->end) + desc->irq_data.chip->end(irq); +} +#else +static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } +#endif + /* Inline functions for support of irq chips on slow busses */ -static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_lock(struct irq_desc *desc) { - if (unlikely(desc->chip->bus_lock)) - desc->chip->bus_lock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_lock)) + desc->irq_data.chip->irq_bus_lock(&desc->irq_data); } -static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_sync_unlock(struct irq_desc *desc) { - if (unlikely(desc->chip->bus_sync_unlock)) - desc->chip->bus_sync_unlock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) + desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } /* @@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->chip(): %p, ", desc->chip); - print_symbol("%s\n", (unsigned long)desc->chip); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { printk("->action->handler(): %p, ", desc->action->handler); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 0000000..9d917ff --- /dev/null +++ b/kernel/irq/irqdesc.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the interrupt descriptor management code + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ +#include <linux/irq.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/radix-tree.h> +#include <linux/bitmap.h> + +#include "internals.h" + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +static void __init init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpumask_setall(irq_default_affinity); +} +#else +static void __init init_irq_default_affinity(void) +{ +} +#endif + +#ifdef CONFIG_SMP +static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +{ + if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + return -ENOMEM; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + free_cpumask_var(desc->irq_data.affinity); + return -ENOMEM; + } +#endif + return 0; +} + +static void desc_smp_init(struct irq_desc *desc, int node) +{ + desc->irq_data.node = node; + cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + +static inline int desc_node(struct irq_desc *desc) +{ + return desc->irq_data.node; +} + +#else +static inline int +alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline int desc_node(struct irq_desc *desc) { return 0; } +#endif + +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +{ + desc->irq_data.irq = irq; + desc->irq_data.chip = &no_irq_chip; + desc->irq_data.chip_data = NULL; + desc->irq_data.handler_data = NULL; + desc->irq_data.msi_desc = NULL; + desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->irq_count = 0; + desc->irqs_unhandled = 0; + desc->name = NULL; + memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + desc_smp_init(desc, node); +} + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + +static DEFINE_MUTEX(sparse_irq_lock); +static DECLARE_BITMAP(allocated_irqs, NR_IRQS); + +#ifdef CONFIG_SPARSE_IRQ + +static RADIX_TREE(irq_desc_tree, GFP_KERNEL); + +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +static void delete_irq_desc(unsigned int irq) +{ + radix_tree_delete(&irq_desc_tree, irq); +} + +#ifdef CONFIG_SMP +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->irq_data.affinity); +} +#else +static inline void free_masks(struct irq_desc *desc) { } +#endif + +static struct irq_desc *alloc_desc(int irq, int node) +{ + struct irq_desc *desc; + gfp_t gfp = GFP_KERNEL; + + desc = kzalloc_node(sizeof(*desc), gfp, node); + if (!desc) + return NULL; + /* allocate based on nr_cpu_ids */ + desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), + gfp, node); + if (!desc->kstat_irqs) + goto err_desc; + + if (alloc_masks(desc, gfp, node)) + goto err_kstat; + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + + desc_set_defaults(irq, desc, node); + + return desc; + +err_kstat: + kfree(desc->kstat_irqs); +err_desc: + kfree(desc); + return NULL; +} + +static void free_desc(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unregister_irq_proc(irq, desc); + + mutex_lock(&sparse_irq_lock); + delete_irq_desc(irq); + mutex_unlock(&sparse_irq_lock); + + free_masks(desc); + kfree(desc->kstat_irqs); + kfree(desc); +} + +static int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + struct irq_desc *desc; + int i; + + for (i = 0; i < cnt; i++) { + desc = alloc_desc(start + i, node); + if (!desc) + goto err; + mutex_lock(&sparse_irq_lock); + irq_insert_desc(start + i, desc); + mutex_unlock(&sparse_irq_lock); + } + return start; + +err: + for (i--; i >= 0; i--) + free_desc(start + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return -ENOMEM; +} + +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) +{ + int res = irq_alloc_descs(irq, irq, 1, node); + + if (res == -EEXIST || res == irq) + return irq_to_desc(irq); + return NULL; +} + +int __init early_irq_init(void) +{ + int i, initcnt, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + /* Let arch update nr_irqs and return the nr of preallocated irqs */ + initcnt = arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + + for (i = 0; i < initcnt; i++) { + desc = alloc_desc(i, node); + set_bit(i, allocated_irqs); + irq_insert_desc(i, desc); + } + return arch_early_irq_init(); +} + +#else /* !CONFIG_SPARSE_IRQ */ + +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS-1] = { + .status = IRQ_DEFAULT_INIT_FLAGS, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + } +}; + +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; +int __init early_irq_init(void) +{ + int count, i, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) { + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; + desc[i].kstat_irqs = kstat_irqs_all[i]; + alloc_masks(desc + i, GFP_KERNEL, node); + desc_smp_init(desc + i, node); + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + } + return arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} + +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) +{ + return irq_to_desc(irq); +} + +static void free_desc(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} + +static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + return start; +} +#endif /* !CONFIG_SPARSE_IRQ */ + +/* Dynamic interrupt handling */ + +/** + * irq_free_descs - free irq descriptors + * @from: Start of descriptor range + * @cnt: Number of consecutive irqs to free + */ +void irq_free_descs(unsigned int from, unsigned int cnt) +{ + int i; + + if (from >= nr_irqs || (from + cnt) > nr_irqs) + return; + + for (i = 0; i < cnt; i++) + free_desc(from + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, from, cnt); + mutex_unlock(&sparse_irq_lock); +} + +/** + * irq_alloc_descs - allocate and initialize a range of irq descriptors + * @irq: Allocate for specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate. + * @node: Preferred node on which the irq descriptor should be allocated + * + * Returns the first irq number or error code + */ +int __ref +irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +{ + int start, ret; + + if (!cnt) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + ret = -EEXIST; + if (irq >=0 && start != irq) + goto err; + + ret = -ENOMEM; + if (start >= nr_irqs) + goto err; + + bitmap_set(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return alloc_descs(start, cnt, node); + +err: + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_reserve_irqs - mark irqs allocated + * @from: mark from irq number + * @cnt: number of irqs to mark + * + * Returns 0 on success or an appropriate error code + */ +int irq_reserve_irqs(unsigned int from, unsigned int cnt) +{ + unsigned int start; + int ret = 0; + + if (!cnt || (from + cnt) > nr_irqs) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + if (start == from) + bitmap_set(allocated_irqs, start, cnt); + else + ret = -EEXIST; + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_get_next_irq - get next allocated irq number + * @offset: where to start the search + * + * Returns next irq number after offset or nr_irqs if none is found. + */ +unsigned int irq_get_next_irq(unsigned int offset) +{ + return find_next_bit(allocated_irqs, nr_irqs, offset); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); +} + +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc ? desc->kstat_irqs[cpu] : 0; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index e149748..644e8d5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || - !desc->chip->set_affinity) + if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || + !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc) int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; - if (!desc->chip->set_affinity) + if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } } @@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->pending_mask, cpumask); } #else - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } #endif @@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, desc->affinity); + desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); return 0; } @@ -216,14 +217,14 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc) void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) { if (suspend) { - if (!desc->action || (desc->action->flags & IRQF_TIMER)) + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) return; desc->status |= IRQ_SUSPENDED; } if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } } @@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * IRQ line is re-enabled. * * This function may be called from IRQ context only when - * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! */ void enable_irq(unsigned int irq) { @@ -323,11 +324,11 @@ void enable_irq(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(enable_irq); @@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; - if (desc->chip->set_wake) - ret = desc->chip->set_wake(irq, on); + if (desc->irq_data.chip->irq_set_wake) + ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } @@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) + unsigned long flags) { int ret; - struct irq_chip *chip = desc->chip; + struct irq_chip *chip = desc->irq_data.chip; - if (!chip || !chip->set_type) { + if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? @@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } /* caller masked out all except trigger mode flags */ - ret = chip->set_type(irq, flags); + ret = chip->irq_set_type(&desc->irq_data, flags); if (ret) - pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)flags, irq, chip->set_type); + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); else { if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) flags |= IRQ_LEVEL; @@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; - if (chip != desc->chip) - irq_chip_set_defaults(desc->chip); + if (chip != desc->irq_data.chip) + irq_chip_set_defaults(desc->irq_data.chip); } return ret; @@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { again: - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* @@ -521,17 +522,17 @@ again: */ if (unlikely(desc->status & IRQ_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); cpu_relax(); goto again; } if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP @@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->affinity); + cpumask_copy(mask, desc->irq_data.affinity); raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); @@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!desc) return -EINVAL; - if (desc->chip == &no_irq_chip) + if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, @@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - irq_chip_set_defaults(desc->chip); + irq_chip_set_defaults(desc->irq_data.chip); init_waitqueue_head(&desc->wait_for_threads); @@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - desc->chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } else /* Undo nested disables: */ desc->depth = 1; @@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); + if (desc->irq_data.chip->release) + desc->irq_data.chip->release(irq, dev_id); #endif /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); else - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } #ifdef CONFIG_SMP @@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); @@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); if (retval) kfree(action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 2419622..1d25419 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,6 +7,7 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -24,7 +25,7 @@ void move_masked_irq(int irq) if (unlikely(cpumask_empty(desc->pending_mask))) return; - if (!desc->chip->set_affinity) + if (!chip->irq_set_affinity) return; assert_raw_spin_locked(&desc->lock); @@ -43,8 +44,9 @@ void move_masked_irq(int irq) */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) - if (!desc->chip->set_affinity(irq, desc->pending_mask)) { - cpumask_copy(desc->affinity, desc->pending_mask); + if (!chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false)) { + cpumask_copy(desc->irq_data.affinity, desc->pending_mask); irq_set_thread_affinity(desc); } @@ -61,8 +63,8 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index 65d3845..0000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * NUMA irq-desc migration code - * - * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to - * the new "home node" of the IRQ. - */ - -#include <linux/irq.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> - -#include "internals.h" - -static void init_copy_kstat_irqs(struct irq_desc *old_desc, - struct irq_desc *desc, - int node, int nr) -{ - init_kstat_irqs(desc, node, nr); - - if (desc->kstat_irqs != old_desc->kstat_irqs) - memcpy(desc->kstat_irqs, old_desc->kstat_irqs, - nr * sizeof(*desc->kstat_irqs)); -} - -static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) -{ - if (old_desc->kstat_irqs == desc->kstat_irqs) - return; - - kfree(old_desc->kstat_irqs); - old_desc->kstat_irqs = NULL; -} - -static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int node) -{ - memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); - return false; - } - raw_spin_lock_init(&desc->lock); - desc->node = node; - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); - init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, node); - return true; -} - -static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) -{ - free_kstat_irqs(old_desc, desc); - free_desc_masks(old_desc, desc); - arch_free_chip_data(old_desc, desc); -} - -static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int node) -{ - struct irq_desc *desc; - unsigned int irq; - unsigned long flags; - - irq = old_desc->irq; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - - if (desc && old_desc != desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc " - "for migration.\n", irq); - /* still use old one */ - desc = old_desc; - goto out_unlock; - } - if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { - /* still use old one */ - kfree(desc); - desc = old_desc; - goto out_unlock; - } - - replace_irq_desc(irq, desc); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - /* free the old one */ - free_one_irq_desc(old_desc, desc); - kfree(old_desc); - - return desc; - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - /* those static or target node is -1, do not move them */ - if (desc->irq < NR_IRQS_LEGACY || node == -1) - return desc; - - if (desc->node != node) - desc = __real_move_irq_desc(desc, node); - - return desc; -} - diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee5..01b1d3a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->affinity; + const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); - seq_printf(m, "%d\n", desc->node); + seq_printf(m, "%d\n", desc->irq_data.node); return 0; } @@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); @@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_spurious_proc_fops, (void *)(long)irq); } +void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || !desc->dir) + return; +#ifdef CONFIG_SMP + remove_proc_entry("smp_affinity", desc->dir); + remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("node", desc->dir); +#endif + remove_proc_entry("spurious", desc->dir); + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%u", irq); + remove_proc_entry(name, root_irq_dir); +} + #undef MAX_NAMELEN void unregister_handler_proc(unsigned int irq, struct irqaction *action) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c376..891115a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * Make sure the interrupt is enabled, before resending it: */ - desc->chip->enable(irq); + desc->irq_data.chip->irq_enable(&desc->irq_data); /* * We do not resend level type interrupts. Level type @@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90a..3089d3b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -14,6 +14,8 @@ #include <linux/moduleparam.h> #include <linux/timer.h> +#include "internals.h" + static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) @@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(irq); + if (work) + irq_end(irq, desc); raw_spin_unlock(&desc->lock); return ok; @@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 0000000..f16763f --- /dev/null +++ b/kernel/irq_work.c @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + * Provides a framework for enqueueing and running callbacks from hardirq + * context. The enqueueing is NMI-safe. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/irq_work.h> +#include <linux/hardirq.h> + +/* + * An entry can be in one of four states: + * + * free NULL, 0 -> {claimed} : free to be used + * claimed NULL, 3 -> {pending} : claimed to be enqueued + * pending next, 3 -> {busy} : queued, pending callback + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed + * + * We use the lower two bits of the next pointer to keep PENDING and BUSY + * flags. + */ + +#define IRQ_WORK_PENDING 1UL +#define IRQ_WORK_BUSY 2UL +#define IRQ_WORK_FLAGS 3UL + +static inline bool irq_work_is_set(struct irq_work *entry, int flags) +{ + return (unsigned long)entry->next & flags; +} + +static inline struct irq_work *irq_work_next(struct irq_work *entry) +{ + unsigned long next = (unsigned long)entry->next; + next &= ~IRQ_WORK_FLAGS; + return (struct irq_work *)next; +} + +static inline struct irq_work *next_flags(struct irq_work *entry, int flags) +{ + unsigned long next = (unsigned long)entry; + next |= flags; + return (struct irq_work *)next; +} + +static DEFINE_PER_CPU(struct irq_work *, irq_work_list); + +/* + * Claim the entry so that no one else will poke at it. + */ +static bool irq_work_claim(struct irq_work *entry) +{ + struct irq_work *next, *nflags; + + do { + next = entry->next; + if ((unsigned long)next & IRQ_WORK_PENDING) + return false; + nflags = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(&entry->next, next, nflags) != next); + + return true; +} + + +void __weak arch_irq_work_raise(void) +{ + /* + * Lame architectures will get the timer tick callback + */ +} + +/* + * Queue the entry and raise the IPI if needed. + */ +static void __irq_work_queue(struct irq_work *entry) +{ + struct irq_work **head, *next; + + head = &get_cpu_var(irq_work_list); + + do { + next = *head; + /* Can assign non-atomic because we keep the flags set. */ + entry->next = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(head, next, entry) != next); + + /* The list was empty, raise self-interrupt to start processing. */ + if (!irq_work_next(entry)) + arch_irq_work_raise(); + + put_cpu_var(irq_work_list); +} + +/* + * Enqueue the irq_work @entry, returns true on success, failure when the + * @entry was already enqueued by someone else. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue(struct irq_work *entry) +{ + if (!irq_work_claim(entry)) { + /* + * Already enqueued, can't do! + */ + return false; + } + + __irq_work_queue(entry); + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue); + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + struct irq_work *list, **head; + + head = &__get_cpu_var(irq_work_list); + if (*head == NULL) + return; + + BUG_ON(!in_irq()); + BUG_ON(!irqs_disabled()); + + list = xchg(head, NULL); + while (list != NULL) { + struct irq_work *entry = list; + + list = irq_work_next(list); + + /* + * Clear the PENDING bit, after this point the @entry + * can be re-used. + */ + entry->next = next_flags(NULL, IRQ_WORK_BUSY); + entry->func(entry); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + } +} +EXPORT_SYMBOL_GPL(irq_work_run); + +/* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. + */ +void irq_work_sync(struct irq_work *entry) +{ + WARN_ON_ONCE(irqs_disabled()); + + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + cpu_relax(); +} +EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 0000000..7be868b --- /dev/null +++ b/kernel/jump_label.c @@ -0,0 +1,429 @@ +/* + * jump label support + * + * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> + * + */ +#include <linux/jump_label.h> +#include <linux/memory.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/err.h> + +#ifdef HAVE_JUMP_LABEL + +#define JUMP_LABEL_HASH_BITS 6 +#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) +static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; + +/* mutex to protect coming/going of the the jump_label table */ +static DEFINE_MUTEX(jump_label_mutex); + +struct jump_label_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + /* hang modules off here */ + struct hlist_head modules; + unsigned long key; +}; + +struct jump_label_module_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + struct module *mod; +}; + +static int jump_label_cmp(const void *a, const void *b) +{ + const struct jump_entry *jea = a; + const struct jump_entry *jeb = b; + + if (jea->key < jeb->key) + return -1; + + if (jea->key > jeb->key) + return 1; + + return 0; +} + +static void +sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +{ + unsigned long size; + + size = (((unsigned long)stop - (unsigned long)start) + / sizeof(struct jump_entry)); + sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); +} + +static struct jump_label_entry *get_jump_label_entry(jump_label_t key) +{ + struct hlist_head *head; + struct hlist_node *node; + struct jump_label_entry *e; + u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); + + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (key == e->key) + return e; + } + return NULL; +} + +static struct jump_label_entry * +add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +{ + struct hlist_head *head; + struct jump_label_entry *e; + u32 hash; + + e = get_jump_label_entry(key); + if (e) + return ERR_PTR(-EEXIST); + + e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + hash = jhash((void *)&key, sizeof(jump_label_t), 0); + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + e->key = key; + e->table = table; + e->nr_entries = nr_entries; + INIT_HLIST_HEAD(&(e->modules)); + hlist_add_head(&e->hlist, head); + return e; +} + +static int +build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + int count; + + sort_jump_label_entries(start, stop); + iter = start; + while (iter < stop) { + entry = get_jump_label_entry(iter->key); + if (!entry) { + iter_begin = iter; + count = 0; + while ((iter < stop) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + entry = add_jump_label_entry(iter_begin->key, + count, iter_begin); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } else { + WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); + return -1; + } + } + return 0; +} + +/*** + * jump_label_update - update jump label text + * @key - key value associated with a a jump label + * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE + * + * Will enable/disable the jump for jump label @key, depending on the + * value of @type. + * + */ + +void jump_label_update(unsigned long key, enum jump_label_type type) +{ + struct jump_entry *iter; + struct jump_label_entry *entry; + struct hlist_node *module_node; + struct jump_label_module_entry *e_module; + int count; + + mutex_lock(&jump_label_mutex); + entry = get_jump_label_entry((jump_label_t)key); + if (entry) { + count = entry->nr_entries; + iter = entry->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + /* eanble/disable jump labels in modules */ + hlist_for_each_entry(e_module, module_node, &(entry->modules), + hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + } + } + mutex_unlock(&jump_label_mutex); +} + +static int addr_conflict(struct jump_entry *entry, void *start, void *end) +{ + if (entry->code <= (unsigned long)end && + entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +#ifdef CONFIG_MODULES + +static int module_conflict(void *start, void *end) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + int conflict = 0; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + } + } + } +out: + return conflict; +} + +#endif + +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + struct jump_entry *iter; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __start___jump_table; + int conflict = 0; + + mutex_lock(&jump_label_mutex); + iter = iter_start; + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + + /* now check modules */ +#ifdef CONFIG_MODULES + conflict = module_conflict(start, end); +#endif +out: + mutex_unlock(&jump_label_mutex); + return conflict; +} + +static __init int init_jump_label(void) +{ + int ret; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + mutex_lock(&jump_label_mutex); + ret = build_jump_label_hashtable(__start___jump_table, + __stop___jump_table); + iter = iter_start; + while (iter < iter_stop) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } + mutex_unlock(&jump_label_mutex); + return ret; +} +early_initcall(init_jump_label); + +#ifdef CONFIG_MODULES + +static struct jump_label_module_entry * +add_jump_label_module_entry(struct jump_label_entry *entry, + struct jump_entry *iter_begin, + int count, struct module *mod) +{ + struct jump_label_module_entry *e; + + e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + e->mod = mod; + e->nr_entries = count; + e->table = iter_begin; + hlist_add_head(&e->hlist, &entry->modules); + return e; +} + +static int add_jump_label_module(struct module *mod) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + struct jump_label_module_entry *module_entry; + int count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return 0; + + sort_jump_label_entries(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries); + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + entry = get_jump_label_entry(iter->key); + iter_begin = iter; + count = 0; + while ((iter < mod->jump_entries + mod->num_jump_entries) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + if (!entry) { + entry = add_jump_label_entry(iter_begin->key, 0, NULL); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } + module_entry = add_jump_label_module_entry(entry, iter_begin, + count, mod); + if (IS_ERR(module_entry)) + return PTR_ERR(module_entry); + } + return 0; +} + +static void remove_jump_label_module(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + int i; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod == mod) { + hlist_del(&e_module->hlist); + kfree(e_module); + } + } + if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { + hlist_del(&e->hlist); + kfree(e); + } + } + } +} + +static int +jump_label_module_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + mutex_lock(&jump_label_mutex); + ret = add_jump_label_module(mod); + if (ret) + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + case MODULE_STATE_GOING: + mutex_lock(&jump_label_mutex); + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + } + return ret; +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) +{ + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } +} + +struct notifier_block jump_label_module_nb = { + .notifier_call = jump_label_module_notify, + .priority = 0, +}; + +static __init int init_jump_label_module(void) +{ + return register_module_notifier(&jump_label_module_nb); +} +early_initcall(init_jump_label_module); + +#endif /* CONFIG_MODULES */ + +#endif diff --git a/kernel/kexec.c b/kernel/kexec.c index 474a847..c0613f7 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -151,8 +151,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); result = copy_from_user(image->segment, segments, segment_bytes); - if (result) + if (result) { + result = -EFAULT; goto out; + } /* * Verify we have good destination addresses. The caller is @@ -827,7 +829,7 @@ static int kimage_load_normal_segment(struct kimage *image, result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { - result = (result < 0) ? result : -EIO; + result = -EFAULT; goto out; } ubytes -= uchunk; @@ -882,7 +884,7 @@ static int kimage_load_crash_segment(struct kimage *image, kexec_flush_icache_page(page); kunmap(page); if (result) { - result = (result < 0) ? result : -EIO; + result = -EFAULT; goto out; } ubytes -= uchunk; @@ -1089,9 +1091,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); crashk_res.end = end - 1; diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 35edbe2..01a0700 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -1,8 +1,7 @@ /* - * A generic kernel FIFO implementation. + * A generic kernel FIFO implementation * - * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net> - * Copyright (C) 2004 Stelian Pop <stelian@popies.net> + * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -11,7 +10,7 @@ * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License @@ -24,422 +23,586 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/err.h> -#include <linux/kfifo.h> #include <linux/log2.h> #include <linux/uaccess.h> +#include <linux/kfifo.h> -static void _kfifo_init(struct kfifo *fifo, void *buffer, - unsigned int size) -{ - fifo->buffer = buffer; - fifo->size = size; - - kfifo_reset(fifo); -} - -/** - * kfifo_init - initialize a FIFO using a preallocated buffer - * @fifo: the fifo to assign the buffer - * @buffer: the preallocated buffer to be used. - * @size: the size of the internal buffer, this has to be a power of 2. - * +/* + * internal helper to calculate the unused elements in a fifo */ -void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size) +static inline unsigned int kfifo_unused(struct __kfifo *fifo) { - /* size must be a power of 2 */ - BUG_ON(!is_power_of_2(size)); - - _kfifo_init(fifo, buffer, size); + return (fifo->mask + 1) - (fifo->in - fifo->out); } -EXPORT_SYMBOL(kfifo_init); -/** - * kfifo_alloc - allocates a new FIFO internal buffer - * @fifo: the fifo to assign then new buffer - * @size: the size of the buffer to be allocated, this have to be a power of 2. - * @gfp_mask: get_free_pages mask, passed to kmalloc() - * - * This function dynamically allocates a new fifo internal buffer - * - * The size will be rounded-up to a power of 2. - * The buffer will be release with kfifo_free(). - * Return 0 if no error, otherwise the an error code - */ -int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) +int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) { - unsigned char *buffer; - /* - * round up to the next power of 2, since our 'let the indices + * round down to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ - if (!is_power_of_2(size)) { - BUG_ON(size > 0x80000000); - size = roundup_pow_of_two(size); + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + + if (size < 2) { + fifo->data = NULL; + fifo->mask = 0; + return -EINVAL; } - buffer = kmalloc(size, gfp_mask); - if (!buffer) { - _kfifo_init(fifo, NULL, 0); + fifo->data = kmalloc(size * esize, gfp_mask); + + if (!fifo->data) { + fifo->mask = 0; return -ENOMEM; } - - _kfifo_init(fifo, buffer, size); + fifo->mask = size - 1; return 0; } -EXPORT_SYMBOL(kfifo_alloc); +EXPORT_SYMBOL(__kfifo_alloc); -/** - * kfifo_free - frees the FIFO internal buffer - * @fifo: the fifo to be freed. - */ -void kfifo_free(struct kfifo *fifo) +void __kfifo_free(struct __kfifo *fifo) { - kfree(fifo->buffer); - _kfifo_init(fifo, NULL, 0); + kfree(fifo->data); + fifo->in = 0; + fifo->out = 0; + fifo->esize = 0; + fifo->data = NULL; + fifo->mask = 0; } -EXPORT_SYMBOL(kfifo_free); +EXPORT_SYMBOL(__kfifo_free); -/** - * kfifo_skip - skip output data - * @fifo: the fifo to be used. - * @len: number of bytes to skip - */ -void kfifo_skip(struct kfifo *fifo, unsigned int len) +int __kfifo_init(struct __kfifo *fifo, void *buffer, + unsigned int size, size_t esize) { - if (len < kfifo_len(fifo)) { - __kfifo_add_out(fifo, len); - return; + size /= esize; + + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + fifo->data = buffer; + + if (size < 2) { + fifo->mask = 0; + return -EINVAL; } - kfifo_reset_out(fifo); + fifo->mask = size - 1; + + return 0; } -EXPORT_SYMBOL(kfifo_skip); +EXPORT_SYMBOL(__kfifo_init); -static inline void __kfifo_in_data(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int off) +static void kfifo_copy_in(struct __kfifo *fifo, const void *src, + unsigned int len, unsigned int off) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(fifo->data + off, src, l); + memcpy(fifo->data, src + l, len - l); /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter */ + smp_wmb(); +} - smp_mb(); - - off = __kfifo_off(fifo, fifo->in + off); +unsigned int __kfifo_in(struct __kfifo *fifo, + const void *buf, unsigned int len) +{ + unsigned int l; - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - off); - memcpy(fifo->buffer + off, from, l); + l = kfifo_unused(fifo); + if (len > l) + len = l; - /* then put the rest (if any) at the beginning of the buffer */ - memcpy(fifo->buffer, from + l, len - l); + kfifo_copy_in(fifo, buf, len, fifo->in); + fifo->in += len; + return len; } +EXPORT_SYMBOL(__kfifo_in); -static inline void __kfifo_out_data(struct kfifo *fifo, - void *to, unsigned int len, unsigned int off) +static void kfifo_copy_out(struct __kfifo *fifo, void *dst, + unsigned int len, unsigned int off) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(dst, fifo->data + off, l); + memcpy(dst + l, fifo->data, len - l); /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. + * make sure that the data is copied before + * incrementing the fifo->out index counter */ + smp_wmb(); +} - smp_rmb(); +unsigned int __kfifo_out_peek(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + unsigned int l; - off = __kfifo_off(fifo, fifo->out + off); + l = fifo->in - fifo->out; + if (len > l) + len = l; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - memcpy(to, fifo->buffer + off, l); + kfifo_copy_out(fifo, buf, len, fifo->out); + return len; +} +EXPORT_SYMBOL(__kfifo_out_peek); - /* then get the rest (if any) from the beginning of the buffer */ - memcpy(to + l, fifo->buffer, len - l); +unsigned int __kfifo_out(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + len = __kfifo_out_peek(fifo, buf, len); + fifo->out += len; + return len; } +EXPORT_SYMBOL(__kfifo_out); -static inline int __kfifo_from_user_data(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int off, - unsigned *lenout) +static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned int *copied) { + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; unsigned int l; - int ret; + unsigned long ret; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_from_user(fifo->data + off, from, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_from_user(fifo->data, from + l, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } /* - * Ensure that we sample the fifo->out index -before- we - * start putting bytes into the kfifo. + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} - smp_mb(); +int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; - off = __kfifo_off(fifo, fifo->in + off); + if (esize != 1) + len /= esize; - /* first put the data starting from fifo->in to buffer end */ - l = min(len, fifo->size - off); - ret = copy_from_user(fifo->buffer + off, from, l); - if (unlikely(ret)) { - *lenout = ret; - return -EFAULT; - } - *lenout = l; + l = kfifo_unused(fifo); + if (len > l) + len = l; - /* then put the rest (if any) at the beginning of the buffer */ - ret = copy_from_user(fifo->buffer, from + l, len - l); - *lenout += ret ? ret : len - l; - return ret ? -EFAULT : 0; + ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->in += len; + return err; } +EXPORT_SYMBOL(__kfifo_from_user); -static inline int __kfifo_to_user_data(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int off, unsigned *lenout) +static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, + unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; - int ret; - + unsigned long ret; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_to_user(to, fifo->data + off, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_to_user(to + l, fifo->data, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } /* - * Ensure that we sample the fifo->in index -before- we - * start removing bytes from the kfifo. + * make sure that the data is copied before + * incrementing the fifo->out index counter */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} - smp_rmb(); +int __kfifo_to_user(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; - off = __kfifo_off(fifo, fifo->out + off); + if (esize != 1) + len /= esize; - /* first get the data from fifo->out until the end of the buffer */ - l = min(len, fifo->size - off); - ret = copy_to_user(to, fifo->buffer + off, l); - *lenout = l; + l = fifo->in - fifo->out; + if (len > l) + len = l; + ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { - *lenout -= ret; - return -EFAULT; - } + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->out += len; + return err; +} +EXPORT_SYMBOL(__kfifo_to_user); - /* then get the rest (if any) from the beginning of the buffer */ - len -= l; - ret = copy_to_user(to + l, fifo->buffer, len); - if (unlikely(ret)) { - *lenout += len - ret; - return -EFAULT; +static int setup_sgl_buf(struct scatterlist *sgl, void *buf, + int nents, unsigned int len) +{ + int n; + unsigned int l; + unsigned int off; + struct page *page; + + if (!nents) + return 0; + + if (!len) + return 0; + + n = 0; + page = virt_to_page(buf); + off = offset_in_page(buf); + l = 0; + + while (len >= l + PAGE_SIZE - off) { + struct page *npage; + + l += PAGE_SIZE; + buf += PAGE_SIZE; + npage = virt_to_page(buf); + if (page_to_phys(page) != page_to_phys(npage) - l) { + sg_set_page(sgl, page, l - off, off); + sgl = sg_next(sgl); + if (++n == nents || sgl == NULL) + return n; + page = npage; + len -= l - off; + l = off = 0; + } } - *lenout += len; - return 0; + sg_set_page(sgl, page, len, off); + return n + 1; } -unsigned int __kfifo_in_n(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int recsize) +static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, + int nents, unsigned int len, unsigned int off) { - if (kfifo_avail(fifo) < len + recsize) - return len + 1; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned int n; - __kfifo_in_data(fifo, from, len, recsize); - return 0; + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + n = setup_sgl_buf(sgl, fifo->data + off, nents, l); + n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); + + return n; } -EXPORT_SYMBOL(__kfifo_in_n); -/** - * kfifo_in - puts some data into the FIFO - * @fifo: the fifo to be used. - * @from: the data to be added. - * @len: the length of the data to be added. - * - * This function copies at most @len bytes from the @from buffer into - * the FIFO depending on the free space, and returns the number of - * bytes copied. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -unsigned int kfifo_in(struct kfifo *fifo, const void *from, - unsigned int len) +unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) { - len = min(kfifo_avail(fifo), len); + unsigned int l; - __kfifo_in_data(fifo, from, len, 0); - __kfifo_add_in(fifo, len); - return len; + l = kfifo_unused(fifo); + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->in); } -EXPORT_SYMBOL(kfifo_in); +EXPORT_SYMBOL(__kfifo_dma_in_prepare); -unsigned int __kfifo_in_generic(struct kfifo *fifo, - const void *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) { - return __kfifo_in_rec(fifo, from, len, recsize); + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->out); } -EXPORT_SYMBOL(__kfifo_in_generic); +EXPORT_SYMBOL(__kfifo_dma_out_prepare); -unsigned int __kfifo_out_n(struct kfifo *fifo, - void *to, unsigned int len, unsigned int recsize) +unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { - if (kfifo_len(fifo) < len + recsize) - return len; + unsigned int max = (1 << (recsize << 3)) - 1; - __kfifo_out_data(fifo, to, len, recsize); - __kfifo_add_out(fifo, len + recsize); - return 0; + if (len > max) + return max; + return len; } -EXPORT_SYMBOL(__kfifo_out_n); -/** - * kfifo_out - gets some data from the FIFO - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and returns the number of copied bytes. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. +#define __KFIFO_PEEK(data, out, mask) \ + ((data)[(out) & (mask)]) +/* + * __kfifo_peek_n internal helper function for determinate the length of + * the next record in the fifo */ -unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len) +static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { - len = min(kfifo_len(fifo), len); + unsigned int l; + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; - __kfifo_out_data(fifo, to, len, 0); - __kfifo_add_out(fifo, len); + l = __KFIFO_PEEK(data, fifo->out, mask); - return len; + if (--recsize) + l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; + + return l; } -EXPORT_SYMBOL(kfifo_out); -/** - * kfifo_out_peek - copy some data from the FIFO, but do not remove it - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * @offset: offset into the fifo - * - * This function copies at most @len bytes at @offset from the FIFO - * into the @to buffer and returns the number of copied bytes. - * The data is not removed from the FIFO. +#define __KFIFO_POKE(data, in, mask, val) \ + ( \ + (data)[(in) & (mask)] = (unsigned char)(val) \ + ) + +/* + * __kfifo_poke_n internal helper function for storeing the length of + * the record into the fifo */ -unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len, - unsigned offset) +static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { - len = min(kfifo_len(fifo), len + offset); + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; - __kfifo_out_data(fifo, to, len, offset); - return len; + __KFIFO_POKE(data, fifo->in, mask, n); + + if (recsize > 1) + __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } -EXPORT_SYMBOL(kfifo_out_peek); -unsigned int __kfifo_out_generic(struct kfifo *fifo, - void *to, unsigned int len, unsigned int recsize, - unsigned int *total) +unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { - return __kfifo_out_rec(fifo, to, len, recsize, total); + return __kfifo_peek_n(fifo, recsize); } -EXPORT_SYMBOL(__kfifo_out_generic); +EXPORT_SYMBOL(__kfifo_len_r); -unsigned int __kfifo_from_user_n(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, + unsigned int len, size_t recsize) { - unsigned total; + if (len + recsize > kfifo_unused(fifo)) + return 0; - if (kfifo_avail(fifo) < len + recsize) - return len + 1; + __kfifo_poke_n(fifo, len, recsize); - __kfifo_from_user_data(fifo, from, len, recsize, &total); - return total; + kfifo_copy_in(fifo, buf, len, fifo->in + recsize); + fifo->in += len + recsize; + return len; } -EXPORT_SYMBOL(__kfifo_from_user_n); +EXPORT_SYMBOL(__kfifo_in_r); -/** - * kfifo_from_user - puts some data from user space into the FIFO - * @fifo: the fifo to be used. - * @from: pointer to the data to be added. - * @len: the length of the data to be added. - * @total: the actual returned data length. - * - * This function copies at most @len bytes from the @from into the - * FIFO depending and returns -EFAULT/0. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -int kfifo_from_user(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned *total) -{ - int ret; - len = min(kfifo_avail(fifo), len); - ret = __kfifo_from_user_data(fifo, from, len, 0, total); - if (ret) - return ret; - __kfifo_add_in(fifo, len); - return 0; +static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, + void *buf, unsigned int len, size_t recsize, unsigned int *n) +{ + *n = __kfifo_peek_n(fifo, recsize); + + if (len > *n) + len = *n; + + kfifo_copy_out(fifo, buf, len, fifo->out + recsize); + return len; } -EXPORT_SYMBOL(kfifo_from_user); -unsigned int __kfifo_from_user_generic(struct kfifo *fifo, - const void __user *from, unsigned int len, unsigned int recsize) +unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) { - return __kfifo_from_user_rec(fifo, from, len, recsize); + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } -EXPORT_SYMBOL(__kfifo_from_user_generic); +EXPORT_SYMBOL(__kfifo_out_peek_r); -unsigned int __kfifo_to_user_n(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int reclen, - unsigned int recsize) +unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) { - unsigned int ret, total; + unsigned int n; - if (kfifo_len(fifo) < reclen + recsize) - return len; + if (fifo->in == fifo->out) + return 0; - ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total); + len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); + fifo->out += n + recsize; + return len; +} +EXPORT_SYMBOL(__kfifo_out_r); - if (likely(ret == 0)) - __kfifo_add_out(fifo, reclen + recsize); +void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int n; - return total; + n = __kfifo_peek_n(fifo, recsize); + fifo->out += n + recsize; } -EXPORT_SYMBOL(__kfifo_to_user_n); +EXPORT_SYMBOL(__kfifo_skip_r); -/** - * kfifo_to_user - gets data from the FIFO and write it to user space - * @fifo: the fifo to be used. - * @to: where the data must be copied. - * @len: the size of the destination buffer. - * @lenout: pointer to output variable with copied data - * - * This function copies at most @len bytes from the FIFO into the - * @to buffer and 0 or -EFAULT. - * - * Note that with only one concurrent reader and one concurrent - * writer, you don't need extra locking to use these functions. - */ -int kfifo_to_user(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned *lenout) +int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied, size_t recsize) { - int ret; - len = min(kfifo_len(fifo), len); - ret = __kfifo_to_user_data(fifo, to, len, 0, lenout); - __kfifo_add_out(fifo, *lenout); - return ret; + unsigned long ret; + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) { + *copied = 0; + return 0; + } + + __kfifo_poke_n(fifo, len, recsize); + + ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->in += len + recsize; + return 0; } -EXPORT_SYMBOL(kfifo_to_user); +EXPORT_SYMBOL(__kfifo_from_user_r); -unsigned int __kfifo_to_user_generic(struct kfifo *fifo, - void __user *to, unsigned int len, unsigned int recsize, - unsigned int *total) +int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied, size_t recsize) { - return __kfifo_to_user_rec(fifo, to, len, recsize, total); + unsigned long ret; + unsigned int n; + + if (fifo->in == fifo->out) { + *copied = 0; + return 0; + } + + n = __kfifo_peek_n(fifo, recsize); + if (len > n) + len = n; + + ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->out += n + recsize; + return 0; } -EXPORT_SYMBOL(__kfifo_to_user_generic); +EXPORT_SYMBOL(__kfifo_to_user_r); -unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize) +unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - if (recsize == 0) - return kfifo_avail(fifo); + if (!nents) + BUG(); - return __kfifo_peek_n(fifo, recsize); + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); } -EXPORT_SYMBOL(__kfifo_peek_generic); +EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); -void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize) +void __kfifo_dma_in_finish_r(struct __kfifo *fifo, + unsigned int len, size_t recsize) { - __kfifo_skip_rec(fifo, recsize); + len = __kfifo_max_r(len, recsize); + __kfifo_poke_n(fifo, len, recsize); + fifo->in += len + recsize; } -EXPORT_SYMBOL(__kfifo_skip_generic); +EXPORT_SYMBOL(__kfifo_dma_in_finish_r); +unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) +{ + if (!nents) + BUG(); + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > fifo->in - fifo->out) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); +} +EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); + +void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int len; + + len = __kfifo_peek_n(fifo, recsize); + fifo->out += len + recsize; +} +EXPORT_SYMBOL(__kfifo_dma_out_finish_r); diff --git a/kernel/kmod.c b/kernel/kmod.c index 6e9b196..9cd0591 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data) goto fail; } - retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); /* Exec failed? */ fail: diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f..ec4210c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,6 +47,7 @@ #include <linux/memory.h> #include <linux/ftrace.h> #include <linux/cpu.h> +#include <linux/jump_label.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> @@ -399,7 +400,7 @@ static inline int kprobe_optready(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). */ -struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -831,6 +832,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) +__acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -842,6 +844,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) +__acquires(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); @@ -849,6 +852,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +__releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -857,7 +861,9 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, spin_unlock_irqrestore(hlist_lock, *flags); } -void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) +__releases(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); @@ -1141,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr)) { + ftrace_text_reserved(p->addr, p->addr) || + jump_label_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } @@ -1339,18 +1346,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - unsigned long addr; + unsigned long addr, offset; jp = jps[i]; addr = arch_deref_entry_point(jp->entry); - if (!kernel_text_address(addr)) - ret = -EINVAL; - else { - /* Todo: Verify probepoint is a function entry point */ + /* Verify probepoint is a function entry point */ + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && + offset == 0) { jp->kp.pre_handler = setjmp_pre_handler; jp->kp.break_handler = longjmp_break_handler; ret = register_kprobe(&jp->kp); - } + } else + ret = -EINVAL; + if (ret < 0) { if (i > 0) unregister_jprobes(jps, i); diff --git a/kernel/kthread.c b/kernel/kthread.c index 83911c7..2dc3786 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -14,6 +14,8 @@ #include <linux/file.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/freezer.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -35,6 +37,7 @@ struct kthread_create_info struct kthread { int should_stop; + void *data; struct completion exited; }; @@ -54,6 +57,19 @@ int kthread_should_stop(void) } EXPORT_SYMBOL(kthread_should_stop); +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. + */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + static int kthread(void *_create) { /* Copy data: it's on kthread's stack */ @@ -64,6 +80,7 @@ static int kthread(void *_create) int ret; self.should_stop = 0; + self.data = data; init_completion(&self.exited); current->vfork_done = &self.exited; @@ -247,3 +264,150 @@ int kthreadd(void *unused) return 0; } + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * the it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works. + */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @task for async execution. @task + * must have been created with kthread_worker_create(). Returns %true + * if @work was successfully queued, %false if it was already pending. + */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished. + */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5428679..42ba65d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } #endif + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: @@ -774,7 +784,9 @@ out_unlock_set: raw_local_irq_restore(flags); if (!subclass || force) - lock->class_cache = class; + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - lock->class_cache = NULL; + int i; + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif @@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - if (lock->key == &__lockdep_no_validate__) check = 1; - if (!subclass) - class = lock->class_cache; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; /* - * Not cached yet or subclass? + * Not cached? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); @@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache; + struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; @@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); * Careful: only use this function if you are sure that * the task cannot run in parallel! */ -void __debug_show_held_locks(struct task_struct *task) +void debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) diff --git a/kernel/module.c b/kernel/module.c index 8c6b428..2df4630 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,6 +1,6 @@ /* Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. + Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -55,6 +55,7 @@ #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> +#include <linux/jump_label.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -110,6 +111,20 @@ int unregister_module_notifier(struct notifier_block * nb) } EXPORT_SYMBOL(unregister_module_notifier); +struct load_info { + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long *strmap; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + /* We require a truly strong try_module_get(): 0 means failure due to ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) @@ -140,42 +155,38 @@ void __module_put_and_exit(struct module *mod, long code) EXPORT_SYMBOL(__module_put_and_exit); /* Find a module section: 0 means not found. */ -static unsigned int find_sec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, - const char *name) +static unsigned int find_sec(const struct load_info *info, const char *name) { unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; /* Alloc bit cleared means "ignore it." */ - if ((sechdrs[i].sh_flags & SHF_ALLOC) - && strcmp(secstrings+sechdrs[i].sh_name, name) == 0) + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) return i; + } return 0; } /* Find a module section, or NULL. */ -static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs, - const char *secstrings, const char *name) +static void *section_addr(const struct load_info *info, const char *name) { /* Section 0 has sh_addr 0. */ - return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr; + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; } /* Find a module section, or NULL. Fill in number of "objects" in section. */ -static void *section_objs(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings, +static void *section_objs(const struct load_info *info, const char *name, size_t object_size, unsigned int *num) { - unsigned int sec = find_sec(hdr, sechdrs, secstrings, name); + unsigned int sec = find_sec(info, name); /* Section 0 has sh_addr 0 and sh_size 0. */ - *num = sechdrs[sec].sh_size / object_size; - return (void *)sechdrs[sec].sh_addr; + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; } /* Provided by the linker */ @@ -227,7 +238,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner, unsigned int symnum, void *data), void *data) { struct module *mod; - const struct symsearch arr[] = { + static const struct symsearch arr[] = { { __start___ksymtab, __stop___ksymtab, __start___kcrctab, NOT_GPL_ONLY, false }, { __start___ksymtab_gpl, __stop___ksymtab_gpl, @@ -392,7 +403,8 @@ static int percpu_modalloc(struct module *mod, mod->percpu = __alloc_reserved_percpu(size, align); if (!mod->percpu) { printk(KERN_WARNING - "Could not allocate %lu bytes percpu data\n", size); + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, size); return -ENOMEM; } mod->percpu_size = size; @@ -404,11 +416,9 @@ static void percpu_modfree(struct module *mod) free_percpu(mod->percpu); } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { - return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); + return find_sec(info, ".data..percpu"); } static void percpu_modcopy(struct module *mod, @@ -468,9 +478,7 @@ static inline int percpu_modalloc(struct module *mod, static inline void percpu_modfree(struct module *mod) { } -static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static unsigned int find_pcpusec(struct load_info *info) { return 0; } @@ -524,21 +532,21 @@ static char last_unloaded_module[MODULE_NAME_LEN+1]; EXPORT_TRACEPOINT_SYMBOL(module_get); /* Init the unload section of the module. */ -static void module_unload_init(struct module *mod) +static int module_unload_init(struct module *mod) { - int cpu; + mod->refptr = alloc_percpu(struct module_ref); + if (!mod->refptr) + return -ENOMEM; INIT_LIST_HEAD(&mod->source_list); INIT_LIST_HEAD(&mod->target_list); - for_each_possible_cpu(cpu) { - per_cpu_ptr(mod->refptr, cpu)->incs = 0; - per_cpu_ptr(mod->refptr, cpu)->decs = 0; - } /* Hold reference count during initialization. */ __this_cpu_write(mod->refptr->incs, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; + + return 0; } /* Does a already use b? */ @@ -618,6 +626,8 @@ static void module_unload_free(struct module *mod) kfree(use); } mutex_unlock(&module_mutex); + + free_percpu(mod->refptr); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -787,7 +797,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); free_module(mod); return 0; @@ -892,8 +901,9 @@ int ref_module(struct module *a, struct module *b) } EXPORT_SYMBOL_GPL(ref_module); -static inline void module_unload_init(struct module *mod) +static inline int module_unload_init(struct module *mod) { + return 0; } #endif /* CONFIG_MODULE_UNLOAD */ @@ -1052,10 +1062,9 @@ static inline int same_magic(const char *amagic, const char *bmagic, #endif /* CONFIG_MODVERSIONS */ /* Resolve a symbol for this module. I.e. if we find one, record usage. */ -static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, - unsigned int versindex, +static const struct kernel_symbol *resolve_symbol(struct module *mod, + const struct load_info *info, const char *name, - struct module *mod, char ownername[]) { struct module *owner; @@ -1069,7 +1078,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, if (!sym) goto unlock; - if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + if (!check_version(info->sechdrs, info->index.vers, name, mod, crc, + owner)) { sym = ERR_PTR(-EINVAL); goto getname; } @@ -1088,21 +1098,20 @@ unlock: return sym; } -static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, - unsigned int versindex, - const char *name, - struct module *mod) +static const struct kernel_symbol * +resolve_symbol_wait(struct module *mod, + const struct load_info *info, + const char *name) { const struct kernel_symbol *ksym; - char ownername[MODULE_NAME_LEN]; + char owner[MODULE_NAME_LEN]; if (wait_event_interruptible_timeout(module_wq, - !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, - mod, ownername)) || - PTR_ERR(ksym) != -EBUSY, + !IS_ERR(ksym = resolve_symbol(mod, info, name, owner)) + || PTR_ERR(ksym) != -EBUSY, 30 * HZ) <= 0) { printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", - mod->name, ownername); + mod->name, owner); } return ksym; } @@ -1111,8 +1120,9 @@ static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> */ -#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) +#ifdef CONFIG_SYSFS +#ifdef CONFIG_KALLSYMS static inline bool sect_empty(const Elf_Shdr *sect) { return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0; @@ -1149,8 +1159,7 @@ static void free_sect_attrs(struct module_sect_attrs *sect_attrs) kfree(sect_attrs); } -static void add_sect_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_sect_attrs(struct module *mod, const struct load_info *info) { unsigned int nloaded = 0, i, size[2]; struct module_sect_attrs *sect_attrs; @@ -1158,8 +1167,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, struct attribute **gattr; /* Count loaded sections and allocate structures */ - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i])) nloaded++; size[0] = ALIGN(sizeof(*sect_attrs) + nloaded * sizeof(sect_attrs->attrs[0]), @@ -1176,11 +1185,12 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, sect_attrs->nsections = 0; sattr = §_attrs->attrs[0]; gattr = §_attrs->grp.attrs[0]; - for (i = 0; i < nsect; i++) { - if (sect_empty(&sechdrs[i])) + for (i = 0; i < info->hdr->e_shnum; i++) { + Elf_Shdr *sec = &info->sechdrs[i]; + if (sect_empty(sec)) continue; - sattr->address = sechdrs[i].sh_addr; - sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, + sattr->address = sec->sh_addr; + sattr->name = kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL); if (sattr->name == NULL) goto out; @@ -1248,8 +1258,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs, kfree(notes_attrs); } -static void add_notes_attrs(struct module *mod, unsigned int nsect, - char *secstrings, Elf_Shdr *sechdrs) +static void add_notes_attrs(struct module *mod, const struct load_info *info) { unsigned int notes, loaded, i; struct module_notes_attrs *notes_attrs; @@ -1261,9 +1270,9 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, /* Count notes sections and allocate structures. */ notes = 0; - for (i = 0; i < nsect; i++) - if (!sect_empty(&sechdrs[i]) && - (sechdrs[i].sh_type == SHT_NOTE)) + for (i = 0; i < info->hdr->e_shnum; i++) + if (!sect_empty(&info->sechdrs[i]) && + (info->sechdrs[i].sh_type == SHT_NOTE)) ++notes; if (notes == 0) @@ -1277,15 +1286,15 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, notes_attrs->notes = notes; nattr = ¬es_attrs->attrs[0]; - for (loaded = i = 0; i < nsect; ++i) { - if (sect_empty(&sechdrs[i])) + for (loaded = i = 0; i < info->hdr->e_shnum; ++i) { + if (sect_empty(&info->sechdrs[i])) continue; - if (sechdrs[i].sh_type == SHT_NOTE) { + if (info->sechdrs[i].sh_type == SHT_NOTE) { sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; - nattr->size = sechdrs[i].sh_size; - nattr->private = (void *) sechdrs[i].sh_addr; + nattr->size = info->sechdrs[i].sh_size; + nattr->private = (void *) info->sechdrs[i].sh_addr; nattr->read = module_notes_read; ++nattr; } @@ -1316,8 +1325,8 @@ static void remove_notes_attrs(struct module *mod) #else -static inline void add_sect_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_sect_attrs(struct module *mod, + const struct load_info *info) { } @@ -1325,17 +1334,16 @@ static inline void remove_sect_attrs(struct module *mod) { } -static inline void add_notes_attrs(struct module *mod, unsigned int nsect, - char *sectstrings, Elf_Shdr *sechdrs) +static inline void add_notes_attrs(struct module *mod, + const struct load_info *info) { } static inline void remove_notes_attrs(struct module *mod) { } -#endif +#endif /* CONFIG_KALLSYMS */ -#ifdef CONFIG_SYSFS static void add_usage_links(struct module *mod) { #ifdef CONFIG_MODULE_UNLOAD @@ -1440,6 +1448,7 @@ out: } static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { @@ -1464,6 +1473,8 @@ static int mod_sysfs_setup(struct module *mod, goto out_unreg_param; add_usage_links(mod); + add_sect_attrs(mod, info); + add_notes_attrs(mod, info); kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1480,33 +1491,26 @@ out: static void mod_sysfs_fini(struct module *mod) { + remove_notes_attrs(mod); + remove_sect_attrs(mod); kobject_put(&mod->mkobj.kobj); } -#else /* CONFIG_SYSFS */ +#else /* !CONFIG_SYSFS */ -static inline int mod_sysfs_init(struct module *mod) -{ - return 0; -} - -static inline int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, + const struct load_info *info, struct kernel_param *kparam, unsigned int num_params) { return 0; } -static inline int module_add_modinfo_attrs(struct module *mod) -{ - return 0; -} - -static inline void module_remove_modinfo_attrs(struct module *mod) +static void mod_sysfs_fini(struct module *mod) { } -static void mod_sysfs_fini(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { } @@ -1516,7 +1520,7 @@ static void del_usage_links(struct module *mod) #endif /* CONFIG_SYSFS */ -static void mod_kobject_remove(struct module *mod) +static void mod_sysfs_teardown(struct module *mod) { del_usage_links(mod); module_remove_modinfo_attrs(mod); @@ -1534,6 +1538,7 @@ static int __unlink_module(void *_mod) { struct module *mod = _mod; list_del(&mod->list); + module_bug_cleanup(mod); return 0; } @@ -1546,9 +1551,10 @@ static void free_module(struct module *mod) mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); mutex_unlock(&module_mutex); - remove_notes_attrs(mod); - remove_sect_attrs(mod); - mod_kobject_remove(mod); + mod_sysfs_teardown(mod); + + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1563,10 +1569,7 @@ static void free_module(struct module *mod) module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); -#if defined(CONFIG_MODULE_UNLOAD) - if (mod->refptr) - free_percpu(mod->refptr); -#endif + /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -1632,25 +1635,23 @@ static int verify_export_symbols(struct module *mod) } /* Change all symbols so that st_value encodes the pointer directly. */ -static int simplify_symbols(Elf_Shdr *sechdrs, - unsigned int symindex, - const char *strtab, - unsigned int versindex, - unsigned int pcpuindex, - struct module *mod) -{ - Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr; +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; unsigned long secbase; - unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); + unsigned int i; int ret = 0; const struct kernel_symbol *ksym; - for (i = 1; i < n; i++) { + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + switch (sym[i].st_shndx) { case SHN_COMMON: /* We compiled with -fno-common. These are not supposed to happen. */ - DEBUGP("Common symbol: %s\n", strtab + sym[i].st_name); + DEBUGP("Common symbol: %s\n", name); printk("%s: please compile with -fno-common\n", mod->name); ret = -ENOEXEC; @@ -1663,9 +1664,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol_wait(sechdrs, versindex, - strtab + sym[i].st_name, - mod); + ksym = resolve_symbol_wait(mod, info, name); /* Ok if resolved. */ if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; @@ -1677,17 +1676,16 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", - mod->name, strtab + sym[i].st_name, - PTR_ERR(ksym)); + mod->name, name, PTR_ERR(ksym)); ret = PTR_ERR(ksym) ?: -ENOENT; break; default: /* Divert to percpu allocation if a percpu var. */ - if (sym[i].st_shndx == pcpuindex) + if (sym[i].st_shndx == info->index.pcpu) secbase = (unsigned long)mod_percpu(mod); else - secbase = sechdrs[sym[i].st_shndx].sh_addr; + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; sym[i].st_value += secbase; break; } @@ -1696,6 +1694,35 @@ static int simplify_symbols(Elf_Shdr *sechdrs, return ret; } +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + /* Additional bytes needed by arch in front of individual sections */ unsigned int __weak arch_mod_section_prepend(struct module *mod, unsigned int section) @@ -1720,10 +1747,7 @@ static long get_offset(struct module *mod, unsigned int *size, might -- code, read-only data, read-write data, small data. Tally sizes, and place the offsets into sh_entsize fields: high bit means it belongs in init. */ -static void layout_sections(struct module *mod, - const Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) +static void layout_sections(struct module *mod, struct load_info *info) { static unsigned long const masks[][2] = { /* NOTE: all executable code must be the first section @@ -1736,21 +1760,22 @@ static void layout_sections(struct module *mod, }; unsigned int m, i; - for (i = 0; i < hdr->e_shnum; i++) - sechdrs[i].sh_entsize = ~0UL; + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; DEBUGP("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || strstarts(secstrings + s->sh_name, ".init")) + || strstarts(sname, ".init")) continue; s->sh_entsize = get_offset(mod, &mod->core_size, s, i); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", name); } if (m == 0) mod->core_text_size = mod->core_size; @@ -1758,17 +1783,18 @@ static void layout_sections(struct module *mod, DEBUGP("Init section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { - for (i = 0; i < hdr->e_shnum; ++i) { - Elf_Shdr *s = &sechdrs[i]; + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !strstarts(secstrings + s->sh_name, ".init")) + || !strstarts(sname, ".init")) continue; s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) | INIT_OFFSET_MASK); - DEBUGP("\t%s\n", secstrings + s->sh_name); + DEBUGP("\t%s\n", sname); } if (m == 0) mod->init_text_size = mod->init_size; @@ -1807,33 +1833,28 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(Elf_Shdr *sechdrs, - unsigned int info, - const char *tag) +static char *get_modinfo(struct load_info *info, const char *tag) { char *p; unsigned int taglen = strlen(tag); - unsigned long size = sechdrs[info].sh_size; + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; - for (p = (char *)sechdrs[info].sh_addr; p; p = next_string(p, &size)) { + for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; } return NULL; } -static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, - unsigned int infoindex) +static void setup_modinfo(struct module *mod, struct load_info *info) { struct module_attribute *attr; int i; for (i = 0; (attr = modinfo_attrs[i]); i++) { if (attr->setup) - attr->setup(mod, - get_modinfo(sechdrs, - infoindex, - attr->attr.name)); + attr->setup(mod, get_modinfo(info, attr->attr.name)); } } @@ -1874,11 +1895,10 @@ static int is_exported(const char *name, unsigned long value, } /* As per nm */ -static char elf_type(const Elf_Sym *sym, - Elf_Shdr *sechdrs, - const char *secstrings, - struct module *mod) +static char elf_type(const Elf_Sym *sym, const struct load_info *info) { + const Elf_Shdr *sechdrs = info->sechdrs; + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) return 'v'; @@ -1908,8 +1928,10 @@ static char elf_type(const Elf_Sym *sym, else return 'b'; } - if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug")) + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { return 'n'; + } return '?'; } @@ -1934,127 +1956,96 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, return true; } -static unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static void layout_symtab(struct module *mod, struct load_info *info) { - unsigned long symoffs; - Elf_Shdr *symsect = sechdrs + symindex; - Elf_Shdr *strsect = sechdrs + strindex; + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; const Elf_Sym *src; - const char *strtab; unsigned int i, nsrc, ndst; /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, - symindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + symsect->sh_name); + info->index.sym) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); - src = (void *)hdr + symsect->sh_offset; + src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); - strtab = (void *)hdr + strsect->sh_offset; for (ndst = i = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { unsigned int j = src->st_name; - while(!__test_and_set_bit(j, strmap) && strtab[j]) + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) ++j; ++ndst; } /* Append room for core symbols at end of core part. */ - symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, - strindex) | INIT_OFFSET_MASK; - DEBUGP("\t%s\n", secstrings + strsect->sh_name); + info->index.str) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); /* Append room for core symbols' strings at end of core part. */ - *pstroffs = mod->core_size; - __set_bit(0, strmap); - mod->core_size += bitmap_weight(strmap, strsect->sh_size); - - return symoffs; + info->stroffs = mod->core_size; + __set_bit(0, info->strmap); + mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); } -static void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - unsigned long *strmap) +static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; const Elf_Sym *src; Elf_Sym *dst; char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)sechdrs[symindex].sh_addr; - mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); - mod->strtab = (void *)sechdrs[strindex].sh_addr; + mod->symtab = (void *)symsec->sh_addr; + mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info - = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + symoffs; + mod->core_symtab = dst = mod->module_core + info->symoffs; src = mod->symtab; *dst = *src; for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, sechdrs, shnum)) + if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) continue; dst[ndst] = *src; - dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + dst[ndst].st_name = bitmap_weight(info->strmap, + dst[ndst].st_name); ++ndst; } mod->core_num_syms = ndst; - mod->core_strtab = s = mod->module_core + stroffs; - for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) - if (test_bit(i, strmap)) + mod->core_strtab = s = mod->module_core + info->stroffs; + for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) + if (test_bit(i, info->strmap)) *++s = mod->strtab[i]; } #else -static inline unsigned long layout_symtab(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int symindex, - unsigned int strindex, - const Elf_Ehdr *hdr, - const char *secstrings, - unsigned long *pstroffs, - unsigned long *strmap) +static inline void layout_symtab(struct module *mod, struct load_info *info) { - return 0; } -static inline void add_kallsyms(struct module *mod, - Elf_Shdr *sechdrs, - unsigned int shnum, - unsigned int symindex, - unsigned int strindex, - unsigned long symoffs, - unsigned long stroffs, - const char *secstrings, - const unsigned long *strmap) +static void add_kallsyms(struct module *mod, struct load_info *info) { } #endif /* CONFIG_KALLSYMS */ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) { + if (!debug) + return; #ifdef CONFIG_DYNAMIC_DEBUG if (ddebug_add_module(debug, num, debug->modname)) printk(KERN_ERR "dynamic debug error adding module: %s\n", @@ -2062,6 +2053,12 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -2079,63 +2076,47 @@ static void *module_alloc_update_bounds(unsigned long size) } #ifdef CONFIG_DEBUG_KMEMLEAK -static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { unsigned int i; /* only scan the sections containing data */ kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); - for (i = 1; i < hdr->e_shnum; i++) { - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + for (i = 1; i < info->hdr->e_shnum; i++) { + const char *name = info->secstrings + info->sechdrs[i].sh_name; + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) continue; - if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0 - && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) + if (!strstarts(name, ".data") && !strstarts(name, ".bss")) continue; - kmemleak_scan_area((void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size, GFP_KERNEL); + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); } } #else -static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, char *secstrings) +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) { } #endif -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. */ -static noinline struct module *load_module(void __user *umod, - unsigned long len, - const char __user *uargs) +/* Sets info->hdr and info->len. */ +static int copy_and_check(struct load_info *info, + const void __user *umod, unsigned long len, + const char __user *uargs) { + int err; Elf_Ehdr *hdr; - Elf_Shdr *sechdrs; - char *secstrings, *args, *modmagic, *strtab = NULL; - char *staging; - unsigned int i; - unsigned int symindex = 0; - unsigned int strindex = 0; - unsigned int modindex, versindex, infoindex, pcpuindex; - struct module *mod; - long err = 0; - void *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long symoffs, stroffs, *strmap; - void __percpu *percpu; - - mm_segment_t old_fs; - DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", - umod, len, uargs); if (len < sizeof(*hdr)) - return ERR_PTR(-ENOEXEC); + return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ /* vmalloc barfs on "unusual" numbers. Check here */ if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; @@ -2143,135 +2124,230 @@ static noinline struct module *load_module(void __user *umod, } /* Sanity checks against insmoding binaries or wrong arch, - weird elf version */ + weird elf version */ if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 || hdr->e_type != ET_REL || !elf_check_arch(hdr) - || hdr->e_shentsize != sizeof(*sechdrs)) { + || hdr->e_shentsize != sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } + + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { err = -ENOEXEC; goto free_hdr; } - if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) - goto truncated; + info->hdr = hdr; + info->len = len; + return 0; + +free_hdr: + vfree(hdr); + return err; +} + +static void free_copy(struct load_info *info) +{ + vfree(info->hdr); +} - /* Convenience variables */ - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - sechdrs[0].sh_addr = 0; +static int rewrite_section_headers(struct load_info *info) +{ + unsigned int i; - for (i = 1; i < hdr->e_shnum; i++) { - if (sechdrs[i].sh_type != SHT_NOBITS - && len < sechdrs[i].sh_offset + sechdrs[i].sh_size) - goto truncated; + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (shdr->sh_type != SHT_NOBITS + && info->len < shdr->sh_offset + shdr->sh_size) { + printk(KERN_ERR "Module len %lu truncated\n", + info->len); + return -ENOEXEC; + } /* Mark all sections sh_addr with their address in the temporary image. */ - sechdrs[i].sh_addr = (size_t)hdr + sechdrs[i].sh_offset; + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; - /* Internal symbols and strings. */ - if (sechdrs[i].sh_type == SHT_SYMTAB) { - symindex = i; - strindex = sechdrs[i].sh_link; - strtab = (char *)hdr + sechdrs[strindex].sh_offset; - } #ifndef CONFIG_MODULE_UNLOAD /* Don't load .exit sections */ - if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; + if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; #endif } - modindex = find_sec(hdr, sechdrs, secstrings, - ".gnu.linkonce.this_module"); - if (!modindex) { + /* Track but don't keep modinfo and version sections. */ + info->index.vers = find_sec(info, "__versions"); + info->index.info = find_sec(info, ".modinfo"); + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. + * + * Return the temporary module pointer (we'll replace it with the final + * one when we move the module sections around). + */ +static struct module *setup_load_info(struct load_info *info) +{ + unsigned int i; + int err; + struct module *mod; + + /* Set up the convenience variables */ + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + info->secstrings = (void *)info->hdr + + info->sechdrs[info->hdr->e_shstrndx].sh_offset; + + err = rewrite_section_headers(info); + if (err) + return ERR_PTR(err); + + /* Find internal symbols and strings. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { printk(KERN_WARNING "No module found in object\n"); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } /* This is temporary: point mod into copy of data. */ - mod = (void *)sechdrs[modindex].sh_addr; + mod = (void *)info->sechdrs[info->index.mod].sh_addr; - if (symindex == 0) { + if (info->index.sym == 0) { printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", mod->name); - err = -ENOEXEC; - goto free_hdr; + return ERR_PTR(-ENOEXEC); } - versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); - infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); - pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); - - /* Don't keep modinfo and version sections. */ - sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->index.pcpu = find_pcpusec(info); /* Check module struct version now, before we try to use module. */ - if (!check_modstruct_version(sechdrs, versindex, mod)) { - err = -ENOEXEC; - goto free_hdr; - } + if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + return ERR_PTR(-ENOEXEC); + + return mod; +} + +static int check_modinfo(struct module *mod, struct load_info *info) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; - modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { err = try_to_force_load(mod, "bad vermagic"); if (err) - goto free_hdr; - } else if (!same_magic(modmagic, vermagic, versindex)) { + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", mod->name, modmagic, vermagic); - err = -ENOEXEC; - goto free_hdr; + return -ENOEXEC; } - staging = get_modinfo(sechdrs, infoindex, "staging"); - if (staging) { + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP); printk(KERN_WARNING "%s: module is from the staging directory," " the quality is unknown, you have been warned.\n", mod->name); } - /* Now copy in args */ - args = strndup_user(uargs, ~0UL >> 1); - if (IS_ERR(args)) { - err = PTR_ERR(args); - goto free_hdr; - } + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); - strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) - * sizeof(long), GFP_KERNEL); - if (!strmap) { - err = -ENOMEM; - goto free_mod; - } + return 0; +} - mod->state = MODULE_STATE_COMING; +static void find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(info, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); - /* Allow arches to frob section contents and sizes. */ - err = module_frob_arch_sections(hdr, sechdrs, secstrings, mod); - if (err < 0) - goto free_mod; +#ifdef CONFIG_UNUSED_SYMBOLS + mod->unused_syms = section_objs(info, "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(info, "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); +#endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif - if (pcpuindex) { - /* We have a special allocation for this section. */ - err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); - if (err) - goto free_mod; - sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; - } - /* Keep this around for failure path. */ - percpu = mod_percpu(mod); +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints = section_objs(info, "__tracepoints", + sizeof(*mod->tracepoints), + &mod->num_tracepoints); +#endif +#ifdef HAVE_JUMP_LABEL + mod->jump_entries = section_objs(info, "__jump_table", + sizeof(*mod->jump_entries), + &mod->num_jump_entries); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ - layout_sections(mod, hdr, sechdrs, secstrings); - symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, - secstrings, &stroffs, strmap); + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + + if (section_addr(info, "__obsparm")) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + info->debug = section_objs(info, "__verbose", + sizeof(*info->debug), &info->num_debug); +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2281,10 +2357,9 @@ static noinline struct module *load_module(void __user *umod, * leak. */ kmemleak_not_leak(ptr); - if (!ptr) { - err = -ENOMEM; - goto free_percpu; - } + if (!ptr) + return -ENOMEM; + memset(ptr, 0, mod->core_size); mod->module_core = ptr; @@ -2297,50 +2372,40 @@ static noinline struct module *load_module(void __user *umod, */ kmemleak_ignore(ptr); if (!ptr && mod->init_size) { - err = -ENOMEM; - goto free_core; + module_free(mod, mod->module_core); + return -ENOMEM; } memset(ptr, 0, mod->init_size); mod->module_init = ptr; /* Transfer each section which specifies SHF_ALLOC */ DEBUGP("final section addresses:\n"); - for (i = 0; i < hdr->e_shnum; i++) { + for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; - if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + if (!(shdr->sh_flags & SHF_ALLOC)) continue; - if (sechdrs[i].sh_entsize & INIT_OFFSET_MASK) + if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->module_init - + (sechdrs[i].sh_entsize & ~INIT_OFFSET_MASK); + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + sechdrs[i].sh_entsize; + dest = mod->module_core + shdr->sh_entsize; - if (sechdrs[i].sh_type != SHT_NOBITS) - memcpy(dest, (void *)sechdrs[i].sh_addr, - sechdrs[i].sh_size); + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); /* Update sh_addr to point to copy in image. */ - sechdrs[i].sh_addr = (unsigned long)dest; - DEBUGP("\t0x%lx %s\n", sechdrs[i].sh_addr, secstrings + sechdrs[i].sh_name); - } - /* Module has been moved. */ - mod = (void *)sechdrs[modindex].sh_addr; - kmemleak_load_module(mod, hdr, sechdrs, secstrings); - -#if defined(CONFIG_MODULE_UNLOAD) - mod->refptr = alloc_percpu(struct module_ref); - if (!mod->refptr) { - err = -ENOMEM; - goto free_init; + shdr->sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", + shdr->sh_addr, info->secstrings + shdr->sh_name); } -#endif - /* Now we've moved module, initialize linked lists, etc. */ - module_unload_init(mod); - /* Set up license info based on the info section */ - set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + return 0; +} +static int check_module_license_and_versions(struct module *mod) +{ /* * ndiswrapper is under GPL by itself, but loads proprietary modules. * Don't use add_taint_module(), as it would prevent ndiswrapper from @@ -2353,77 +2418,6 @@ static noinline struct module *load_module(void __user *umod, if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); - /* Set up MODINFO_ATTR fields */ - setup_modinfo(mod, sechdrs, infoindex); - - /* Fix up syms, so that st_value is a pointer to location. */ - err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, - mod); - if (err < 0) - goto cleanup; - - /* Now we've got everything in the final locations, we can - * find optional sections. */ - mod->kp = section_objs(hdr, sechdrs, secstrings, "__param", - sizeof(*mod->kp), &mod->num_kp); - mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", - sizeof(*mod->syms), &mod->num_syms); - mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); - mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl", - sizeof(*mod->gpl_syms), - &mod->num_gpl_syms); - mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl"); - mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_gpl_future", - sizeof(*mod->gpl_future_syms), - &mod->num_gpl_future_syms); - mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_gpl_future"); - -#ifdef CONFIG_UNUSED_SYMBOLS - mod->unused_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused", - sizeof(*mod->unused_syms), - &mod->num_unused_syms); - mod->unused_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused"); - mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings, - "__ksymtab_unused_gpl", - sizeof(*mod->unused_gpl_syms), - &mod->num_unused_gpl_syms); - mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings, - "__kcrctab_unused_gpl"); -#endif -#ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(hdr, sechdrs, secstrings, ".ctors", - sizeof(*mod->ctors), &mod->num_ctors); -#endif - -#ifdef CONFIG_TRACEPOINTS - mod->tracepoints = section_objs(hdr, sechdrs, secstrings, - "__tracepoints", - sizeof(*mod->tracepoints), - &mod->num_tracepoints); -#endif -#ifdef CONFIG_EVENT_TRACING - mod->trace_events = section_objs(hdr, sechdrs, secstrings, - "_ftrace_events", - sizeof(*mod->trace_events), - &mod->num_trace_events); - /* - * This section contains pointers to allocated objects in the trace - * code and not scanning it leads to false positives. - */ - kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * - mod->num_trace_events, GFP_KERNEL); -#endif -#ifdef CONFIG_FTRACE_MCOUNT_RECORD - /* sechdrs[0].sh_size is always zero */ - mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, - "__mcount_loc", - sizeof(*mod->ftrace_callsites), - &mod->num_ftrace_callsites); -#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2433,62 +2427,16 @@ static noinline struct module *load_module(void __user *umod, || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) #endif ) { - err = try_to_force_load(mod, - "no versions for exported symbols"); - if (err) - goto cleanup; + return try_to_force_load(mod, + "no versions for exported symbols"); } #endif + return 0; +} - /* Now do relocations. */ - for (i = 1; i < hdr->e_shnum; i++) { - const char *strtab = (char *)sechdrs[strindex].sh_addr; - unsigned int info = sechdrs[i].sh_info; - - /* Not a valid relocation section? */ - if (info >= hdr->e_shnum) - continue; - - /* Don't bother with non-allocated sections */ - if (!(sechdrs[info].sh_flags & SHF_ALLOC)) - continue; - - if (sechdrs[i].sh_type == SHT_REL) - err = apply_relocate(sechdrs, strtab, symindex, i,mod); - else if (sechdrs[i].sh_type == SHT_RELA) - err = apply_relocate_add(sechdrs, strtab, symindex, i, - mod); - if (err < 0) - goto cleanup; - } - - /* Set up and sort exception table */ - mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", - sizeof(*mod->extable), &mod->num_exentries); - sort_extable(mod->extable, mod->extable + mod->num_exentries); - - /* Finally, copy percpu area over. */ - percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr, - sechdrs[pcpuindex].sh_size); - - add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, - symoffs, stroffs, secstrings, strmap); - kfree(strmap); - strmap = NULL; - - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - - debug = section_objs(hdr, sechdrs, secstrings, "__verbose", - sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } - - err = module_finalize(hdr, sechdrs, mod); - if (err < 0) - goto cleanup; +static void flush_module_icache(const struct module *mod) +{ + mm_segment_t old_fs; /* flush the icache in correct context */ old_fs = get_fs(); @@ -2507,11 +2455,160 @@ static noinline struct module *load_module(void __user *umod, (unsigned long)mod->module_core + mod->core_size); set_fs(old_fs); +} - mod->args = args; - if (section_addr(hdr, sechdrs, secstrings, "__obsparm")) - printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", - mod->name); +static struct module *layout_and_allocate(struct load_info *info) +{ + /* Module within temporary copy. */ + struct module *mod; + Elf_Shdr *pcpusec; + int err; + + mod = setup_load_info(info); + if (IS_ERR(mod)) + return mod; + + err = check_modinfo(mod, info); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, mod); + if (err < 0) + goto out; + + pcpusec = &info->sechdrs[info->index.pcpu]; + if (pcpusec->sh_size) { + /* We have a special allocation for this section. */ + err = percpu_modalloc(mod, + pcpusec->sh_size, pcpusec->sh_addralign); + if (err) + goto out; + pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, info); + + info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) + * sizeof(long), GFP_KERNEL); + if (!info->strmap) { + err = -ENOMEM; + goto free_percpu; + } + layout_symtab(mod, info); + + /* Allocate and move to the final place */ + err = move_module(mod, info); + if (err) + goto free_strmap; + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; + +free_strmap: + kfree(info->strmap); +free_percpu: + percpu_modfree(mod); +out: + return ERR_PTR(err); +} + +/* mod is no longer valid after this! */ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + kfree(info->strmap); + percpu_modfree(mod); + module_free(mod, mod->module_init); + module_free(mod, mod->module_core); +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. */ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct load_info info = { NULL, }; + struct module *mod; + long err; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + /* Copy in the blobs from userspace, check they are vaguely sane. */ + err = copy_and_check(&info, umod, len, uargs); + if (err) + return ERR_PTR(err); + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(&info); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto free_module; + + /* Now we've got everything in the final locations, we can + * find optional sections. */ + find_module_sections(mod, &info); + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, &info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, &info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, &info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, &info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + /* Mark state as coming so strong_try_module_get() ignores us. */ + mod->state = MODULE_STATE_COMING; /* Now sew it into the lists so we can get lockdep and oops * info during argument parsing. Noone should access us, since @@ -2526,65 +2623,61 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } + /* This has to be done once we're sure module name is unique. */ + if (!mod->taints) + dynamic_debug_setup(info.debug, info.num_debug); + /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) - goto unlock; + goto ddebug; + module_bug_finalize(info.hdr, info.sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); + /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) goto unlink; - err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); + /* Link in to syfs. */ + err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); if (err < 0) goto unlink; - add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); - - /* Get rid of temporary copy */ - vfree(hdr); - - trace_module_load(mod); + /* Get rid of temporary copy and strmap. */ + kfree(info.strmap); + free_copy(&info); /* Done! */ + trace_module_load(mod); return mod; unlink: mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + module_bug_cleanup(mod); + + ddebug: + if (!mod->taints) + dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); + kfree(mod->args); + free_arch_cleanup: module_arch_cleanup(mod); - cleanup: + free_modinfo: free_modinfo(mod); + free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) - free_percpu(mod->refptr); - free_init: -#endif - module_free(mod, mod->module_init); - free_core: - module_free(mod, mod->module_core); - /* mod will be freed with core. Don't access it beyond this line! */ - free_percpu: - free_percpu(percpu); - free_mod: - kfree(args); - kfree(strmap); - free_hdr: - vfree(hdr); + free_module: + module_deallocate(mod, &info); + free_copy: + free_copy(&info); return ERR_PTR(err); - - truncated: - printk(KERN_ERR "Module len %lu truncated\n", len); - err = -ENOEXEC; - goto free_hdr; } /* Call module constructors. */ diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3..200407c 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -36,15 +36,6 @@ # include <asm/mutex.h> #endif -/*** - * mutex_init - initialize the mutex - * @lock: the mutex to be initialized - * @key: the lock_class_key for the class; used by mutex lock debugging - * - * Initialize the mutex to unlocked state. - * - * It is not allowed to initialize an already locked mutex. - */ void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired * @@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_unlock - release the mutex * @lock: the mutex to be released * @@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); -/*** - * mutex_lock_interruptible - acquire the mutex, interruptable +/** + * mutex_lock_interruptible - acquire the mutex, interruptible * @lock: the mutex to be acquired * * Lock the mutex like mutex_lock(), and return 0 if the mutex has @@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) return prev == 1; } -/*** - * mutex_trylock - try acquire the mutex, without waiting +/** + * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired * * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * it is negated from the down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The diff --git a/kernel/padata.c b/kernel/padata.c index fdd8ae6..7510194 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -26,18 +26,19 @@ #include <linux/mutex.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/rcupdate.h> -#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) { int cpu, target_cpu; - target_cpu = cpumask_first(pd->cpumask); + target_cpu = cpumask_first(pd->cpumask.pcpu); for (cpu = 0; cpu < cpu_index; cpu++) - target_cpu = cpumask_next(target_cpu, pd->cpumask); + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); return target_cpu; } @@ -53,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata) * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); return padata_index_to_cpu(pd, cpu_index); } -static void padata_parallel_worker(struct work_struct *work) +static void padata_parallel_worker(struct work_struct *parallel_work) { - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; struct padata_instance *pinst; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, pwork); - pd = queue->pd; + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; pinst = pd->pinst; - spin_lock(&queue->parallel.lock); - list_replace_init(&queue->parallel.list, &local_list); - spin_unlock(&queue->parallel.lock); + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -94,7 +96,7 @@ static void padata_parallel_worker(struct work_struct *work) * @pinst: padata instance * @padata: object to be parallelized * @cb_cpu: cpu the serialization callback function will run on, - * must be in the cpumask of padata. + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). * * The parallelization callback function will run with BHs off. * Note: Every object which is parallelized by padata_do_parallel @@ -104,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst, struct padata_priv *padata, int cb_cpu) { int target_cpu, err; - struct padata_queue *queue; + struct padata_parallel_queue *queue; struct parallel_data *pd; rcu_read_lock_bh(); pd = rcu_dereference(pinst->pd); - err = 0; - if (!(pinst->flags & PADATA_INIT)) + err = -EINVAL; + if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID) + goto out; + + if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu)) goto out; err = -EBUSY; @@ -122,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst, if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) goto out; - err = -EINVAL; - if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) - goto out; - - err = -EINPROGRESS; + err = 0; atomic_inc(&pd->refcnt); padata->pd = pd; padata->cb_cpu = cb_cpu; @@ -137,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst, padata->seq_nr = atomic_inc_return(&pd->seq_nr); target_cpu = padata_cpu_hash(padata); - queue = per_cpu_ptr(pd->queue, target_cpu); + queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); list_add_tail(&padata->list, &queue->parallel.list); spin_unlock(&queue->parallel.lock); - queue_work_on(target_cpu, pinst->wq, &queue->pwork); + queue_work_on(target_cpu, pinst->wq, &queue->work); out: rcu_read_unlock_bh(); @@ -171,84 +172,52 @@ EXPORT_SYMBOL(padata_do_parallel); */ static struct padata_priv *padata_get_next(struct parallel_data *pd) { - int cpu, num_cpus, empty, calc_seq_nr; - int seq_nr, next_nr, overrun, next_overrun; - struct padata_queue *queue, *next_queue; + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; - empty = 0; - next_nr = -1; - next_overrun = 0; - next_queue = NULL; - - num_cpus = cpumask_weight(pd->cpumask); - - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - reorder = &queue->reorder; - - /* - * Calculate the seq_nr of the object that should be - * next in this reorder queue. - */ - overrun = 0; - calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) - + queue->cpu_index; + num_cpus = cpumask_weight(pd->cpumask.pcpu); - if (unlikely(calc_seq_nr > pd->max_seq_nr)) { - calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; - overrun = 1; - } - - if (!list_empty(&reorder->list)) { - padata = list_entry(reorder->list.next, - struct padata_priv, list); - - seq_nr = padata->seq_nr; - BUG_ON(calc_seq_nr != seq_nr); - } else { - seq_nr = calc_seq_nr; - empty++; - } - - if (next_nr < 0 || seq_nr < next_nr - || (next_overrun && !overrun)) { - next_nr = seq_nr; - next_overrun = overrun; - next_queue = queue; - } + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; } padata = NULL; - if (empty == num_cpus) - goto out; - reorder = &next_queue->reorder; if (!list_empty(&reorder->list)) { padata = list_entry(reorder->list.next, struct padata_priv, list); - if (unlikely(next_overrun)) { - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - atomic_set(&queue->num_obj, 0); - } - } + BUG_ON(next_nr != padata->seq_nr); spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); spin_unlock(&reorder->lock); - atomic_inc(&next_queue->num_obj); + pd->processed++; goto out; } - queue = per_cpu_ptr(pd->queue, smp_processor_id()); + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); if (queue->cpu_index == next_queue->cpu_index) { padata = ERR_PTR(-ENODATA); goto out; @@ -262,7 +231,7 @@ out: static void padata_reorder(struct parallel_data *pd) { struct padata_priv *padata; - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; /* @@ -301,13 +270,13 @@ static void padata_reorder(struct parallel_data *pd) return; } - queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); - spin_lock(&queue->serial.lock); - list_add_tail(&padata->list, &queue->serial.list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); @@ -333,19 +302,19 @@ static void padata_reorder_timer(unsigned long arg) padata_reorder(pd); } -static void padata_serial_worker(struct work_struct *work) +static void padata_serial_worker(struct work_struct *serial_work) { - struct padata_queue *queue; + struct padata_serial_queue *squeue; struct parallel_data *pd; LIST_HEAD(local_list); local_bh_disable(); - queue = container_of(work, struct padata_queue, swork); - pd = queue->pd; + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; - spin_lock(&queue->serial.lock); - list_replace_init(&queue->serial.list, &local_list); - spin_unlock(&queue->serial.lock); + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); + spin_unlock(&squeue->serial.lock); while (!list_empty(&local_list)) { struct padata_priv *padata; @@ -372,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work) void padata_do_serial(struct padata_priv *padata) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; struct parallel_data *pd; pd = padata->pd; cpu = get_cpu(); - queue = per_cpu_ptr(pd->queue, cpu); + pqueue = per_cpu_ptr(pd->pqueue, cpu); - spin_lock(&queue->reorder.lock); + spin_lock(&pqueue->reorder.lock); atomic_inc(&pd->reorder_objects); - list_add_tail(&padata->list, &queue->reorder.list); - spin_unlock(&queue->reorder.lock); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); put_cpu(); @@ -391,52 +360,89 @@ void padata_do_serial(struct padata_priv *padata) } EXPORT_SYMBOL(padata_do_serial); -/* Allocate and initialize the internal cpumask dependend resources. */ -static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, - const struct cpumask *cpumask) +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { - int cpu, cpu_index, num_cpus; - struct padata_queue *queue; - struct parallel_data *pd; - - cpu_index = 0; + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; - pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); - if (!pd) - goto err; + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.cbcpu); + return -ENOMEM; + } - pd->queue = alloc_percpu(struct padata_queue); - if (!pd->queue) - goto err_free_pd; + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} - if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) - goto err_free_queue; +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} - cpumask_and(pd->cpumask, cpumask, cpu_active_mask); +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} - queue->pd = pd; +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; - queue->cpu_index = cpu_index; + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; cpu_index++; - INIT_LIST_HEAD(&queue->reorder.list); - INIT_LIST_HEAD(&queue->parallel.list); - INIT_LIST_HEAD(&queue->serial.list); - spin_lock_init(&queue->reorder.lock); - spin_lock_init(&queue->parallel.lock); - spin_lock_init(&queue->serial.lock); - - INIT_WORK(&queue->pwork, padata_parallel_worker); - INIT_WORK(&queue->swork, padata_serial_worker); - atomic_set(&queue->num_obj, 0); + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); } - num_cpus = cpumask_weight(pd->cpumask); - pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + num_cpus = cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; +} + +/* Allocate and initialize the internal cpumask dependend resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; + + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + + padata_init_pqueues(pd); + padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); atomic_set(&pd->seq_nr, -1); atomic_set(&pd->reorder_objects, 0); @@ -446,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, return pd; -err_free_queue: - free_percpu(pd->queue); +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); err_free_pd: kfree(pd); err: @@ -456,8 +464,10 @@ err: static void padata_free_pd(struct parallel_data *pd) { - free_cpumask_var(pd->cpumask); - free_percpu(pd->queue); + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); kfree(pd); } @@ -465,11 +475,12 @@ static void padata_free_pd(struct parallel_data *pd) static void padata_flush_queues(struct parallel_data *pd) { int cpu; - struct padata_queue *queue; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->pwork); + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); } del_timer_sync(&pd->timer); @@ -477,19 +488,39 @@ static void padata_flush_queues(struct parallel_data *pd) if (atomic_read(&pd->reorder_objects)) padata_reorder(pd); - for_each_cpu(cpu, pd->cpumask) { - queue = per_cpu_ptr(pd->queue, cpu); - flush_work(&queue->swork); + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); } BUG_ON(atomic_read(&pd->refcnt) != 0); } +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + /* Replace the internal control stucture with a new one. */ static void padata_replace(struct padata_instance *pinst, struct parallel_data *pd_new) { struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; pinst->flags |= PADATA_RESET; @@ -497,41 +528,162 @@ static void padata_replace(struct padata_instance *pinst, synchronize_rcu(); + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + padata_flush_queues(pd_old); padata_free_pd(pd_old); + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + pinst->flags &= ~PADATA_RESET; } /** - * padata_set_cpumask - set the cpumask that padata should use + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. * - * @pinst: padata instance - * @cpumask: the cpumask to use + * @pinst: A poineter to padata instance + * @nblock: A pointer to notifier block. */ -int padata_set_cpumask(struct padata_instance *pinst, - cpumask_var_t cpumask) +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) { + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); + +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nlock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); + + +/* If cpumask contains no active cpu, we mark the instance as invalid. */ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; struct parallel_data *pd; - int err = 0; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; + } + + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + padata_replace(pinst, pd); + + if (valid) + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the wokers doing serialization. + * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumsak to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; mutex_lock(&pinst->lock); + get_online_cpus(); + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to parallel and serial cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; + + mutex_lock(&pinst->lock); get_online_cpus(); - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) { - err = -ENOMEM; - goto out; + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; + serial_mask = cpumask; + break; + default: + goto out; } - cpumask_copy(pinst->cpumask, cpumask); - - padata_replace(pinst, pd); + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); out: put_online_cpus(); - mutex_unlock(&pinst->lock); return err; @@ -543,30 +695,48 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu) struct parallel_data *pd; if (cpumask_test_cpu(cpu, cpu_active_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); } return 0; } -/** - * padata_add_cpu - add a cpu to the padata cpumask + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added. + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_add_cpu(struct padata_instance *pinst, int cpu) + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_set_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_add_cpu(pinst, cpu); put_online_cpus(); @@ -578,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu); static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) { - struct parallel_data *pd; + struct parallel_data *pd = NULL; if (cpumask_test_cpu(cpu, cpu_online_mask)) { - pd = padata_alloc_pd(pinst, pinst->cpumask); + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); if (!pd) return -ENOMEM; @@ -591,20 +767,32 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) return 0; } -/** - * padata_remove_cpu - remove a cpu from the padata cpumask + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and paralell) + * padata cpumasks. * * @pinst: padata instance * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask */ -int padata_remove_cpu(struct padata_instance *pinst, int cpu) +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) { int err; + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + mutex_lock(&pinst->lock); get_online_cpus(); - cpumask_clear_cpu(cpu, pinst->cpumask); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + err = __padata_remove_cpu(pinst, cpu); put_online_cpus(); @@ -619,11 +807,20 @@ EXPORT_SYMBOL(padata_remove_cpu); * * @pinst: padata instance to start */ -void padata_start(struct padata_instance *pinst) +int padata_start(struct padata_instance *pinst) { + int err = 0; + mutex_lock(&pinst->lock); - pinst->flags |= PADATA_INIT; + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); + mutex_unlock(&pinst->lock); + + return err; } EXPORT_SYMBOL(padata_start); @@ -635,12 +832,20 @@ EXPORT_SYMBOL(padata_start); void padata_stop(struct padata_instance *pinst) { mutex_lock(&pinst->lock); - pinst->flags &= ~PADATA_INIT; + __padata_stop(pinst); mutex_unlock(&pinst->lock); } EXPORT_SYMBOL(padata_stop); #ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); +} + + static int padata_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -653,7 +858,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_add_cpu(pinst, cpu); @@ -664,7 +869,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); err = __padata_remove_cpu(pinst, cpu); @@ -675,7 +880,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_remove_cpu(pinst, cpu); @@ -683,7 +888,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - if (!cpumask_test_cpu(cpu, pinst->cpumask)) + if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); __padata_add_cpu(pinst, cpu); @@ -694,36 +899,202 @@ static int padata_cpu_callback(struct notifier_block *nfb, } #endif +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? + PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); + +/* + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + ¶llel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->store(pinst, attr, buf, count); + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + /** - * padata_alloc - allocate and initialize a padata instance + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. * - * @cpumask: cpumask that padata uses for parallelization * @wq: workqueue to use for the allocated padata instance */ -struct padata_instance *padata_alloc(const struct cpumask *cpumask, - struct workqueue_struct *wq) +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) { struct padata_instance *pinst; - struct parallel_data *pd; + struct parallel_data *pd = NULL; pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); if (!pinst) goto err; get_online_cpus(); - - pd = padata_alloc_pd(pinst, cpumask); - if (!pd) + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); + goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; - if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) - goto err_free_pd; + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; - cpumask_copy(pinst->cpumask, cpumask); + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); pinst->flags = 0; @@ -735,12 +1106,15 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, put_online_cpus(); + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); return pinst; -err_free_pd: - padata_free_pd(pd); +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); err_free_inst: kfree(pinst); put_online_cpus(); @@ -756,19 +1130,6 @@ EXPORT_SYMBOL(padata_alloc); */ void padata_free(struct padata_instance *pinst) { - padata_stop(pinst); - - synchronize_rcu(); - -#ifdef CONFIG_HOTPLUG_CPU - unregister_hotcpu_notifier(&pinst->cpu_notifier); -#endif - get_online_cpus(); - padata_flush_queues(pinst->pd); - put_online_cpus(); - - padata_free_pd(pinst->pd); - free_cpumask_var(pinst->cpumask); - kfree(pinst); + kobject_put(&pinst->kobj); } EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c index 3b16cd9..4c13b1a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -24,6 +24,9 @@ #include <linux/nmi.h> #include <linux/dmi.h> +#define PANIC_TIMER_STEP 100 +#define PANIC_BLINK_SPD 18 + int panic_on_oops; static unsigned long tainted_mask; static int pause_on_oops; @@ -36,36 +39,15 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -/* Returns how long it waited in ms */ -long (*panic_blink)(long time); -EXPORT_SYMBOL(panic_blink); - -static void panic_blink_one_second(void) +static long no_blink(int state) { - static long i = 0, end; - - if (panic_blink) { - end = i + MSEC_PER_SEC; - - while (i < end) { - i += panic_blink(i); - mdelay(1); - i++; - } - } else { - /* - * When running under a hypervisor a small mdelay may get - * rounded up to the hypervisor timeslice. For example, with - * a 1ms in 10ms hypervisor timeslice we might inflate a - * mdelay(1) loop by 10x. - * - * If we have nothing to blink, spin on 1 second calls to - * mdelay to avoid this. - */ - mdelay(MSEC_PER_SEC); - } + return 0; } +/* Returns how long it waited in ms */ +long (*panic_blink)(int state); +EXPORT_SYMBOL(panic_blink); + /** * panic - halt the system * @fmt: The text string to print @@ -78,7 +60,8 @@ NORET_TYPE void panic(const char * fmt, ...) { static char buf[1024]; va_list args; - long i; + long i, i_next = 0; + int state = 0; /* * It's possible to come here directly from a panic-assertion and @@ -117,6 +100,9 @@ NORET_TYPE void panic(const char * fmt, ...) bust_spinlocks(0); + if (!panic_blink) + panic_blink = no_blink; + if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. @@ -124,9 +110,13 @@ NORET_TYPE void panic(const char * fmt, ...) */ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); - for (i = 0; i < panic_timeout; i++) { + for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); - panic_blink_one_second(); + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); } /* * This will not be a clean reboot, with everything @@ -152,9 +142,13 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif local_irq_enable(); - while (1) { + for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); - panic_blink_one_second(); + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); } } @@ -344,7 +338,7 @@ static int init_oops_id(void) } late_initcall(init_oops_id); -static void print_oops_end_marker(void) +void print_oops_end_marker(void) { init_oops_id(); printk(KERN_WARNING "---[ end trace %016llx ]---\n", diff --git a/kernel/params.c b/kernel/params.c index 0b30ecd..08107d1 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -31,6 +31,42 @@ #define DEBUGP(fmt, a...) #endif +/* Protects all parameters, and incidentally kmalloced_param list. */ +static DEFINE_MUTEX(param_lock); + +/* This just allows us to keep track of which parameters are kmalloced. */ +struct kmalloced_param { + struct list_head list; + char val[]; +}; +static LIST_HEAD(kmalloced_params); + +static void *kmalloc_parameter(unsigned int size) +{ + struct kmalloced_param *p; + + p = kmalloc(sizeof(*p) + size, GFP_KERNEL); + if (!p) + return NULL; + + list_add(&p->list, &kmalloced_params); + return p->val; +} + +/* Does nothing if parameter wasn't kmalloced above. */ +static void maybe_kfree_parameter(void *param) +{ + struct kmalloced_param *p; + + list_for_each_entry(p, &kmalloced_params, list) { + if (p->val == param) { + list_del(&p->list); + kfree(p); + break; + } + } +} + static inline char dash2underscore(char c) { if (c == '-') @@ -49,18 +85,25 @@ static inline int parameq(const char *input, const char *paramname) static int parse_one(char *param, char *val, - struct kernel_param *params, + const struct kernel_param *params, unsigned num_params, int (*handle_unknown)(char *param, char *val)) { unsigned int i; + int err; /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { + /* Noone handled NULL, so do it here. */ + if (!val && params[i].ops->set != param_set_bool) + return -EINVAL; DEBUGP("They are equal! Calling %p\n", - params[i].set); - return params[i].set(val, ¶ms[i]); + params[i].ops->set); + mutex_lock(¶m_lock); + err = params[i].ops->set(val, ¶ms[i]); + mutex_unlock(¶m_lock); + return err; } } @@ -128,7 +171,7 @@ static char *next_arg(char *args, char **param, char **val) /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ int parse_args(const char *name, char *args, - struct kernel_param *params, + const struct kernel_param *params, unsigned num, int (*unknown)(char *param, char *val)) { @@ -176,22 +219,29 @@ int parse_args(const char *name, /* Lazy bastard, eh? */ #define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn) \ - int param_set_##name(const char *val, struct kernel_param *kp) \ + int param_set_##name(const char *val, const struct kernel_param *kp) \ { \ tmptype l; \ int ret; \ \ - if (!val) return -EINVAL; \ ret = strtolfn(val, 0, &l); \ if (ret == -EINVAL || ((type)l != l)) \ return -EINVAL; \ *((type *)kp->arg) = l; \ return 0; \ } \ - int param_get_##name(char *buffer, struct kernel_param *kp) \ + int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ return sprintf(buffer, format, *((type *)kp->arg)); \ - } + } \ + struct kernel_param_ops param_ops_##name = { \ + .set = param_set_##name, \ + .get = param_get_##name, \ + }; \ + EXPORT_SYMBOL(param_set_##name); \ + EXPORT_SYMBOL(param_get_##name); \ + EXPORT_SYMBOL(param_ops_##name) + STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); @@ -201,39 +251,50 @@ STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); -int param_set_charp(const char *val, struct kernel_param *kp) +int param_set_charp(const char *val, const struct kernel_param *kp) { - if (!val) { - printk(KERN_ERR "%s: string parameter expected\n", - kp->name); - return -EINVAL; - } - if (strlen(val) > 1024) { printk(KERN_ERR "%s: string parameter too long\n", kp->name); return -ENOSPC; } - /* This is a hack. We can't need to strdup in early boot, and we + maybe_kfree_parameter(*(char **)kp->arg); + + /* This is a hack. We can't kmalloc in early boot, and we * don't need to; this mangled commandline is preserved. */ if (slab_is_available()) { - *(char **)kp->arg = kstrdup(val, GFP_KERNEL); + *(char **)kp->arg = kmalloc_parameter(strlen(val)+1); if (!*(char **)kp->arg) return -ENOMEM; + strcpy(*(char **)kp->arg, val); } else *(const char **)kp->arg = val; return 0; } +EXPORT_SYMBOL(param_set_charp); -int param_get_charp(char *buffer, struct kernel_param *kp) +int param_get_charp(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%s", *((char **)kp->arg)); } +EXPORT_SYMBOL(param_get_charp); + +static void param_free_charp(void *arg) +{ + maybe_kfree_parameter(*((char **)arg)); +} + +struct kernel_param_ops param_ops_charp = { + .set = param_set_charp, + .get = param_get_charp, + .free = param_free_charp, +}; +EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ -int param_set_bool(const char *val, struct kernel_param *kp) +int param_set_bool(const char *val, const struct kernel_param *kp) { bool v; @@ -258,8 +319,9 @@ int param_set_bool(const char *val, struct kernel_param *kp) *(int *)kp->arg = v; return 0; } +EXPORT_SYMBOL(param_set_bool); -int param_get_bool(char *buffer, struct kernel_param *kp) +int param_get_bool(char *buffer, const struct kernel_param *kp) { bool val; if (kp->flags & KPARAM_ISBOOL) @@ -270,9 +332,16 @@ int param_get_bool(char *buffer, struct kernel_param *kp) /* Y and N chosen as being relatively non-coder friendly */ return sprintf(buffer, "%c", val ? 'Y' : 'N'); } +EXPORT_SYMBOL(param_get_bool); + +struct kernel_param_ops param_ops_bool = { + .set = param_set_bool, + .get = param_get_bool, +}; +EXPORT_SYMBOL(param_ops_bool); /* This one must be bool. */ -int param_set_invbool(const char *val, struct kernel_param *kp) +int param_set_invbool(const char *val, const struct kernel_param *kp) { int ret; bool boolval; @@ -285,18 +354,26 @@ int param_set_invbool(const char *val, struct kernel_param *kp) *(bool *)kp->arg = !boolval; return ret; } +EXPORT_SYMBOL(param_set_invbool); -int param_get_invbool(char *buffer, struct kernel_param *kp) +int param_get_invbool(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y'); } +EXPORT_SYMBOL(param_get_invbool); + +struct kernel_param_ops param_ops_invbool = { + .set = param_set_invbool, + .get = param_get_invbool, +}; +EXPORT_SYMBOL(param_ops_invbool); /* We break the rule and mangle the string. */ static int param_array(const char *name, const char *val, unsigned int min, unsigned int max, void *elem, int elemsize, - int (*set)(const char *, struct kernel_param *kp), + int (*set)(const char *, const struct kernel_param *kp), u16 flags, unsigned int *num) { @@ -309,12 +386,6 @@ static int param_array(const char *name, kp.arg = elem; kp.flags = flags; - /* No equals sign? */ - if (!val) { - printk(KERN_ERR "%s: expects arguments\n", name); - return -EINVAL; - } - *num = 0; /* We expect a comma-separated list of values. */ do { @@ -330,6 +401,7 @@ static int param_array(const char *name, /* nul-terminate and parse */ save = val[len]; ((char *)val)[len] = '\0'; + BUG_ON(!mutex_is_locked(¶m_lock)); ret = set(val, &kp); if (ret != 0) @@ -347,17 +419,17 @@ static int param_array(const char *name, return 0; } -int param_array_set(const char *val, struct kernel_param *kp) +static int param_array_set(const char *val, const struct kernel_param *kp) { const struct kparam_array *arr = kp->arr; unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->set, kp->flags, + arr->elemsize, arr->ops->set, kp->flags, arr->num ?: &temp_num); } -int param_array_get(char *buffer, struct kernel_param *kp) +static int param_array_get(char *buffer, const struct kernel_param *kp) { int i, off, ret; const struct kparam_array *arr = kp->arr; @@ -368,7 +440,8 @@ int param_array_get(char *buffer, struct kernel_param *kp) if (i) buffer[off++] = ','; p.arg = arr->elem + arr->elemsize * i; - ret = arr->get(buffer + off, &p); + BUG_ON(!mutex_is_locked(¶m_lock)); + ret = arr->ops->get(buffer + off, &p); if (ret < 0) return ret; off += ret; @@ -377,14 +450,27 @@ int param_array_get(char *buffer, struct kernel_param *kp) return off; } -int param_set_copystring(const char *val, struct kernel_param *kp) +static void param_array_free(void *arg) +{ + unsigned int i; + const struct kparam_array *arr = arg; + + if (arr->ops->free) + for (i = 0; i < (arr->num ? *arr->num : arr->max); i++) + arr->ops->free(arr->elem + arr->elemsize * i); +} + +struct kernel_param_ops param_array_ops = { + .set = param_array_set, + .get = param_array_get, + .free = param_array_free, +}; +EXPORT_SYMBOL(param_array_ops); + +int param_set_copystring(const char *val, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; - if (!val) { - printk(KERN_ERR "%s: missing param set value\n", kp->name); - return -EINVAL; - } if (strlen(val)+1 > kps->maxlen) { printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", kp->name, kps->maxlen-1); @@ -393,12 +479,20 @@ int param_set_copystring(const char *val, struct kernel_param *kp) strcpy(kps->string, val); return 0; } +EXPORT_SYMBOL(param_set_copystring); -int param_get_string(char *buffer, struct kernel_param *kp) +int param_get_string(char *buffer, const struct kernel_param *kp) { const struct kparam_string *kps = kp->str; return strlcpy(buffer, kps->string, kps->maxlen); } +EXPORT_SYMBOL(param_get_string); + +struct kernel_param_ops param_ops_string = { + .set = param_set_copystring, + .get = param_get_string, +}; +EXPORT_SYMBOL(param_ops_string); /* sysfs output in /sys/modules/XYZ/parameters/ */ #define to_module_attr(n) container_of(n, struct module_attribute, attr) @@ -409,7 +503,7 @@ extern struct kernel_param __start___param[], __stop___param[]; struct param_attribute { struct module_attribute mattr; - struct kernel_param *param; + const struct kernel_param *param; }; struct module_param_attrs @@ -428,10 +522,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr, int count; struct param_attribute *attribute = to_param_attr(mattr); - if (!attribute->param->get) + if (!attribute->param->ops->get) return -EPERM; - count = attribute->param->get(buf, attribute->param); + mutex_lock(¶m_lock); + count = attribute->param->ops->get(buf, attribute->param); + mutex_unlock(¶m_lock); if (count > 0) { strcat(buf, "\n"); ++count; @@ -447,10 +543,12 @@ static ssize_t param_attr_store(struct module_attribute *mattr, int err; struct param_attribute *attribute = to_param_attr(mattr); - if (!attribute->param->set) + if (!attribute->param->ops->set) return -EPERM; - err = attribute->param->set(buf, attribute->param); + mutex_lock(¶m_lock); + err = attribute->param->ops->set(buf, attribute->param); + mutex_unlock(¶m_lock); if (!err) return len; return err; @@ -464,6 +562,18 @@ static ssize_t param_attr_store(struct module_attribute *mattr, #endif #ifdef CONFIG_SYSFS +void __kernel_param_lock(void) +{ + mutex_lock(¶m_lock); +} +EXPORT_SYMBOL(__kernel_param_lock); + +void __kernel_param_unlock(void) +{ + mutex_unlock(¶m_lock); +} +EXPORT_SYMBOL(__kernel_param_unlock); + /* * add_sysfs_param - add a parameter to sysfs * @mk: struct module_kobject @@ -475,7 +585,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr, * if there's an error. */ static __modinit int add_sysfs_param(struct module_kobject *mk, - struct kernel_param *kp, + const struct kernel_param *kp, const char *name) { struct module_param_attrs *new; @@ -557,7 +667,7 @@ static void free_module_param_attrs(struct module_kobject *mk) * /sys/module/[mod->name]/parameters/ */ int module_param_sysfs_setup(struct module *mod, - struct kernel_param *kparam, + const struct kernel_param *kparam, unsigned int num_params) { int i, err; @@ -602,7 +712,11 @@ void module_param_sysfs_remove(struct module *mod) void destroy_params(const struct kernel_param *params, unsigned num) { - /* FIXME: This should free kmalloced charp parameters. It doesn't. */ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); } static void __init kernel_add_sysfs_param(const char *name, @@ -768,28 +882,3 @@ static int __init param_sysfs_init(void) subsys_initcall(param_sysfs_init); #endif /* CONFIG_SYSFS */ - -EXPORT_SYMBOL(param_set_byte); -EXPORT_SYMBOL(param_get_byte); -EXPORT_SYMBOL(param_set_short); -EXPORT_SYMBOL(param_get_short); -EXPORT_SYMBOL(param_set_ushort); -EXPORT_SYMBOL(param_get_ushort); -EXPORT_SYMBOL(param_set_int); -EXPORT_SYMBOL(param_get_int); -EXPORT_SYMBOL(param_set_uint); -EXPORT_SYMBOL(param_get_uint); -EXPORT_SYMBOL(param_set_long); -EXPORT_SYMBOL(param_get_long); -EXPORT_SYMBOL(param_set_ulong); -EXPORT_SYMBOL(param_get_ulong); -EXPORT_SYMBOL(param_set_charp); -EXPORT_SYMBOL(param_get_charp); -EXPORT_SYMBOL(param_set_bool); -EXPORT_SYMBOL(param_get_bool); -EXPORT_SYMBOL(param_set_invbool); -EXPORT_SYMBOL(param_get_invbool); -EXPORT_SYMBOL(param_array_set); -EXPORT_SYMBOL(param_array_get); -EXPORT_SYMBOL(param_set_copystring); -EXPORT_SYMBOL(param_get_string); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c55..f309e80 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,24 +31,18 @@ #include <linux/kernel_stat.h> #include <linux/perf_event.h> #include <linux/ftrace_event.h> -#include <linux/hw_breakpoint.h> #include <asm/irq_regs.h> -/* - * Each CPU has a list of per CPU events: - */ -static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_events __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_events __read_mostly; +atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + /* * perf event paranoia level: * -1 - not paranoid at all @@ -67,36 +61,43 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; -/* - * Lock for (sysadmin-configurable) event reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); +void __weak perf_event_print_debug(void) { } -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +extern __weak const char *perf_pmu_name(void) { - return NULL; + return "pmu"; } -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak perf_event_print_debug(void) { } - -static DEFINE_PER_CPU(int, perf_disable_count); +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} -void perf_disable(void) +void perf_pmu_enable(struct pmu *pmu) { - if (!__get_cpu_var(perf_disable_count)++) - hw_perf_disable(); + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); } -void perf_enable(void) +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) { - if (!--__get_cpu_var(perf_disable_count)) - hw_perf_enable(); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); } static void get_ctx(struct perf_event_context *ctx) @@ -151,13 +152,13 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_event_ctxp); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* * If this context is a clone of another, it might @@ -170,7 +171,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped on us any more. */ raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp)) { + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } @@ -189,12 +190,13 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ -static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, &flags); + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -214,7 +216,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* @@ -302,6 +304,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) } list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -311,7 +315,12 @@ static void perf_group_attach(struct perf_event *event) { struct perf_event *group_leader = event->group_leader; - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + event->attach_state |= PERF_ATTACH_GROUP; if (group_leader == event) @@ -402,21 +411,40 @@ static void perf_group_detach(struct perf_event *event) } } -static void -event_sched_out(struct perf_event *event, +static inline int +event_filter_match(struct perf_event *event) +{ + return event->cpu == -1 || event->cpu == smp_processor_id(); +} + +static int +__event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = ctx->time - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = ctx->time; + } + if (event->state != PERF_EVENT_STATE_ACTIVE) - return; + return 0; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; - event->pmu->disable(event); + event->pmu->del(event, 0); event->oncpu = -1; if (!is_software_event(event)) @@ -424,6 +452,19 @@ event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + return 1; +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret; + + ret = __event_sched_out(event, cpuctx, ctx); + if (ret) + event->tstamp_stopped = ctx->time; } static void @@ -432,9 +473,7 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - - if (group_event->state != PERF_EVENT_STATE_ACTIVE) - return; + int state = group_event->state; event_sched_out(group_event, cpuctx, ctx); @@ -444,10 +483,16 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); - if (group_event->attr.exclusive) + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + /* * Cross CPU call to remove a performance event * @@ -456,9 +501,9 @@ group_sched_out(struct perf_event *group_event, */ static void __perf_event_remove_from_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -469,27 +514,11 @@ static void __perf_event_remove_from_context(void *info) return; raw_spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. - */ - perf_disable(); event_sched_out(event, cpuctx, ctx); list_del_event(event, ctx); - if (!ctx->task) { - /* - * Allow more per task events with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_events - ctx->nr_events, - perf_max_events - perf_reserved_percpu); - } - - perf_enable(); raw_spin_unlock(&ctx->lock); } @@ -554,8 +583,8 @@ retry: static void __perf_event_disable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a per-task event, need to check whether this @@ -610,7 +639,7 @@ void perf_event_disable(struct perf_event *event) return; } - retry: +retry: task_oncpu_function_call(task, __perf_event_disable, event); raw_spin_lock_irq(&ctx->lock); @@ -635,7 +664,7 @@ void perf_event_disable(struct perf_event *event) } static int -event_sched_in(struct perf_event *event, +__event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -649,14 +678,12 @@ event_sched_in(struct perf_event *event, */ smp_wmb(); - if (event->pmu->enable(event)) { + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -667,29 +694,56 @@ event_sched_in(struct perf_event *event, return 0; } +static inline int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret = __event_sched_in(event, cpuctx, ctx); + if (ret) + return ret; + event->tstamp_running += ctx->time - event->tstamp_stopped; + return 0; +} + +static void +group_commit_event_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + u64 now = ctx->time; + + group_event->tstamp_running += now - group_event->tstamp_stopped; + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + event->tstamp_running += now - event->tstamp_stopped; + } +} + static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - const struct pmu *pmu = group_event->pmu; - bool txn = false; - int ret; + struct pmu *pmu = group_event->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - /* Check if group transaction availabe */ - if (pmu->start_txn) - txn = true; + pmu->start_txn(pmu); - if (txn) - pmu->start_txn(pmu); - - if (event_sched_in(group_event, cpuctx, ctx)) { - if (txn) - pmu->cancel_txn(pmu); + /* + * use __event_sched_in() to delay updating tstamp_running + * until the transaction is committed. In case of failure + * we will keep an unmodified tstamp_running which is a + * requirement to get correct timing information + */ + if (__event_sched_in(group_event, cpuctx, ctx)) { + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -697,35 +751,33 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { + if (__event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!txn) - return 0; - - ret = pmu->commit_txn(pmu); - if (!ret) { - pmu->cancel_txn(pmu); + if (!pmu->commit_txn(pmu)) { + /* commit tstamp_running */ + group_commit_event_sched_in(group_event, cpuctx, ctx); return 0; } - group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * + * use __event_sched_out() to avoid updating tstamp_stopped + * because the event never actually ran */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + __event_sched_out(event, cpuctx, ctx); } - event_sched_out(group_event, cpuctx, ctx); + __event_sched_out(group_event, cpuctx, ctx); - if (txn) - pmu->cancel_txn(pmu); + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -778,10 +830,10 @@ static void add_event_to_ctx(struct perf_event *event, */ static void __perf_install_in_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -801,12 +853,6 @@ static void __perf_install_in_context(void *info) ctx->is_active = 1; update_context_time(ctx); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. NOP for non NMI based events. - */ - perf_disable(); - add_event_to_ctx(event, ctx); if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -844,12 +890,7 @@ static void __perf_install_in_context(void *info) } } - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - +unlock: raw_spin_unlock(&ctx->lock); } @@ -872,6 +913,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + event->ctx = ctx; + if (!task) { /* * Per cpu events are installed via an smp call and @@ -920,10 +963,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, event->state = PERF_EVENT_STATE_INACTIVE; event->tstamp_enabled = ctx->time - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) - if (sub->state >= PERF_EVENT_STATE_INACTIVE) + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { sub->tstamp_enabled = ctx->time - sub->total_time_enabled; + } + } } /* @@ -932,9 +977,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, static void __perf_event_enable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -968,12 +1013,10 @@ static void __perf_event_enable(void *info) if (!group_can_go_on(event, cpuctx, 1)) { err = -EEXIST; } else { - perf_disable(); if (event == leader) err = group_sched_in(event, cpuctx, ctx); else err = event_sched_in(event, cpuctx, ctx); - perf_enable(); } if (err) { @@ -989,7 +1032,7 @@ static void __perf_event_enable(void *info) } } - unlock: +unlock: raw_spin_unlock(&ctx->lock); } @@ -1030,7 +1073,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - retry: +retry: raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); @@ -1050,7 +1093,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_OFF) __perf_event_mark_enabled(event, ctx); - out: +out: raw_spin_unlock_irq(&ctx->lock); } @@ -1081,26 +1124,26 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_event *event; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); - perf_disable(); if (!ctx->nr_active) - goto out_enable; + goto out; - if (event_type & EVENT_PINNED) + if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); + } - if (event_type & EVENT_FLEXIBLE) + if (event_type & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); - - out_enable: - perf_enable(); - out: + } +out: + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -1155,9 +1198,9 @@ static void __perf_event_sync_stat(struct perf_event *event, * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_event->count); - value = atomic64_xchg(&event->count, value); - atomic64_set(&next_event->count, value); + value = local64_read(&next_event->count); + value = local64_xchg(&event->count, value); + local64_set(&next_event->count, value); swap(event->total_time_enabled, next_event->total_time_enabled); swap(event->total_time_running, next_event->total_time_running); @@ -1198,34 +1241,25 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. - */ -void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) +void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; struct perf_event_context *parent; + struct perf_cpu_context *cpuctx; int do_switch = 1; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + if (likely(!ctx)) + return; - if (likely(!ctx || !cpuctx->task_ctx)) + cpuctx = __get_cpu_context(ctx); + if (!cpuctx->task_ctx) return; rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp; + next_ctx = next->perf_event_ctxp[ctxn]; if (parent && next_ctx && rcu_dereference(next_ctx->parent_ctx) == parent) { /* @@ -1244,8 +1278,8 @@ void perf_event_task_sched_out(struct task_struct *task, * XXX do we need a memory barrier of sorts * wrt to rcu_dereference() of perf_event_ctxp */ - task->perf_event_ctxp = next_ctx; - next->perf_event_ctxp = ctx; + task->perf_event_ctxp[ctxn] = next_ctx; + next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; do_switch = 0; @@ -1263,10 +1297,35 @@ void perf_event_task_sched_out(struct task_struct *task, } } +#define for_each_task_context_nr(ctxn) \ + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void __perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + int ctxn; + + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + for_each_task_context_nr(ctxn) + perf_event_context_sched_out(task, ctxn, next); +} + static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -1281,14 +1340,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, /* * Called with IRQs disabled */ -static void __perf_event_task_sched_out(struct perf_event_context *ctx) -{ - task_ctx_sched_out(ctx, EVENT_ALL); -} - -/* - * Called with IRQs disabled - */ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type) { @@ -1339,9 +1390,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; - if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; + } } } @@ -1357,8 +1409,6 @@ ctx_sched_in(struct perf_event_context *ctx, ctx->timestamp = perf_clock(); - perf_disable(); - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1370,8 +1420,7 @@ ctx_sched_in(struct perf_event_context *ctx, if (event_type & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); - perf_enable(); - out: +out: raw_spin_unlock(&ctx->lock); } @@ -1383,43 +1432,28 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type); } -static void task_ctx_sched_in(struct task_struct *task, +static void task_ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_cpu_context *cpuctx; - if (likely(!ctx)) - return; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; + ctx_sched_in(ctx, cpuctx, event_type); cpuctx->task_ctx = ctx; } -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. - */ -void perf_event_task_sched_in(struct task_struct *task) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = task->perf_event_ctxp; - if (likely(!ctx)) - return; +void perf_event_context_sched_in(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; - perf_disable(); - + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1433,7 +1467,37 @@ void perf_event_task_sched_in(struct task_struct *task) cpuctx->task_ctx = ctx; - perf_enable(); + /* + * Since these rotations are per-cpu, we need to ensure the + * cpu-context we got scheduled on is actually rotating. + */ + perf_pmu_rotate_start(ctx->pmu); + perf_pmu_enable(ctx->pmu); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void __perf_event_task_sched_in(struct task_struct *task) +{ + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (likely(!ctx)) + continue; + + perf_event_context_sched_in(ctx); + } } #define MAX_INTERRUPTS (~0ULL) @@ -1513,22 +1577,6 @@ do { \ return div64_u64(dividend, divisor); } -static void perf_event_stop(struct perf_event *event) -{ - if (!event->pmu->stop) - return event->pmu->disable(event); - - return event->pmu->stop(event); -} - -static int perf_event_start(struct perf_event *event) -{ - if (!event->pmu->start) - return event->pmu->enable(event); - - return event->pmu->start(event); -} - static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -1547,16 +1595,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; - if (atomic64_read(&hwc->period_left) > 8*sample_period) { - perf_disable(); - perf_event_stop(event); - atomic64_set(&hwc->period_left, 0); - perf_event_start(event); - perf_enable(); + if (local64_read(&hwc->period_left) > 8*sample_period) { + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); + event->pmu->start(event, PERF_EF_RELOAD); } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) { struct perf_event *event; struct hw_perf_event *hwc; @@ -1581,23 +1627,19 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); - perf_disable(); - event->pmu->unthrottle(event); - perf_enable(); + event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) continue; - perf_disable(); event->pmu->read(event); - now = atomic64_read(&event->count); + now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; hwc->freq_count_stamp = now; if (delta > 0) - perf_adjust_period(event, TICK_NSEC, delta); - perf_enable(); + perf_adjust_period(event, period, delta); } raw_spin_unlock(&ctx->lock); } @@ -1615,32 +1657,38 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr) +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; - int rotate = 0; + u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; + struct perf_event_context *ctx = NULL; + int rotate = 0, remove = 1; - if (!atomic_read(&nr_events)) - return; - - cpuctx = &__get_cpu_var(perf_cpu_context); - if (cpuctx->ctx.nr_events && - cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + if (cpuctx->ctx.nr_events) { + remove = 0; + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + } - ctx = curr->perf_event_ctxp; - if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) - rotate = 1; + ctx = cpuctx->task_ctx; + if (ctx && ctx->nr_events) { + remove = 0; + if (ctx->nr_events != ctx->nr_active) + rotate = 1; + } - perf_ctx_adjust_freq(&cpuctx->ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) - perf_ctx_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx, interval); if (!rotate) - return; + goto done; - perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1651,8 +1699,27 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); + task_ctx_sched_in(ctx, EVENT_FLEXIBLE); + +done: + if (remove) + list_del_init(&cpuctx->rotation_list); + + perf_pmu_enable(cpuctx->ctx.pmu); +} + +void perf_event_task_tick(void) +{ + struct list_head *head = &__get_cpu_var(rotation_list); + struct perf_cpu_context *cpuctx, *tmp; + + WARN_ON(!irqs_disabled()); + + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + if (cpuctx->jiffies_interval == 1 || + !(jiffies % cpuctx->jiffies_interval)) + perf_rotate_context(cpuctx); + } } static int event_enable_on_exec(struct perf_event *event, @@ -1674,20 +1741,18 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(struct task_struct *task) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx; struct perf_event *event; unsigned long flags; int enabled = 0; int ret; local_irq_save(flags); - ctx = task->perf_event_ctxp; if (!ctx || !ctx->nr_events) goto out; - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -1711,8 +1776,8 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task); - out: + perf_event_context_sched_in(ctx); +out: local_irq_restore(flags); } @@ -1721,9 +1786,9 @@ static void perf_event_enable_on_exec(struct task_struct *task) */ static void __perf_event_read(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -1743,6 +1808,11 @@ static void __perf_event_read(void *info) event->pmu->read(event); } +static inline u64 perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + static u64 perf_event_read(struct perf_event *event) { /* @@ -1757,20 +1827,234 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } - return atomic64_read(&event->count); + return perf_event_count(event); } /* - * Initialize the perf_event context in a task_struct: + * Callchain support */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * + num_possible_cpus(); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + static void -__perf_event_init_context(struct perf_event_context *ctx, - struct task_struct *task) +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); @@ -1778,45 +2062,38 @@ __perf_event_init_context(struct perf_event_context *ctx, INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - ctx->task = task; } -static struct perf_event_context *find_get_context(pid_t pid, int cpu) +static struct perf_event_context * +alloc_perf_context(struct pmu *pmu, struct task_struct *task) { struct perf_event_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - if (pid == -1 && cpu != -1) { - /* Must be root to operate on a CPU event: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu >= nr_cpumask_bits) - return ERR_PTR(-EINVAL); + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!ctx) + return NULL; - /* - * We could be clever and allow to attach a event to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_online(cpu)) - return ERR_PTR(-ENODEV); + __perf_event_init_context(ctx); + if (task) { + ctx->task = task; + get_task_struct(task); + } + ctx->pmu = pmu; - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); + return ctx; +} - return ctx; - } +static struct task_struct * +find_lively_task_by_vpid(pid_t vpid) +{ + struct task_struct *task; + int err; rcu_read_lock(); - if (!pid) + if (!vpid) task = current; else - task = find_task_by_vpid(pid); + task = find_task_by_vpid(vpid); if (task) get_task_struct(task); rcu_read_unlock(); @@ -1836,36 +2113,78 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry: - ctx = perf_lock_task_context(task, &flags); + return task; +errout: + put_task_struct(task); + return ERR_PTR(err); + +} + +static struct perf_event_context * +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + unsigned long flags; + int ctxn, err; + + if (!task && cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu >= nr_cpumask_bits) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_online(cpu)) + return ERR_PTR(-ENODEV); + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + err = -EINVAL; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto errout; + +retry: + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); } if (!ctx) { - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - __perf_event_init_context(ctx, task); + get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + + if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ + put_task_struct(task); kfree(ctx); goto retry; } - get_task_struct(task); } - put_task_struct(task); return ctx; - errout: - put_task_struct(task); +errout: return ERR_PTR(err); } @@ -1882,32 +2201,36 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_pending_sync(struct perf_event *event); -static void perf_mmap_data_put(struct perf_mmap_data *data); +static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { - perf_pending_sync(event); + irq_work_sync(&event->pending); if (!event->parent) { - atomic_dec(&nr_events); - if (event->attr.mmap) + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_dec(&perf_task_events); + if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); } - if (event->data) { - perf_mmap_data_put(event->data); - event->data = NULL; + if (event->buffer) { + perf_buffer_put(event->buffer); + event->buffer = NULL; } if (event->destroy) event->destroy(event); - put_ctx(event->ctx); + if (event->ctx) + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); } @@ -2126,13 +2449,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) - events = atomic_xchg(&data->poll, 0); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); rcu_read_unlock(); poll_wait(file, &event->waitq, wait); @@ -2143,7 +2466,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_event_reset(struct perf_event *event) { (void)perf_event_read(event); - atomic64_set(&event->count, 0); + local64_set(&event->count, 0); perf_event_update_userpage(event); } @@ -2186,15 +2509,13 @@ static void perf_event_for_each(struct perf_event *event, static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct perf_event_context *ctx = event->ctx; - unsigned long size; int ret = 0; u64 value; if (!event->attr.sample_period) return -EINVAL; - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) + if (copy_from_user(&value, arg, sizeof(value))) return -EFAULT; if (!value) @@ -2328,6 +2649,9 @@ int perf_event_task_disable(void) static int perf_event_index(struct perf_event *event) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; @@ -2342,14 +2666,14 @@ static int perf_event_index(struct perf_event *event) void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; - userpg = data->user_page; + userpg = buffer->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -2359,9 +2683,9 @@ void perf_event_update_userpage(struct perf_event *event) ++userpg->lock; barrier(); userpg->index = perf_event_index(event); - userpg->offset = atomic64_read(&event->count); + userpg->offset = perf_event_count(event); if (event->state == PERF_EVENT_STATE_ACTIVE) - userpg->offset -= atomic64_read(&event->hw.prev_count); + userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -2376,6 +2700,25 @@ unlock: rcu_read_unlock(); } +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2383,15 +2726,15 @@ unlock: */ static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > data->nr_pages) + if (pgoff > buffer->nr_pages) return NULL; if (pgoff == 0) - return virt_to_page(data->user_page); + return virt_to_page(buffer->user_page); - return virt_to_page(data->data_pages[pgoff - 1]); + return virt_to_page(buffer->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) @@ -2407,42 +2750,44 @@ static void *perf_mmap_alloc_page(int cpu) return page_address(page); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; int i; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - data->user_page = perf_mmap_alloc_page(event->cpu); - if (!data->user_page) + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = perf_mmap_alloc_page(event->cpu); - if (!data->data_pages[i]) + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) goto fail_data_pages; } - data->nr_pages = nr_pages; + buffer->nr_pages = nr_pages; + + perf_buffer_init(buffer, watermark, flags); - return data; + return buffer; fail_data_pages: for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); + free_page((unsigned long)buffer->data_pages[i]); - free_page((unsigned long)data->user_page); + free_page((unsigned long)buffer->user_page); fail_user_page: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2456,17 +2801,17 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { int i; - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - kfree(data); + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); } -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { return 0; } @@ -2479,18 +2824,18 @@ static inline int page_order(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ -static inline int page_order(struct perf_mmap_data *data) +static inline int page_order(struct perf_buffer *buffer) { - return data->page_order; + return buffer->page_order; } static struct page * -perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) { - if (pgoff > (1UL << page_order(data))) + if (pgoff > (1UL << page_order(buffer))) return NULL; - return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) @@ -2500,57 +2845,59 @@ static void perf_mmap_unmark_page(void *addr) page->mapping = NULL; } -static void perf_mmap_data_free_work(struct work_struct *work) +static void perf_buffer_free_work(struct work_struct *work) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; void *base; int i, nr; - data = container_of(work, struct perf_mmap_data, work); - nr = 1 << page_order(data); + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); - base = data->user_page; + base = buffer->user_page; for (i = 0; i < nr + 1; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); - kfree(data); + kfree(buffer); } -static void perf_mmap_data_free(struct perf_mmap_data *data) +static void perf_buffer_free(struct perf_buffer *buffer) { - schedule_work(&data->work); + schedule_work(&buffer->work); } -static struct perf_mmap_data * -perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long size; void *all_buf; - size = sizeof(struct perf_mmap_data); + size = sizeof(struct perf_buffer); size += sizeof(void *); - data = kzalloc(size, GFP_KERNEL); - if (!data) + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) goto fail; - INIT_WORK(&data->work, perf_mmap_data_free_work); + INIT_WORK(&buffer->work, perf_buffer_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; - data->user_page = all_buf; - data->data_pages[0] = all_buf + PAGE_SIZE; - data->page_order = ilog2(nr_pages); - data->nr_pages = 1; + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; + + perf_buffer_init(buffer, watermark, flags); - return data; + return buffer; fail_all_buf: - kfree(data); + kfree(buffer); fail: return NULL; @@ -2558,15 +2905,15 @@ fail: #endif -static unsigned long perf_data_size(struct perf_mmap_data *data) +static unsigned long perf_data_size(struct perf_buffer *buffer) { - return data->nr_pages << (PAGE_SHIFT + page_order(data)); + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; - struct perf_mmap_data *data; + struct perf_buffer *buffer; int ret = VM_FAULT_SIGBUS; if (vmf->flags & FAULT_FLAG_MKWRITE) { @@ -2576,14 +2923,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } rcu_read_lock(); - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto unlock; if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) goto unlock; - vmf->page = perf_mmap_to_page(data, vmf->pgoff); + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); if (!vmf->page) goto unlock; @@ -2598,52 +2945,35 @@ unlock: return ret; } -static void -perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) { - long max_size = perf_data_size(data); + struct perf_buffer *buffer; - if (event->attr.watermark) { - data->watermark = min_t(long, max_size, - event->attr.wakeup_watermark); - } - - if (!data->watermark) - data->watermark = max_size / 2; - - atomic_set(&data->refcount, 1); - rcu_assign_pointer(event->data, data); -} - -static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_data_free(data); + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); } -static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) +static struct perf_buffer *perf_buffer_get(struct perf_event *event) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; rcu_read_lock(); - data = rcu_dereference(event->data); - if (data) { - if (!atomic_inc_not_zero(&data->refcount)) - data = NULL; + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; } rcu_read_unlock(); - return data; + return buffer; } -static void perf_mmap_data_put(struct perf_mmap_data *data) +static void perf_buffer_put(struct perf_buffer *buffer) { - if (!atomic_dec_and_test(&data->refcount)) + if (!atomic_dec_and_test(&buffer->refcount)) return; - call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) @@ -2658,16 +2988,16 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->data); + unsigned long size = perf_data_size(event->buffer); struct user_struct *user = event->mmap_user; - struct perf_mmap_data *data = event->data; + struct perf_buffer *buffer = event->buffer; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->locked_vm -= event->mmap_locked; - rcu_assign_pointer(event->data, NULL); + rcu_assign_pointer(event->buffer, NULL); mutex_unlock(&event->mmap_mutex); - perf_mmap_data_put(data); + perf_buffer_put(buffer); free_uid(user); } } @@ -2685,11 +3015,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; - int ret = 0; + int ret = 0, flags = 0; /* * Don't allow mmap() of inherited per-task counters. This would @@ -2706,7 +3036,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) nr_pages = (vma_size / PAGE_SIZE) - 1; /* - * If we have data pages ensure they're a power-of-two number, so we + * If we have buffer pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ if (nr_pages != 0 && !is_power_of_2(nr_pages)) @@ -2720,9 +3050,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->data) { - if (event->data->nr_pages == nr_pages) - atomic_inc(&event->data->refcount); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); else ret = -EINVAL; goto unlock; @@ -2752,17 +3082,18 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(event->data); + WARN_ON(event->buffer); + + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; - data = perf_mmap_data_alloc(event, nr_pages); - if (!data) { + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { ret = -ENOMEM; goto unlock; } - - perf_mmap_data_init(event, data); - if (vma->vm_flags & VM_WRITE) - event->data->writable = 1; + rcu_assign_pointer(event->buffer, buffer); atomic_long_add(user_extra, &user->locked_vm); event->mmap_locked = extra; @@ -2824,16 +3155,7 @@ void perf_event_wakeup(struct perf_event *event) } } -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_event(struct perf_pending_entry *entry) +static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); @@ -2849,104 +3171,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) } } -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_event_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_event *event) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return event->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_event *event) -{ - wait_event(event->waitq, perf_not_pending(event)); -} - -void perf_event_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -__weak -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) -{ -} - - /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -2971,15 +3195,15 @@ EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); /* * Output */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, unsigned long offset, unsigned long head) { unsigned long mask; - if (!data->writable) + if (!buffer->writable) return true; - mask = perf_data_size(data) - 1; + mask = perf_data_size(buffer) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2992,12 +3216,11 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->data->poll, POLL_IN); + atomic_set(&handle->buffer->poll, POLL_IN); if (handle->nmi) { handle->event->pending_wakeup = 1; - perf_pending_queue(&handle->event->pending, - perf_pending_event); + irq_work_queue(&handle->event->pending); } else perf_event_wakeup(handle->event); } @@ -3012,48 +3235,48 @@ static void perf_output_wakeup(struct perf_output_handle *handle) */ static void perf_output_get_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; preempt_disable(); - local_inc(&data->nest); - handle->wakeup = local_read(&data->wakeup); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; unsigned long head; again: - head = local_read(&data->head); + head = local_read(&buffer->head); /* * IRQ/NMI can happen here, which means we can miss a head update. */ - if (!local_dec_and_test(&data->nest)) + if (!local_dec_and_test(&buffer->nest)) goto out; /* * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the data->head read and this + * by atomic_dec_and_test() order the buffer->head read and this * write. */ - data->user_page->data_head = head; + buffer->user_page->data_head = head; /* * Now check if we missed an update, rely on the (compiler) - * barrier in atomic_dec_and_test() to re-read data->head. + * barrier in atomic_dec_and_test() to re-read buffer->head. */ - if (unlikely(head != local_read(&data->head))) { - local_inc(&data->nest); + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); goto again; } - if (handle->wakeup != local_read(&data->wakeup)) + if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); - out: +out: preempt_enable(); } @@ -3070,12 +3293,12 @@ __always_inline void perf_output_copy(struct perf_output_handle *handle, buf += size; handle->size -= size; if (!handle->size) { - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; handle->page++; - handle->page &= data->nr_pages - 1; - handle->addr = data->data_pages[handle->page]; - handle->size = PAGE_SIZE << page_order(data); + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); } } while (len); } @@ -3084,7 +3307,7 @@ int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_mmap_data *data; + struct perf_buffer *buffer; unsigned long tail, offset, head; int have_lost; struct { @@ -3100,19 +3323,19 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - data = rcu_dereference(event->data); - if (!data) + buffer = rcu_dereference(event->buffer); + if (!buffer) goto out; - handle->data = data; + handle->buffer = buffer; handle->event = event; handle->nmi = nmi; handle->sample = sample; - if (!data->nr_pages) + if (!buffer->nr_pages) goto out; - have_lost = local_read(&data->lost); + have_lost = local_read(&buffer->lost); if (have_lost) size += sizeof(lost_event); @@ -3124,30 +3347,30 @@ int perf_output_begin(struct perf_output_handle *handle, * tail pointer. So that all reads will be completed before the * write is issued. */ - tail = ACCESS_ONCE(data->user_page->data_tail); + tail = ACCESS_ONCE(buffer->user_page->data_tail); smp_rmb(); - offset = head = local_read(&data->head); + offset = head = local_read(&buffer->head); head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) + if (unlikely(!perf_output_space(buffer, tail, offset, head))) goto fail; - } while (local_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&buffer->head, offset, head) != offset); - if (head - local_read(&data->wakeup) > data->watermark) - local_add(data->watermark, &data->wakeup); + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); - handle->page = offset >> (PAGE_SHIFT + page_order(data)); - handle->page &= data->nr_pages - 1; - handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); - handle->addr = data->data_pages[handle->page]; + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; handle->addr += handle->size; - handle->size = (PAGE_SIZE << page_order(data)) - handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = local_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&buffer->lost, 0); perf_output_put(handle, lost_event); } @@ -3155,7 +3378,7 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - local_inc(&data->lost); + local_inc(&buffer->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3166,15 +3389,15 @@ out: void perf_output_end(struct perf_output_handle *handle) { struct perf_event *event = handle->event; - struct perf_mmap_data *data = handle->data; + struct perf_buffer *buffer = handle->buffer; int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = local_inc_return(&data->events); + int events = local_inc_return(&buffer->events); if (events >= wakeup_events) { - local_sub(wakeup_events, &data->events); - local_inc(&data->wakeup); + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); } } @@ -3211,7 +3434,7 @@ static void perf_output_read_one(struct perf_output_handle *handle, u64 values[4]; int n = 0; - values[n++] = atomic64_read(&event->count); + values[n++] = perf_event_count(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { values[n++] = event->total_time_enabled + atomic64_read(&event->child_total_time_enabled); @@ -3248,7 +3471,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (leader != event) leader->pmu->read(leader); - values[n++] = atomic64_read(&leader->count); + values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(leader); @@ -3260,7 +3483,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (sub != event) sub->pmu->read(sub); - values[n++] = atomic64_read(&sub->count); + values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) values[n++] = primary_event_id(sub); @@ -3441,14 +3664,20 @@ static void perf_event_output(struct perf_event *event, int nmi, struct perf_output_handle handle; struct perf_event_header header; + /* protect the callchain buffers */ + rcu_read_lock(); + perf_prepare_sample(&header, data, event, regs); if (perf_output_begin(&handle, event, header.size, nmi, 1)) - return; + goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); + +exit: + rcu_read_unlock(); } /* @@ -3491,7 +3720,7 @@ perf_event_read_event(struct perf_event *event, /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task */ struct perf_task_event { @@ -3541,7 +3770,8 @@ static int perf_event_task_match(struct perf_event *event) if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.comm || event->attr.mmap || event->attr.task) + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) return 1; return 0; @@ -3561,16 +3791,27 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx = task_event->task_ctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_task_ctx(&cpuctx->ctx, task_event); - if (!ctx) - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_task_ctx(ctx, task_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); } @@ -3675,8 +3916,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - unsigned int size; char comm[TASK_COMM_LEN]; + unsigned int size; + struct pmu *pmu; + int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -3688,21 +3931,36 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_comm_ctx(&cpuctx->ctx, comm_event); - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); } void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; - if (task->perf_event_ctxp) - perf_event_enable_on_exec(task); + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } if (!atomic_read(&nr_comm_events)) return; @@ -3766,7 +4024,8 @@ static void perf_event_mmap_output(struct perf_event *event, } static int perf_event_mmap_match(struct perf_event *event, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; @@ -3774,19 +4033,21 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->cpu != -1 && event->cpu != smp_processor_id()) return 0; - if (event->attr.mmap) + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) return 1; return 0; } static void perf_event_mmap_ctx(struct perf_event_context *ctx, - struct perf_mmap_event *mmap_event) + struct perf_mmap_event *mmap_event, + int executable) { struct perf_event *event; list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (perf_event_mmap_match(event, mmap_event)) + if (perf_event_mmap_match(event, mmap_event, executable)) perf_event_mmap_output(event, mmap_event); } } @@ -3801,6 +4062,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) char tmp[16]; char *buf = NULL; const char *name; + struct pmu *pmu; + int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -3830,6 +4093,14 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) if (!vma->vm_mm) { name = strncpy(tmp, "[vdso]", sizeof(tmp)); goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; } name = strncpy(tmp, "//anon", sizeof(tmp)); @@ -3845,18 +4116,29 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_mmap_ctx(ctx, mmap_event); - put_cpu_var(perf_cpu_context); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } rcu_read_unlock(); kfree(buf); } -void __perf_event_mmap(struct vm_area_struct *vma) +void perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -3932,8 +4214,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; - throttle = (throttle && event->pmu->unthrottle != NULL); - if (!throttle) { hwc->interrupts++; } else { @@ -3976,8 +4256,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, event->pending_kill = POLL_HUP; if (nmi) { event->pending_disable = 1; - perf_pending_queue(&event->pending, - perf_pending_event); + irq_work_queue(&event->pending); } else perf_event_disable(event); } @@ -4001,6 +4280,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, * Generic software event infrastructure */ +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. This period event @@ -4018,14 +4308,14 @@ static u64 perf_swevent_set_period(struct perf_event *event) hwc->last_period = hwc->sample_period; again: - old = val = atomic64_read(&hwc->period_left); + old = val = local64_read(&hwc->period_left); if (val < 0) return 0; nr = div64_u64(period + val, period); offset = nr * period; val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + if (local64_cmpxchg(&hwc->period_left, old, val) != old) goto again; return nr; @@ -4058,13 +4348,13 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_add(struct perf_event *event, u64 nr, +static void perf_swevent_event(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &event->count); + local64_add(nr, &event->count); if (!regs) return; @@ -4075,7 +4365,7 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, nmi, data, regs); - if (atomic64_add_negative(nr, &hwc->period_left)) + if (local64_add_negative(nr, &hwc->period_left)) return; perf_swevent_overflow(event, 0, nmi, data, regs); @@ -4084,6 +4374,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; @@ -4130,11 +4423,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) /* For the read side: events when they trigger */ static inline struct hlist_head * -find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; - hlist = rcu_dereference(ctx->swevent_hlist); + hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; @@ -4143,7 +4436,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * -find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; @@ -4154,7 +4447,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ - hlist = rcu_dereference_protected(ctx->swevent_hlist, + hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; @@ -4167,23 +4460,19 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; struct hlist_node *node; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - - head = find_swevent_head_rcu(cpuctx, type, event_id); - + head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_add(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, nmi, data, regs); } end: rcu_read_unlock(); @@ -4191,36 +4480,18 @@ end: int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (cpuctx->recursion[rctx]) - return -1; - - cpuctx->recursion[rctx]++; - barrier(); + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - return rctx; + return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void perf_swevent_put_recursion_context(int rctx) +void inline perf_swevent_put_recursion_context(int rctx) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - barrier(); - cpuctx->recursion[rctx]--; -} -EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); + put_recursion_context(swhash->recursion, rctx); +} void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) @@ -4245,20 +4516,20 @@ static void perf_swevent_read(struct perf_event *event) { } -static int perf_swevent_enable(struct perf_event *event) +static int perf_swevent_add(struct perf_event *event, int flags) { + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct hw_perf_event *hwc = &event->hw; - struct perf_cpu_context *cpuctx; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); } - head = find_swevent_head(cpuctx, event); + hwc->state = !(flags & PERF_EF_START); + + head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; @@ -4267,202 +4538,27 @@ static int perf_swevent_enable(struct perf_event *event) return 0; } -static void perf_swevent_disable(struct perf_event *event) +static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } -static void perf_swevent_void(struct perf_event *event) -{ -} - -static int perf_swevent_int(struct perf_event *event) -{ - return 0; -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swevent_enable, - .disable = perf_swevent_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, - .read = perf_swevent_read, - .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ -}; - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period; - - if (hwc->remaining) { - if (hwc->remaining < 0) - period = 10000; - else - period = hwc->remaining; - hwc->remaining = 0; - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->sample_period) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - hwc->remaining = ktime_to_ns(remaining); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_perf_event_update(struct perf_event *event) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_xchg(&event->hw.prev_count, now); - atomic64_add(now - prev, &event->count); -} - -static int cpu_clock_perf_event_enable(struct perf_event *event) +static void perf_swevent_start(struct perf_event *event, int flags) { - struct hw_perf_event *hwc = &event->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - perf_swevent_start_hrtimer(event); - - return 0; + event->hw.state = 0; } -static void cpu_clock_perf_event_disable(struct perf_event *event) +static void perf_swevent_stop(struct perf_event *event, int flags) { - perf_swevent_cancel_hrtimer(event); - cpu_clock_perf_event_update(event); + event->hw.state = PERF_HES_STOPPED; } -static void cpu_clock_perf_event_read(struct perf_event *event) -{ - cpu_clock_perf_event_update(event); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_event_enable, - .disable = cpu_clock_perf_event_disable, - .read = cpu_clock_perf_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_perf_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&event->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &event->count); -} - -static int task_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 now; - - now = event->ctx->time; - - atomic64_set(&hwc->prev_count, now); - - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void task_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_perf_event_update(event, event->ctx->time); - -} - -static void task_clock_perf_event_read(struct perf_event *event) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } - - task_clock_perf_event_update(event, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_event_enable, - .disable = task_clock_perf_event_disable, - .read = task_clock_perf_event_read, -}; - /* Deref the hlist from the update side */ static inline struct swevent_hlist * -swevent_hlist_deref(struct perf_cpu_context *cpuctx) +swevent_hlist_deref(struct swevent_htable *swhash) { - return rcu_dereference_protected(cpuctx->swevent_hlist, - lockdep_is_held(&cpuctx->hlist_mutex)); + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) @@ -4473,27 +4569,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) kfree(hlist); } -static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +static void swevent_hlist_release(struct swevent_htable *swhash) { - struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; - rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + rcu_assign_pointer(swhash->swevent_hlist, NULL); call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!--cpuctx->hlist_refcount) - swevent_hlist_release(cpuctx); + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(struct perf_event *event) @@ -4511,12 +4607,12 @@ static void swevent_hlist_put(struct perf_event *event) static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -4524,11 +4620,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) err = -ENOMEM; goto exit; } - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - cpuctx->hlist_refcount++; - exit: - mutex_unlock(&cpuctx->hlist_mutex); + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); return err; } @@ -4552,7 +4648,7 @@ static int swevent_hlist_get(struct perf_event *event) put_online_cpus(); return 0; - fail: +fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; @@ -4563,17 +4659,64 @@ static int swevent_hlist_get(struct perf_event *event) return err; } -#ifdef CONFIG_EVENT_TRACING +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + jump_label_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id > PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + jump_label_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, -static const struct pmu perf_ops_tracepoint = { - .enable = perf_trace_enable, - .disable = perf_trace_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, .read = perf_swevent_read, - .unthrottle = perf_swevent_void, }; +#ifdef CONFIG_EVENT_TRACING + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -4601,7 +4744,7 @@ static int perf_tp_event_match(struct perf_event *event, } void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head) + struct pt_regs *regs, struct hlist_head *head, int rctx) { struct perf_sample_data data; struct perf_event *event; @@ -4615,12 +4758,12 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, perf_sample_data_init(&data, addr); data.raw = &raw; - rcu_read_lock(); hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_add(event, count, 1, &data, regs); + perf_swevent_event(event, count, 1, &data, regs); } - rcu_read_unlock(); + + perf_swevent_put_recursion_context(rctx); } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -4629,10 +4772,13 @@ static void tp_perf_event_destroy(struct perf_event *event) perf_trace_destroy(event); } -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static int perf_tp_event_init(struct perf_event *event) { int err; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4640,15 +4786,31 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) if ((event->attr.sample_type & PERF_SAMPLE_RAW) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); + return -EPERM; err = perf_trace_init(event); if (err) - return NULL; + return err; event->destroy = tp_perf_event_destroy; - return &perf_ops_tracepoint; + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4676,9 +4838,8 @@ static void perf_event_free_filter(struct perf_event *event) #else -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static inline void perf_tp_register(void) { - return NULL; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4693,105 +4854,389 @@ static void perf_event_free_filter(struct perf_event *event) #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT -static void bp_perf_event_destroy(struct perf_event *event) +void perf_bp_event(struct perf_event *bp, void *data) +{ + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr); + + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, 1, &sample, regs); +} +#endif + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { - release_bp_slot(event); + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; } -static const struct pmu *bp_perf_event_init(struct perf_event *bp) +static void perf_swevent_start_hrtimer(struct perf_event *event) { - int err; + struct hw_perf_event *hwc = &event->hw; - err = register_perf_hw_breakpoint(bp); - if (err) - return ERR_PTR(err); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + s64 period = local64_read(&hwc->period_left); - bp->destroy = bp_perf_event_destroy; + if (period) { + if (period < 0) + period = 10000; - return &perf_ops_bp; + local64_set(&hwc->period_left, 0); + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL_PINNED, 0); + } } -void perf_bp_event(struct perf_event *bp, void *data) +static void perf_swevent_cancel_hrtimer(struct perf_event *event) { - struct perf_sample_data sample; - struct pt_regs *regs = data; + struct hw_perf_event *hwc = &event->hw; - perf_sample_data_init(&sample, bp->attr.bp_addr); + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); - if (!perf_exclude_event(bp, regs)) - perf_swevent_add(bp, 1, 1, &sample, regs); + hrtimer_cancel(&hwc->hrtimer); + } } -#else -static const struct pmu *bp_perf_event_init(struct perf_event *bp) + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) { - return NULL; + s64 prev; + u64 now; + + now = local_clock(); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } -void perf_bp_event(struct perf_event *bp, void *regs) +static void cpu_clock_event_start(struct perf_event *event, int flags) { + local64_set(&event->hw.prev_count, local_clock()); + perf_swevent_start_hrtimer(event); } -#endif -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +static void cpu_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} -static void sw_perf_event_destroy(struct perf_event *event) +static int cpu_clock_event_add(struct perf_event *event, int flags) { - u64 event_id = event->attr.config; + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); - WARN_ON(event->parent); + return 0; +} - atomic_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); } -static const struct pmu *sw_perf_event_init(struct perf_event *event) +static void cpu_clock_event_read(struct perf_event *event) { - const struct pmu *pmu = NULL; - u64 event_id = event->attr.config; + cpu_clock_event_update(event); +} + +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + return 0; +} + +static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = cpu_clock_event_init, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} + +static void task_clock_event_start(struct perf_event *event, int flags) +{ + local64_set(&event->hw.prev_count, event->ctx->time); + perf_swevent_start_hrtimer(event); +} + +static void task_clock_event_stop(struct perf_event *event, int flags) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + return 0; +} + +static struct pmu perf_task_clock = { + .task_ctx_nr = perf_sw_context, + + .event_init = task_clock_event_init, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, + .read = task_clock_event_read, +}; + +static void perf_pmu_nop_void(struct pmu *pmu) +{ +} + +static int perf_pmu_nop_int(struct pmu *pmu) +{ + return 0; +} + +static void perf_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); +} + +static int perf_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} +static void perf_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. + */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void free_pmu_context(void * __percpu cpu_context) +{ + struct pmu *pmu; + + mutex_lock(&pmus_lock); /* - * Software events (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. + * Like a real lame refcount. */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->pmu_cpu_context == cpu_context) + goto out; + } - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu event, - * use the cpu_clock event instead. - */ - if (event->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; + free_percpu(cpu_context); +out: + mutex_unlock(&pmus_lock); +} - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - case PERF_COUNT_SW_ALIGNMENT_FAULTS: - case PERF_COUNT_SW_EMULATION_FAULTS: - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return ERR_PTR(err); +int perf_pmu_register(struct pmu *pmu) +{ + int cpu, ret; + + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; + + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; - atomic_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_pdc; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + cpuctx->ctx.type = cpu_context; + cpuctx->ctx.pmu = pmu; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); + } + +got_cpu_context: + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. + */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + + list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: + mutex_unlock(&pmus_lock); + + return ret; + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + /* + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. + */ + synchronize_srcu(&pmus_srcu); + synchronize_rcu(); + + free_percpu(pmu->pmu_disable_count); + free_pmu_context(pmu->pmu_cpu_context); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + int ret = pmu->event_init(event); + if (!ret) + goto unlock; + + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; } - pmu = &perf_ops_generic; - break; } + pmu = ERR_PTR(-ENOENT); +unlock: + srcu_read_unlock(&pmus_srcu, idx); return pmu; } @@ -4800,20 +5245,18 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event) * Allocate and initialize a event structure */ static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, - int cpu, - struct perf_event_context *ctx, - struct perf_event *group_leader, - struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler, - gfp_t gfpflags) -{ - const struct pmu *pmu; +perf_event_alloc(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + struct perf_event *group_leader, + struct perf_event *parent_event, + perf_overflow_handler_t overflow_handler) +{ + struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err; - event = kzalloc(sizeof(*event), gfpflags); + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -4831,6 +5274,7 @@ perf_event_alloc(struct perf_event_attr *attr, INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); mutex_init(&event->mmap_mutex); @@ -4838,7 +5282,6 @@ perf_event_alloc(struct perf_event_attr *attr, event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; - event->ctx = ctx; event->oncpu = -1; event->parent = parent_event; @@ -4848,6 +5291,17 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. + */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; @@ -4864,7 +5318,7 @@ perf_event_alloc(struct perf_event_attr *attr, hwc->sample_period = 1; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_FORMAT_GROUP on inherited events @@ -4872,29 +5326,8 @@ perf_event_alloc(struct perf_event_attr *attr, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_event_init(event); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_event_init(event); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_event_init(event); - break; - - case PERF_TYPE_BREAKPOINT: - pmu = bp_perf_event_init(event); - break; - + pmu = perf_init_event(event); - default: - break; - } done: err = 0; if (!pmu) @@ -4912,13 +5345,21 @@ done: event->pmu = pmu; if (!event->parent) { - atomic_inc(&nr_events); - if (event->attr.mmap) + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_task_events); + if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } } return event; @@ -5007,7 +5448,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_mmap_data *data = NULL, *old_data = NULL; + struct perf_buffer *buffer = NULL, *old_buffer = NULL; int ret = -EINVAL; if (!output_event) @@ -5037,19 +5478,19 @@ set: if (output_event) { /* get the buffer we want to redirect to */ - data = perf_mmap_data_get(output_event); - if (!data) + buffer = perf_buffer_get(output_event); + if (!buffer) goto unlock; } - old_data = event->data; - rcu_assign_pointer(event->data, data); + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_data) - perf_mmap_data_put(old_data); + if (old_buffer) + perf_buffer_put(old_buffer); out: return ret; } @@ -5066,12 +5507,16 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader = NULL, *output_event = NULL; + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + struct task_struct *task = NULL; + struct pmu *pmu; int event_fd; + int move_group = 0; int fput_needed = 0; int err; @@ -5097,20 +5542,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_fd; - } - if (group_fd != -1) { group_leader = perf_fget_light(group_fd, &fput_needed); if (IS_ERR(group_leader)) { err = PTR_ERR(group_leader); - goto err_put_context; + goto err_fd; } group_file = group_leader->filp; if (flags & PERF_FLAG_FD_OUTPUT) @@ -5119,6 +5555,58 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + if (pid != -1) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_task; + } + + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; + } + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_alloc; + } + /* * Look up the group leader (we will attach this event to it): */ @@ -5130,42 +5618,66 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_put_context; + goto err_context; /* * Do not allow to attach to a group in a different * task or CPU context: */ - if (group_leader->ctx != ctx) - goto err_put_context; + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, NULL, GFP_KERNEL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_put_context; + goto err_context; } if (output_event) { err = perf_event_set_output(event, output_event); if (err) - goto err_free_put_context; + goto err_context; } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); - goto err_free_put_context; + goto err_context; + } + + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_event_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_event_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); } event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + perf_install_in_context(ctx, event, cpu); ++ctx->generation; mutex_unlock(&ctx->mutex); @@ -5186,11 +5698,15 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; -err_free_put_context: +err_context: + put_ctx(ctx); +err_alloc: free_event(event); -err_put_context: +err_task: + if (task) + put_task_struct(task); +err_group_fd: fput_light(group_file, fput_needed); - put_ctx(ctx); err_fd: put_unused_fd(event_fd); return err; @@ -5201,32 +5717,31 @@ err_fd: * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound - * @pid: task to profile + * @task: task to profile (NULL for percpu) */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, + struct task_struct *task, perf_overflow_handler_t overflow_handler) { - struct perf_event *event; struct perf_event_context *ctx; + struct perf_event *event; int err; /* * Get the target context (task or percpu): */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_exit; - } - - event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, overflow_handler, GFP_KERNEL); + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_put_context; + goto err; + } + + ctx = find_get_context(event->pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_free; } event->filp = NULL; @@ -5244,112 +5759,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; - err_put_context: - put_ctx(ctx); - err_exit: +err_free: + free_event(event); +err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, child_ctx, - group_leader, parent_event, - NULL, GFP_KERNEL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. - */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - atomic64_set(&hwc->period_left, sample_period); - } - - child_event->overflow_handler = parent_event->overflow_handler; - - /* - * Link it up in the child's context: - */ - add_event_to_ctx(child_event, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { @@ -5359,12 +5775,12 @@ static void sync_child_event(struct perf_event *child_event, if (child_event->attr.inherit_stat) perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_event->count); + child_val = perf_event_count(child_event); /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_event->count); + atomic64_add(child_val, &parent_event->child_count); atomic64_add(child_event->total_time_enabled, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, @@ -5406,16 +5822,13 @@ __perf_event_exit_task(struct perf_event *child_event, } } -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *tmp; struct perf_event_context *child_ctx; unsigned long flags; - if (likely(!child->perf_event_ctxp)) { + if (likely(!child->perf_event_ctxp[ctxn])) { perf_event_task(child, NULL, 0); return; } @@ -5427,8 +5840,8 @@ void perf_event_exit_task(struct task_struct *child) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp; - __perf_event_task_sched_out(child_ctx); + child_ctx = child->perf_event_ctxp[ctxn]; + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Take the context lock here so that if find_get_context is @@ -5436,7 +5849,7 @@ void perf_event_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ raw_spin_lock(&child_ctx->lock); - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -5489,6 +5902,17 @@ again: put_ctx(child_ctx); } +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { @@ -5510,48 +5934,166 @@ static void perf_free_event(struct perf_event *event, /* * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. + * perf_event_init_task below, used by fork() in case of fail. */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx; struct perf_event *event, *tmp; + int ctxn; - if (!ctx) - return; + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; - mutex_lock(&ctx->mutex); + mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - mutex_unlock(&ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(ctx); + put_ctx(ctx); + } +} + +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + unsigned long flags; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + child, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Link it up in the child's context: + */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); + add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; } static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, + struct task_struct *child, int ctxn, int *inherited_all) { int ret; - struct perf_event_context *child_ctx = child->perf_event_ctxp; + struct perf_event_context *child_ctx; if (!event->attr.inherit) { *inherited_all = 0; return 0; } + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -5560,14 +6102,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); + child_ctx = alloc_perf_context(event->pmu, child); if (!child_ctx) return -ENOMEM; - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); + child->perf_event_ctxp[ctxn] = child_ctx; } ret = inherit_group(event, parent, parent_ctx, @@ -5579,11 +6118,10 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return ret; } - /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -5592,19 +6130,19 @@ int perf_event_init_task(struct task_struct *child) int inherited_all = 1; int ret = 0; - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp)) + if (likely(!parent->perf_event_ctxp[ctxn])) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ - parent_ctx = perf_pin_task_context(parent); + parent_ctx = perf_pin_task_context(parent, ctxn); /* * No need to check if parent_ctx != NULL here; since we saw @@ -5624,20 +6162,20 @@ int perf_event_init_task(struct task_struct *child) * the list, not manipulating it: */ list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } - child_ctx = child->perf_event_ctxp; + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { /* @@ -5666,63 +6204,98 @@ int perf_event_init_task(struct task_struct *child) return ret; } +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + static void __init perf_event_init_all_cpus(void) { + struct swevent_htable *swhash; int cpu; - struct perf_cpu_context *cpuctx; for_each_possible_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - mutex_init(&cpuctx->hlist_mutex); - __perf_event_init_context(&cpuctx->ctx, NULL); + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); } } static void __cpuinit perf_event_init_cpu(int cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); - if (cpuctx->hlist_refcount > 0) { + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - WARN_ON_ONCE(!hlist); - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_event_exit_cpu(void *info) +static void perf_pmu_rotate_stop(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + +static void __perf_event_exit_context(void *__info) +{ + struct perf_event_context *ctx = __info; struct perf_event *event, *tmp; + perf_pmu_rotate_stop(ctx->pmu); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) __perf_event_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); +} + static void perf_event_exit_cpu(int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_event_context *ctx = &cpuctx->ctx; + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); - swevent_hlist_release(cpuctx); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); + perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -5733,15 +6306,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; + case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: perf_event_exit_cpu(cpu); break; @@ -5752,118 +6325,13 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - void __init perf_event_init(void) { perf_event_init_all_cpus(); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_events) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - raw_spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_events - cpuctx->ctx.nr_events, - perf_max_events - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - raw_spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_events", -}; - -static int __init perf_event_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent); + perf_pmu_register(&perf_cpu_clock); + perf_pmu_register(&perf_task_clock); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); } -device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index e9fd8c1..39b65b6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -122,6 +122,43 @@ static void free_pidmap(struct upid *upid) atomic_inc(&map->nr_free); } +/* + * If we started walking pids at 'base', is 'a' seen before 'b'? + */ +static int pid_before(int base, int a, int b) +{ + /* + * This is the same as saying + * + * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT + * and that mapping orders 'a' and 'b' with respect to 'base'. + */ + return (unsigned)(a - base) < (unsigned)(b - base); +} + +/* + * We might be racing with someone else trying to set pid_ns->last_pid. + * We want the winner to have the "later" value, because if the + * "earlier" value prevails, then a pid may get reused immediately. + * + * Since pids rollover, it is not sufficient to just pick the bigger + * value. We have to consider where we started counting from. + * + * 'base' is the value of pid_ns->last_pid that we observed when + * we started looking for a pid. + * + * 'pid' is the pid that we eventually found. + */ +static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) +{ + int prev; + int last_write = base; + do { + prev = last_write; + last_write = cmpxchg(&pid_ns->last_pid, prev, pid); + } while ((prev != last_write) && (pid_before(base, last_write, pid))); +} + static int alloc_pidmap(struct pid_namespace *pid_ns) { int i, offset, max_scan, pid, last = pid_ns->last_pid; @@ -132,7 +169,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; - max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; + /* + * If last_pid points into the middle of the map->page we + * want to scan this bitmap block twice, the second time + * we start with offset == 0 (or RESERVED_PIDS). + */ + max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); @@ -154,20 +196,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - pid_ns->last_pid = pid; + set_last_pid(pid_ns, last, pid); return pid; } offset = find_next_offset(map, offset); pid = mk_pid(pid_ns, map, offset); - /* - * find_next_offset() found a bit, the pid from it - * is in-bounds, and if we fell back to the last - * bitmap block and the final block was the same - * as the starting point, pid is before last_pid. - */ - } while (offset < BITS_PER_PAGE && pid < pid_max && - (i != max_scan || pid < last || - !((last+1) & BITS_PER_PAGE_MASK))); + } while (offset < BITS_PER_PAGE && pid < pid_max); } if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; @@ -367,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) @@ -382,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index f42d3f7..645e541 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -48,59 +48,49 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_request_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - int pm_qos_class; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct pm_qos_request_list requests; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; + default: + /* runtime check for not using enum */ + BUG(); + } +} -static void update_target(int pm_qos_class) +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct pm_qos_request_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[pm_qos_class]->default_value; - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requests.list, list) { - extreme_value = pm_qos_array[pm_qos_class]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != - extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[pm_qos_class]->target_value, - extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, - atomic_read(&pm_qos_array[pm_qos_class]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain( - pm_qos_array[pm_qos_class]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -196,42 +193,53 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + /** * pm_qos_add_request - inserts new qos request into the list - * @pm_qos_class: identifies which list of qos request to us + * @dep: pointer to a preallocated handle + * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters, and returns the pm_qos_request list - * element as a handle for use in updating and removal. Call needs to save - * this handle for later use. + * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * handle. Caller needs to save this handle for later use in updates and + * removal. */ -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) + +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct pm_qos_request_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->pm_qos_class = pm_qos_class; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requests.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - - return dep; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -246,27 +254,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * Attempts are made to make this code callable on hot code paths. */ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, - s32 new_value) + s32 new_value) { - unsigned long flags; - int pending_update = 0; s32 temp; + struct pm_qos_object *o; + + if (!pm_qos_req) /*guard against callers passing in null */ + return; - if (pm_qos_req) { /*guard against callers passing in null */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->value) { - pending_update = 1; - pm_qos_req->value = temp; - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_req->pm_qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } EXPORT_SYMBOL_GPL(pm_qos_update_request); @@ -280,19 +289,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request); */ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - int qos_class; + struct pm_qos_object *o; if (pm_qos_req == NULL) return; /* silent return to keep pcm code cleaner */ - qos_class = pm_qos_req->pm_qos_class; - spin_lock_irqsave(&pm_qos_lock, flags); - list_del(&pm_qos_req->list); - kfree(pm_qos_req); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -340,8 +350,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *) pm_qos_add_request(pm_qos_class, - PM_QOS_DEFAULT_VALUE); + struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; if (filp->private_data) return 0; @@ -353,8 +367,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) { struct pm_qos_request_list *req; - req = (struct pm_qos_request_list *)filp->private_data; + req = filp->private_data; pm_qos_remove_request(req); + kfree(req); return 0; } @@ -374,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else if (count == 11) { /* len('0x12345678/0') */ if (copy_from_user(ascii_value, buf, 11)) return -EFAULT; + if (strlen(ascii_value) != 10) + return -EINVAL; x = sscanf(ascii_value, "%x", &value); if (x != 1) return -EINVAL; - pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); + pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); } else return -EINVAL; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646..6842eeb 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -16,13 +16,13 @@ * siglock protection since other code may update expiration cache as * well. */ -void update_rlimit_cpu(unsigned long rlim_new) +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); } static int check_clock(const clockid_t which_clock) @@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - struct sighand_struct *sighand; - struct signal_struct *sig; + struct signal_struct *sig = tsk->signal; struct task_struct *t; - *times = INIT_CPUTIME; + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (!sighand) + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) goto out; - sig = tsk->signal; - t = tsk; do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += t->se.sum_exec_runtime; - - t = next_thread(t); - } while (t != tsk); - - times->utime = cputime_add(times->utime, sig->utime); - times->stime = cputime_add(times->stime, sig->stime); - times->sum_exec_runtime += sig->sum_sched_runtime; + } while_each_thread(tsk, t); out: rcu_read_unlock(); } @@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - /* tsk == current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { .utime = tsk->utime, @@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - thread_group_cputimer(tsk, &group_sample); + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (!fastpath_timer_check(tsk)) return; - spin_lock(&tsk->sighand->siglock); + if (!lock_task_sighand(tsk, &flags)) + return; /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index ad72342..9ca4973 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -560,11 +560,6 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - if (copy_to_user(created_timer_id, - &new_timer_id, sizeof (new_timer_id))) { - error = -EFAULT; - goto out; - } if (timer_event_spec) { if (copy_from_user(&event, timer_event_spec, sizeof (event))) { error = -EFAULT; @@ -590,6 +585,12 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); if (error) goto out; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca6066a..29bff61 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -86,6 +86,7 @@ config PM_SLEEP_SMP depends on SMP depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE depends on PM_SLEEP + select HOTPLUG select HOTPLUG_CPU default y @@ -137,6 +138,8 @@ config SUSPEND_FREEZER config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE + select LZO_COMPRESS + select LZO_DECOMPRESS select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually @@ -242,3 +245,17 @@ config PM_OPS bool depends on PM_SLEEP || PM_RUNTIME default y + +config PM_OPP + bool "Operating Performance Point (OPP) Layer library" + depends on PM + ---help--- + SOCs have a standard set of tuples consisting of frequency and + voltage pairs that the device will support per voltage domain. This + is called Operating Performance Point or OPP. The actual definitions + of OPP varies over silicon within the same family of devices. + + OPP layer organizes the data internally using device pointers + representing individual voltage domains and provides SOC + implementations a ready to use framework to manage OPPs. + For more information, read <file:Documentation/power/opp.txt> diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c index 97024fd..83bbc7c 100644 --- a/kernel/power/block_io.c +++ b/kernel/power/block_io.c @@ -28,7 +28,7 @@ static int submit(int rw, struct block_device *bdev, sector_t sector, struct page *page, struct bio **bio_chain) { - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG; struct bio *bio; bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa9e916..657272e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab - * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> + * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. * * This file is released under the GPLv2. @@ -29,6 +29,7 @@ #include "power.h" +static int nocompress = 0; static int noresume = 0; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; @@ -277,7 +278,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -288,8 +289,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -328,7 +331,7 @@ int hibernation_snapshot(int platform_mode) error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -511,18 +514,24 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; @@ -630,6 +639,8 @@ int hibernate(void) if (hibernation_mode == HIBERNATION_PLATFORM) flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -697,7 +708,7 @@ static int software_resume(void) goto Unlock; } - pr_debug("PM: Checking image partition %s\n", resume_file); + pr_debug("PM: Checking hibernation image partition %s\n", resume_file); /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); @@ -722,10 +733,10 @@ static int software_resume(void) } Check_image: - pr_debug("PM: Resume from partition %d:%d\n", + pr_debug("PM: Hibernation image partition %d:%d present\n", MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - pr_debug("PM: Checking hibernation image.\n"); + pr_debug("PM: Looking for hibernation image.\n"); error = swsusp_check(); if (error) goto Unlock; @@ -757,14 +768,14 @@ static int software_resume(void) goto Done; } - pr_debug("PM: Reading hibernation image.\n"); + pr_debug("PM: Loading hibernation image.\n"); error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) hibernation_restore(flags & SF_PLATFORM_MODE); - printk(KERN_ERR "PM: Restore failed, recovering.\n"); + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); swsusp_free(); thaw_processes(); Done: @@ -777,7 +788,7 @@ static int software_resume(void) /* For success case, the suspend path will release the lock */ Unlock: mutex_unlock(&pm_mutex); - pr_debug("PM: Resume from disk failed.\n"); + pr_debug("PM: Hibernation image not present or could not be loaded.\n"); return error; close_finish: swsusp_close(FMODE_READ); @@ -996,6 +1007,15 @@ static int __init resume_offset_setup(char *str) return 1; } +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) + noresume = 1; + else if (!strncmp(str, "nocompress", 10)) + nocompress = 1; + return 1; +} + static int __init noresume_setup(char *str) { noresume = 1; @@ -1005,3 +1025,4 @@ static int __init noresume_setup(char *str) __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b..7b5db6a 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned int val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int val; + + if (sscanf(buf, "%u", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -227,15 +281,34 @@ pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, } power_attr(pm_trace); + +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +static ssize_t +pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} + +power_attr(pm_trace_dev_match); + #endif /* CONFIG_PM_TRACE */ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, + &wakeup_count_attr.attr, #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif @@ -253,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { - pm_wq = create_freezeable_workqueue("pm"); + pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0); return pm_wq ? 0 : -ENOMEM; } @@ -266,6 +339,7 @@ static int __init pm_init(void) int error = pm_start_workqueue(); if (error) return error; + hibernate_image_size_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; diff --git a/kernel/power/power.h b/kernel/power/power.h index 006270f..03634be 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -14,6 +14,9 @@ struct swsusp_info { } __attribute__((aligned(PAGE_SIZE))); #ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_image_size_init(void); + #ifdef CONFIG_ARCH_HIBERNATION_HEADER /* Maximum size of architecture specific data in a hibernation header */ #define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) @@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info) extern int hibernation_snapshot(int platform_mode); extern int hibernation_restore(int platform_mode); extern int hibernation_platform_enter(void); -#endif + +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ extern int pfn_is_nosave(unsigned long); @@ -134,6 +141,7 @@ extern int swsusp_swap_in_use(void); * the image header. */ #define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 /* kernel/power/hibernate.c */ extern int swsusp_check(void); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index e8b3370..d523593 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct tty_struct *tty) +static void handle_poweroff(int key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); diff --git a/kernel/power/process.c b/kernel/power/process.c index 71ae290..e50b4c1 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -15,6 +15,7 @@ #include <linux/syscalls.h> #include <linux/freezer.h> #include <linux/delay.h> +#include <linux/workqueue.h> /* * Timeout for stopping processes @@ -35,13 +36,19 @@ static int try_to_freeze_tasks(bool sig_only) struct task_struct *g, *p; unsigned long end_time; unsigned int todo; + bool wq_busy = false; struct timeval start, end; u64 elapsed_csecs64; unsigned int elapsed_csecs; + bool wakeup = false; do_gettimeofday(&start); end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + while (true) { todo = 0; read_lock(&tasklist_lock); @@ -63,9 +70,20 @@ static int try_to_freeze_tasks(bool sig_only) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + if (!todo || time_after(jiffies, end_time)) break; + if (!pm_check_wakeup_events()) { + wakeup = true; + break; + } + /* * We need to retry, but first give the freezing tasks some * time to enter the regrigerator. @@ -85,13 +103,18 @@ static int try_to_freeze_tasks(bool sig_only) * but it cleans up leftover PF_FREEZE requests. */ printk("\n"); - printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " - "(%d tasks refusing to freeze):\n", - elapsed_csecs / 100, elapsed_csecs % 100, todo); + printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", + wakeup ? "aborted" : "failed", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + + thaw_workqueues(); + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); - if (freezing(p) && !freezer_should_skip(p)) + if (!wakeup && freezing(p) && !freezer_should_skip(p)) sched_show_task(p); cancel_freezing(p); task_unlock(p); @@ -157,6 +180,7 @@ void thaw_processes(void) oom_killer_enable(); printk("Restarting tasks ... "); + thaw_workqueues(); thaw_tasks(true); thaw_tasks(false); schedule(); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 25ce010..ac7eb10 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -3,7 +3,7 @@ * * This file provides system snapshot/restore functionality for swsusp. * - * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *); * size will not exceed N bytes, but if that is impossible, it will * try to create the smallest image possible. */ -unsigned long image_size = 500 * 1024 * 1024; +unsigned long image_size; + +void __init hibernate_image_size_init(void) +{ + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; +} /* List of PBEs needed for restoring the pages that were allocated before * the suspend and included in the suspend image, but have also been @@ -1121,9 +1126,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) return nr_alloc; } -static unsigned long preallocate_image_memory(unsigned long nr_pages) +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) { - return preallocate_image_pages(nr_pages, GFP_IMAGE); + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); } #ifdef CONFIG_HIGHMEM @@ -1169,15 +1184,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, */ static void free_unnecessary_pages(void) { - unsigned long save_highmem, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem; - to_free_normal = alloc_normal - count_data_pages(); - save_highmem = count_highmem_pages(); - if (alloc_highmem > save_highmem) { - to_free_highmem = alloc_highmem - save_highmem; + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; } else { to_free_highmem = 0; - to_free_normal -= save_highmem - alloc_highmem; + to_free_normal -= save - alloc_highmem; } memory_bm_position_reset(©_bm); @@ -1258,7 +1280,7 @@ int hibernate_preallocate_memory(void) { struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; - unsigned long alloc, save_highmem, pages_highmem; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; struct timeval start, stop; int error; @@ -1295,26 +1317,38 @@ int hibernate_preallocate_memory(void) else count += zone_page_state(zone, NR_FREE_PAGES); } + avail_normal = count; count += highmem; count -= totalreserve_pages; /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES; + /* Compute the desired number of image pages specified by image_size. */ size = DIV_ROUND_UP(image_size, PAGE_SIZE); if (size > max_size) size = max_size; /* - * If the maximum is not less than the current number of saveable pages - * in memory, allocate page frames for the image and we're done. + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. */ if (size >= saveable) { pages = preallocate_image_highmem(save_highmem); - pages += preallocate_image_memory(saveable - pages); + pages += preallocate_image_memory(saveable - pages, avail_normal); goto out; } /* Estimate the minimum size of the image. */ pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; if (size < pages) size = min_t(unsigned long, pages, max_size); @@ -1335,16 +1369,34 @@ int hibernate_preallocate_memory(void) */ pages_highmem = preallocate_image_highmem(highmem / 2); alloc = (count - max_size) - pages_highmem; - pages = preallocate_image_memory(alloc); - if (pages < alloc) - goto err_out; - size = max_size - size; - alloc = size; - size = preallocate_highmem_fraction(size, highmem, count); - pages_highmem += size; - alloc -= size; - pages += preallocate_image_memory(alloc); - pages += pages_highmem; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } /* * We only need as many page frames for the image as there are saveable diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f37cb7d..7335952 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -136,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -163,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -178,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b0bb217..916eaa7 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -4,7 +4,7 @@ * This file provides functions for reading the suspend image from * and writing it to a swap partition. * - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> * * This file is released under the GPLv2. @@ -24,15 +24,17 @@ #include <linux/swapops.h> #include <linux/pm.h> #include <linux/slab.h> +#include <linux/lzo.h> +#include <linux/vmalloc.h> #include "power.h" -#define SWSUSP_SIG "S1SUSPEND" +#define HIBERNATE_SIG "LINHIB0001" /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * @@ -148,7 +150,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ @@ -193,7 +195,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); - memcpy(swsusp_header->sig,SWSUSP_SIG, 10); + memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; error = hib_bio_write_page(swsusp_resume_block, @@ -357,6 +359,18 @@ static int swap_writer_finish(struct swap_map_handle *handle, return error; } +/* We need to remember how much compressed data we need to read. */ +#define LZO_HEADER sizeof(size_t) + +/* Number of pages/bytes we'll compress at one time. */ +#define LZO_UNC_PAGES 32 +#define LZO_UNC_SIZE (LZO_UNC_PAGES * PAGE_SIZE) + +/* Number of pages/bytes we need for compressed data (worst case). */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + /** * save_image - save the suspend image data */ @@ -404,6 +418,137 @@ static int save_image(struct swap_map_handle *handle, return ret; } + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap mam handle to use for saving the image. + * @snapshot: Image to read data from. + * @nr_to_write: Number of pages to save. + */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *wrk, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + wrk = vmalloc(LZO1X_1_MEM_COMPRESS); + if (!wrk) { + printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Compressing and saving image data (%u pages) ... ", + nr_to_write); + m = nr_to_write / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for (;;) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + + if (!off) + break; + + unc_len = off; + ret = lzo1x_1_compress(unc, unc_len, + cmp + LZO_HEADER, &cmp_len, wrk); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + break; + } + + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(unc_len))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + ret = -1; + break; + } + + *(size_t *)cmp = cmp_len; + + /* + * Given we are writing one page at a time to disk, we copy + * that much from the buffer, although the last bit will likely + * be smaller than full page. This is OK - we saved the length + * of the compressed data, so any garbage at the end will be + * discarded when we read it. + */ + for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + memcpy(page, cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } + } + +out_finish: + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + + vfree(cmp); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + + return ret; +} + /** * enough_swap - Make sure we have enough swap to save the image. * @@ -411,12 +556,16 @@ static int save_image(struct swap_map_handle *handle, * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages) +static int enough_swap(unsigned int nr_pages, unsigned int flags) { unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; pr_debug("PM: Free swap pages: %u\n", free_swap); - return free_swap > nr_pages + PAGES_FOR_IO; + + required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? + nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + return free_swap > required; } /** @@ -443,7 +592,7 @@ int swsusp_write(unsigned int flags) printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } - if (!enough_swap(pages)) { + if (!enough_swap(pages, flags)) { printk(KERN_ERR "PM: Not enough free swap\n"); error = -ENOSPC; goto out_finish; @@ -458,8 +607,11 @@ int swsusp_write(unsigned int flags) } header = (struct swsusp_info *)data_of(snapshot); error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, pages - 1); + if (!error) { + error = (flags & SF_NOCOMPRESS_MODE) ? + save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); + } out_finish: error = swap_writer_finish(&handle, flags, error); return error; @@ -590,6 +742,127 @@ static int load_image(struct swap_map_handle *handle, } /** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. + */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int error = 0; + struct timeval start; + struct timeval stop; + unsigned nr_pages; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Loading and decompressing image data (%u pages) ... ", + nr_to_read); + m = nr_to_read / 100; + if (!m) + m = 1; + nr_pages = 0; + do_gettimeofday(&start); + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + + for (;;) { + error = swap_read_page(handle, page, NULL); /* sync */ + if (error) + break; + + cmp_len = *(size_t *)page; + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + error = -1; + break; + } + + memcpy(cmp, page, PAGE_SIZE); + for (off = PAGE_SIZE; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + error = swap_read_page(handle, page, NULL); /* sync */ + if (error) + goto out_finish; + + memcpy(cmp + off, page, PAGE_SIZE); + } + + unc_len = LZO_UNC_SIZE; + error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, + unc, &unc_len); + if (error < 0) { + printk(KERN_ERR "PM: LZO decompression failed\n"); + break; + } + + if (unlikely(!unc_len || + unc_len > LZO_UNC_SIZE || + unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); + error = -1; + break; + } + + for (off = 0; off < unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + } + } + +out_finish: + do_gettimeofday(&stop); + if (!error) { + printk("\b\b\b\bdone\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } else + printk("\n"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + + vfree(cmp); + vfree(unc); + free_page((unsigned long)page); + + return error; +} + +/** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should * be written into this memeory location @@ -612,8 +885,11 @@ int swsusp_read(unsigned int *flags_p) goto end; if (!error) error = swap_read_page(&handle, header, NULL); - if (!error) - error = load_image(&handle, &snapshot, header->pages - 1); + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } swap_reader_finish(&handle); end: if (!error) @@ -640,7 +916,7 @@ int swsusp_check(void) if (error) goto put; - if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ error = hib_bio_write_page(swsusp_resume_block, @@ -653,13 +929,13 @@ put: if (error) blkdev_put(hib_resume_bdev, FMODE_READ); else - pr_debug("PM: Signature found, resuming\n"); + pr_debug("PM: Image signature found, resuming\n"); } else { error = PTR_ERR(hib_resume_bdev); } if (error) - pr_debug("PM: Error %d checking image file\n", error); + pr_debug("PM: Image not found (code %d)\n", error); return error; } diff --git a/kernel/printk.c b/kernel/printk.c index 444b770..2531017 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -37,6 +37,8 @@ #include <linux/ratelimit.h> #include <linux/kmsg_dump.h> #include <linux/syslog.h> +#include <linux/cpu.h> +#include <linux/notifier.h> #include <asm/uaccess.h> @@ -83,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -554,7 +556,7 @@ static void zap_locks(void) /* If a crash is occurring, make sure we can't deadlock */ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) @@ -985,6 +987,32 @@ void resume_console(void) } /** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused + * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. + */ +static int __cpuinit console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DYING: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + acquire_console_sem(); + release_console_sem(); + } + return NOTIFY_OK; +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -1371,7 +1399,7 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); -static int __init disable_boot_consoles(void) +static int __init printk_late_init(void) { struct console *con; @@ -1382,9 +1410,10 @@ static int __init disable_boot_consoles(void) unregister_console(con); } } + hotcpu_notifier(console_cpu_notify, 0); return 0; } -late_initcall(disable_boot_consoles); +late_initcall(printk_late_init); #if defined CONFIG_PRINTK @@ -1520,9 +1549,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) chars = logged_chars; spin_unlock_irqrestore(&logbuf_lock, flags); - if (logged_chars > end) { - s1 = log_buf + log_buf_len - logged_chars + end; - l1 = logged_chars - end; + if (chars > end) { + s1 = log_buf + log_buf_len - chars + end; + l1 = chars - end; s2 = log_buf; l2 = end; @@ -1530,8 +1559,8 @@ void kmsg_dump(enum kmsg_dump_reason reason) s1 = ""; l1 = 0; - s2 = log_buf + end - logged_chars; - l2 = logged_chars; + s2 = log_buf + end - chars; + l2 = chars; } if (!spin_trylock_irqsave(&dump_list_lock, flags)) { diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 74a3d69..f34d798 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -324,26 +324,32 @@ int ptrace_detach(struct task_struct *child, unsigned int data) } /* - * Detach all tasks we were using ptrace on. + * Detach all tasks we were using ptrace on. Called with tasklist held + * for writing, and returns with it held too. But note it can release + * and reacquire the lock. */ void exit_ptrace(struct task_struct *tracer) { struct task_struct *p, *n; LIST_HEAD(ptrace_dead); - write_lock_irq(&tasklist_lock); + if (likely(list_empty(&tracer->ptraced))) + return; + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { if (__ptrace_detach(tracer, p)) list_add(&p->ptrace_entry, &ptrace_dead); } - write_unlock_irq(&tasklist_lock); + write_unlock_irq(&tasklist_lock); BUG_ON(!list_empty(&tracer->ptraced)); list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { list_del_init(&p->ptrace_entry); release_task(p); } + + write_lock_irq(&tasklist_lock); } int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) diff --git a/kernel/range.c b/kernel/range.c index 74e2e61..471b66a 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -7,10 +7,6 @@ #include <linux/range.h> -#ifndef ARRAY_SIZE -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) -#endif - int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) { if (start >= end) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 72a8dc9..a23a57a 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the * CONFIG_PROVE_RCU and not cases. Note that if someone uses * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. */ @@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - return in_softirq(); + return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); @@ -114,3 +116,163 @@ int rcu_my_thread_group_empty(void) } EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); #endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + */ +#ifndef CONFIG_PREEMPT + WARN_ON(1); + return 0; +#else + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; +#endif + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 38729d3..d806735 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Forward declarations for rcutiny_plugin.h. */ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); } /* @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -169,6 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; } @@ -181,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -211,6 +223,7 @@ static void __call_rcu(struct rcu_head *head, { unsigned long flags; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; @@ -221,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, } /* - * Post an RCU callback to be invoked after the end of an RCU grace + * Post an RCU callback to be invoked after the end of an RCU-sched grace * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { __call_rcu(head, func, &rcu_sched_ctrlblk); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Post an RCU bottom-half callback to be invoked after any subsequent @@ -241,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - void rcu_barrier_bh(void) { struct rcu_synchronize rcu; @@ -287,5 +286,3 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } - -#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc..6ceca4f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -1,7 +1,7 @@ /* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,11 +17,587 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright IBM Corporation, 2009 + * Copyright (c) 2010 Linaro * * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include <linux/delay.h> + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is not such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. + */ +static int rcu_cpu_blocking_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* + * If there is no GP, or if blocked readers are still blocking GP, + * then there is nothing more to do. + */ + if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. */ + rcu_preempt_ctrlblk.gpnum++; + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. + * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_blocking_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + list_del(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; + INIT_LIST_HEAD(&t->rcu_node_entry); + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. + */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + raise_softirq(RCU_SOFTIRQ); + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from __rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible -RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier(). */ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely in the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section. Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave. + */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + local_irq_restore(flags); + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (rcu_preempted_readers_exp()) + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? + */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible -RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8..9d8e8fb 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,7 +120,7 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current; +static struct rcu_torture __rcu *rcu_torture_current; static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ static int fullstop = FULLSTOP_RMMOD; -DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ - /* of kthreads. */ +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); /* * Detect and respond to a system shutdown. @@ -239,8 +241,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; @@ -304,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) mdelay(longdelay_ms); if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -537,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); } static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) @@ -732,7 +739,8 @@ rcu_torture_writer(void *arg) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d443734..ccdc04c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -143,6 +143,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +module_param(rcu_cpu_stall_suppress, int, 0644); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_panicking __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) { @@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* OK, time to rat on our buddy... */ - + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { @@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); @@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; - if (rcu_cpu_stall_panicking) + if (rcu_cpu_stall_suppress) return; - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(rsp); @@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { - rcu_cpu_stall_panicking = 1; + rcu_cpu_stall_suppress = 1; return NOTIFY_DONE; } +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + static struct notifier_block rcu_panic_block = { .notifier_call = rcu_panic, }; @@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +void rcu_cpu_stall_reset(void) +{ +} + static void __init check_cpu_stall_init(void) { } @@ -712,7 +745,7 @@ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { @@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) { int i; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ @@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } @@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) struct rcu_data *rdp; raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); if (rsp->orphan_cbs_list == NULL) { raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; @@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; rdp->qlen += rsp->orphan_qlen; + rdp->n_cbs_adopted += rsp->orphan_qlen; rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; @@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) unsigned long flags; unsigned long mask; int need_report = 0; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -1112,6 +1147,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) while (list) { next = list->next; prefetch(next); + debug_rcu_head_unqueue(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1122,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; + rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1225,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } if (mask != 0) { @@ -1388,6 +1426,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + debug_rcu_head_queue(head); head->func = func; head->next = NULL; @@ -1400,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); rcu_process_gp_end(rsp, rdp); check_for_new_grace_period(rsp, rdp); @@ -1699,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1727,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1863,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) { static char *buf[] = { "rcu_node_level_0", "rcu_node_level_1", @@ -1916,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } + rsp->rda = rda; rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - rsp->rda[i]->mynode = rnp; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } } -/* - * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used - * nowhere else! Assigns leaf node pointers into each CPU's rcu_data - * structure. - */ -#define RCU_INIT_FLAVOR(rsp, rcu_data) \ -do { \ - int i; \ - \ - for_each_possible_cpu(i) { \ - (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - } \ - rcu_init_one(rsp); \ -} while (0) - void __init rcu_init(void) { int cpu; rcu_bootup_announce(); - RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); - RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b..91d4170 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,9 @@ struct rcu_data { long qlen; /* # of queued callbacks */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? */ long blimit; /* Upper limit on a processed batch */ @@ -254,19 +257,23 @@ struct rcu_data { #define RCU_STALL_DELAY_DELTA 0 #endif -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ /* before ratting on them. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE +#define RCU_CPU_STALL_SUPPRESS_INIT 0 +#else +#define RCU_CPU_STALL_SUPPRESS_INIT 1 +#endif -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -283,7 +290,7 @@ struct rcu_state { struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420..71a4147 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU-based detection of stalled CPUs is disabled.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_VERBOSE +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif #if NUM_RCU_LVL_4 != 0 @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = rcu_preempt_state.rda[cpu]; + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) */ void __rcu_read_lock(void) { - ACCESS_ONCE(current->rcu_read_lock_nesting)++; + current->rcu_read_lock_nesting++; barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); #ifdef CONFIG_PROVE_LOCKING @@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) } } +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. */ void synchronize_rcu(void) { @@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) */ static void __init __rcu_init_preempt(void) { - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } /* @@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { } +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) } /* - * In classic RCU, call_rcu() is just call_rcu_sched(). - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptable RCU does not exist, map to rcu-sched. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b4..d15430b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } #define PRINT_RCU_DATA(name, func, m) \ @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata_csv(struct seq_file *m, void *unused) @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); @@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) struct rcu_data *rdp; for_each_possible_cpu(cpu) { - rdp = rsp->rda[cpu]; + rdp = per_cpu_ptr(rsp->rda, cpu); if (rdp->beenonline) print_one_rcu_pending(m, rdp); } diff --git a/kernel/sched.c b/kernel/sched.c index cb816e3..d42992b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include <asm/irq_regs.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -425,9 +426,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -436,7 +435,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. @@ -456,9 +455,10 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -486,11 +486,12 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -518,6 +519,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ +static u64 irq_time_cpu(int cpu); +static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); + inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) { + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; + + sched_irq_time_avg_update(rq, irq_time); + } } /* @@ -721,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -732,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -739,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else @@ -1193,6 +1209,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do similar optimization for completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be uptodate wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU. In case of a completely @@ -1232,16 +1269,6 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -int nohz_ratelimit(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 diff = rq->clock - rq->nohz_stamp; - - rq->nohz_stamp = rq->clock; - - return diff < (NSEC_PER_SEC / HZ) >> 1; -} - #endif /* CONFIG_NO_HZ */ static u64 sched_avg_period(void) @@ -1281,6 +1308,10 @@ static void resched_task(struct task_struct *p) static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } + +static void sched_avg_update(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -1652,7 +1683,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { @@ -1805,6 +1836,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -1822,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1840,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ @@ -1899,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +{ + if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { + u64 delta_irq = curr_irq_time - rq->prev_irq_time; + rq->prev_irq_time = curr_irq_time; + sched_rt_avg_update(rq, delta_irq); + } +} + +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } + +#endif + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -1985,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2267,11 +2415,55 @@ static void update_avg(u64 *avg, u64 sample) } #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2279,7 +2471,8 @@ static void update_avg(u64 *avg, u64 sample) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. */ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2359,38 +2552,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.statistics.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, en_flags); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - out_running: - trace_sched_wakeup(p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2399,6 +2565,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not alredy there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -2785,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (likely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (likely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -2873,9 +3070,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } @@ -3012,23 +3209,102 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation. + */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. */ static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { unsigned long old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -3036,10 +3312,18 @@ static void update_cpu_load(struct rq *this_rq) * example. */ if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } + sched_avg_update(this_rq); +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); + calc_load_account_active(this_rq); } @@ -3094,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock - p->se.exec_start; + ns = rq->clock_task - p->se.exec_start; if ((s64)ns < 0) ns = 0; } @@ -3243,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (in_serving_softirq()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); @@ -3359,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * utime); + temp *= utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -3392,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * cputime.utime); + temp *= cputime.utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -3426,11 +3710,11 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr); + perf_event_task_tick(); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -3569,17 +3853,13 @@ pick_next_task(struct rq *rq) return p; } - class = sched_class_highest; - for ( ; ; ) { + for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; } + + BUG(); /* the idle class will always have a runnable task */ } /* @@ -3598,7 +3878,6 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3611,11 +3890,26 @@ need_resched_nonpreemptible: raw_spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -3637,8 +3931,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3647,11 +3943,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -3704,8 +3997,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) /* * Owner changed, break to re-assess state. */ - if (lock->owner != owner) + if (lock->owner != owner) { + /* + * If the lock has switched to a different owner, + * we likely have heavy contention. Return 0 to quit + * optimistic spinning and not contend further: + */ + if (lock->owner) + return 0; break; + } /* * Is that owner really running on that cpu? @@ -3726,7 +4027,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void __sched preempt_schedule(void) +asmlinkage void __sched notrace preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -3738,9 +4039,9 @@ asmlinkage void __sched preempt_schedule(void) return; do { - add_preempt_count(PREEMPT_ACTIVE); + add_preempt_count_notrace(PREEMPT_ACTIVE); schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + sub_preempt_count_notrace(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -4183,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; @@ -4441,12 +4743,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -4474,7 +4772,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4490,6 +4788,15 @@ recheck: */ rq = __task_rq_lock(p); + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EINVAL; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* @@ -4716,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again: retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { @@ -5166,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. + * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -5816,20 +6135,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -6064,23 +6412,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if (cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6096,7 +6439,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -6109,7 +6452,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -6319,6 +6662,7 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t this_sibling_map; cpumask_var_t this_core_map; + cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6330,6 +6674,7 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, + sa_this_book_map, sa_this_core_map, sa_this_sibling_map, sa_nodemask, @@ -6365,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct static_sched_domain, core_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int cpu_to_core_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; - +#ifdef CONFIG_SCHED_SMT cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); +#else + group = cpu; +#endif if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } -#elif defined(CONFIG_SCHED_MC) +#endif /* CONFIG_SCHED_MC */ + +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); + static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { + int group = cpu; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); +#endif if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; + *sg = &per_cpu(sched_group_book, group).sg; + return group; } -#endif +#endif /* CONFIG_SCHED_BOOK */ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -6399,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK + cpumask_and(mask, cpu_book_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC) cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6660,6 +7025,9 @@ SD_INIT_FUNC(CPU) #ifdef CONFIG_SCHED_MC SD_INIT_FUNC(MC) #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif static int default_relax_domain_level = -1; @@ -6709,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_book_map: + free_cpumask_var(d->this_book_map); /* fall through */ case sa_this_core_map: free_cpumask_var(d->this_core_map); /* fall through */ case sa_this_sibling_map: @@ -6755,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) return sa_this_core_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_book_map; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -6814,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_book_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK + sd = &per_cpu(book_domains, i).sd; + SD_INIT(sd, BOOK); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + static struct sched_domain *__build_mc_sched_domain(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *parent, int i) @@ -6871,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif +#ifdef CONFIG_SCHED_BOOK + case SD_LV_BOOK: /* set up book groups */ + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); + if (cpu == cpumask_first(d->this_book_map)) + init_sched_build_groups(d->this_book_map, cpu_map, + &cpu_to_book_group, + d->send_covered, d->tmpmask); + break; +#endif case SD_LV_CPU: /* set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); if (!cpumask_empty(d->nodemask)) @@ -6918,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); } @@ -6954,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, init_sched_groups_power(i, sd); } #endif +#ifdef CONFIG_SCHED_BOOK + for_each_cpu(i, cpu_map) { + sd = &per_cpu(book_domains, i).sd; + init_sched_groups_power(i, sd); + } +#endif for_each_cpu(i, cpu_map) { sd = &per_cpu(phys_domains, i).sd; @@ -6979,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) + sd = &per_cpu(book_domains, i).sd; #else sd = &per_cpu(phys_domains, i).sd; #endif @@ -7288,29 +7696,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7356,10 +7770,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -7604,6 +8016,9 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7617,6 +8032,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7661,8 +8080,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) @@ -7869,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(cfs_rq); - err: +err: return 0; } @@ -7959,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(rt_rq); - err: +err: return 0; } @@ -8319,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg, raw_spin_unlock(&rt_rq->rt_runtime_lock); } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f7..52f1a14 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar <mingo@redhat.com> * Guillaume Chazarain <guichaz@gmail.com> * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotomic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keeping it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestampt taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb..2722dc1 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp) { - gfp_t gfp = GFP_KERNEL; int i; - if (bootmem) - gfp = GFP_NOWAIT; - memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) raw_spin_lock_init(&vec->lock); vec->count = 0; - if (!zalloc_cpumask_var(&vec->mask, gfp)) + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6..9fc7d38 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -27,7 +27,7 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3556539..2e1b0d1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); + P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b53..933f3d1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,7 +25,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 2000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 3; +static unsigned int sched_nr_latency = 8; /* * After fork, child runs first. If set to 0 (default) then @@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + u64 now = rq_of(cfs_rq)->clock_task; unsigned long delta_exec; if (unlikely(!curr)) @@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock; + se->exec_start = rq_of(cfs_rq)->clock_task; } /************************************************** @@ -1313,7 +1313,7 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - this = group; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; @@ -1765,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); + + /* re-arm NEWIDLE balancing when moving tasks */ + src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; + this_rq->idle_stamp = 0; } /* @@ -1799,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock, sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS @@ -2031,12 +2034,14 @@ struct sd_lb_stats { unsigned long this_load; unsigned long this_load_per_task; unsigned long this_nr_running; + unsigned long this_has_capacity; /* Statistics of the busiest group */ unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2059,6 +2064,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? */ }; /** @@ -2268,10 +2274,14 @@ unsigned long scale_rt_power(int cpu) struct rq *rq = cpu_rq(cpu); u64 total, available; - sched_avg_update(rq); - total = sched_avg_period() + (rq->clock - rq->age_stamp); - available = total - rq->rt_avg; + + if (unlikely(total < rq->rt_avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - rq->rt_avg; + } if (unlikely((s64)total < SCHED_LOAD_SCALE)) total = SCHED_LOAD_SCALE; @@ -2287,13 +2297,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2303,6 +2306,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2335,6 +2347,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 > group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2354,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load; + unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; @@ -2365,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; + max_nr_running = 0; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); @@ -2382,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) + if (load > max_cpu_load) { max_cpu_load = load; + max_nr_running = rq->nr_running; + } if (min_cpu_load > load) min_cpu_load = load; } @@ -2400,14 +2440,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu) { - *balance = 0; - return; + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } - update_group_power(sd, this_cpu); - /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; @@ -2423,11 +2463,58 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = - DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; } /** @@ -2435,7 +2522,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain. @@ -2446,7 +2533,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; + struct sched_group *sg = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; @@ -2459,45 +2546,100 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, do { int local_group; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; + sds->total_pwr += sg->cpu_power; /* * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try - * and move all the excess tasks away. + * first, lower the sg capacity to one so that we'll try + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling) + if (prefer_sibling && !local_group && sds->this_has_capacity) sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; - sds->this = group; + sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { + sds->this_has_capacity = sgs.group_has_capacity; + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; - sds->busiest = group; + sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched doman. + * + * This is primarily intended to used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to lower a SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; } /** @@ -2637,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } + /******* find_busiest_group() helpers end here *********************/ /** @@ -2688,13 +2831,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. + * + * Note: when doing newidle balance, if the local group has excess + * capacity (i.e. nr_running < group_capacity) and the busiest group + * does not have any capacity, we force a load balance to pull tasks + * to the local group. In this case, we skip past checks 3, 4 and 5. */ if (!(*balance)) goto ret; + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + if (sds.this_load >= sds.max_load) goto out_balanced; @@ -2706,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; +force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -2726,8 +2884,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2738,6 +2897,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; + if (!capacity) + capacity = fix_small_capacity(sd, group); + if (!cpumask_test_cpu(i, cpus)) continue; @@ -2777,9 +2939,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + /* * The only task running in a non-idle cpu can be moved to this * cpu in an attempt to completely freeup the other CPU @@ -2854,7 +3026,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, cpus); + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2896,9 +3068,17 @@ redo: if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle)) { + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3017,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } raw_spin_lock(&this_rq->lock); @@ -3093,13 +3271,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3153,17 +3358,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3196,7 +3401,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3204,7 +3409,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3212,98 +3417,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif /* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. If all the cpus in the system - * go into this tickless mode, then there will be no ilb owner (as there is - * no need for one) and all the cpus will sleep till the next wakeup event - * arrives... - * - * For the ilb owner, tick is not stopped. And this tick will be used - * for idle load balancing. ilb owner will still be part of - * nohz.cpu_mask.. + * load balancing on behalf of all those cpus. * - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3385,11 +3608,102 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (rq->idle_at_tick) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3399,37 +3713,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up. - */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3439,57 +3728,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) @@ -3542,8 +3791,13 @@ static void task_fork_fair(struct task_struct *p) raw_spin_lock_irqsave(&rq->lock, flags); - if (unlikely(task_cpu(p) != this_cpu)) + update_rq_clock(rq); + + if (unlikely(task_cpu(p) != this_cpu)) { + rcu_read_lock(); __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + } update_curr(cfs_rq); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8..185f920 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) * release the lock. Decreases scheduling overhead. */ SCHED_FEAT(OWNER_SPIN, 1) + +/* + * Decrement CPU power based on irq activity + */ +SCHED_FEAT(NONIRQ_POWER, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953..bea7d79 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq) if (!task_has_rt_policy(curr)) return; - delta_exec = rq->clock - curr->se.exec_start; + delta_exec = rq->clock_task - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock; + curr->se.exec_start = rq->clock_task; cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * runqueue. Otherwise simply start this RT task * on its current runqueue. * - * We want to avoid overloading runqueues. Even if - * the RT task is of higher priority than the current RT task. - * RT tasks behave differently than other tasks. If - * one gets preempted, we try to push it off to another queue. - * So trying to keep a preempting RT task on the same - * cache hot CPU will force the running RT task to - * a cold CPU. So we waste all the cache for the lower - * RT task in hopes of saving some of a RT task - * that is just being woken and probably will have - * cold cache anyway. + * We want to avoid overloading runqueues. If the woken + * task is a higher priority, then it will stay on this CPU + * and the lower prio task should be moved to another CPU. + * Even though this will probably make the lower prio task + * lose its cache, we do not want to bounce a higher task + * around just because it gave up its CPU, perhaps for a + * lock? + * + * For equal prio tasks, we just let the scheduler sort it out. */ if (unlikely(rt_task(rq->curr)) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); @@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; return p; } @@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) for_each_leaf_rt_rq(rt_rq, rq) { array = &rt_rq->active; idx = sched_find_first_bit(array->bitmap); - next_idx: +next_idx: if (idx >= MAX_RT_PRIO) continue; if (next && next->prio < idx) @@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; - retry: +retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); return 0; @@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq) * but possible) */ } - skip: +skip: double_unlock_balance(this_rq, src_rq); } @@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1) + p->rt.nr_cpus_allowed > 1 && + rt_task(rq->curr) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio)) push_rt_tasks(rq); } @@ -1663,9 +1667,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; - if (!p->signal) - return; - /* max may change after cur was read, this will be fixed next tick */ soft = task_rlimit(p, RLIMIT_RTTIME); hard = task_rlimit_max(p, RLIMIT_RTTIME); @@ -1712,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4..25c2f96 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer; - struct signal_struct *sig; - - sig = tsk->signal; - /* see __exit_signal()->task_rq_unlock_wait() */ - barrier(); - if (unlikely(!sig)) - return; - - cputimer = &sig->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 0000000..45bddc0 --- /dev/null +++ b/kernel/sched_stoptask.c @@ -0,0 +1,108 @@ +/* + * stop-task scheduling class. + * + * The stop task is the highest priority task in the system, it preempts + * everything and will be preempted by nothing. + * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct rq *rq, struct task_struct *p, + int sd_flag, int flags) +{ + return task_cpu(p); /* stop tasks as never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ + resched_task(rq->curr); /* we preempt everything */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + if (stop && stop->state == TASK_RUNNING) + return stop; + + return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void yield_task_stop(struct rq *rq) +{ + BUG(); /* the stop task should never yield, its pointless. */ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p, + int running) +{ + BUG(); /* its impossible to change to this class */ +} + +static void prio_changed_stop(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + BUG(); /* how!?, what priority? */ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +static const struct sched_class stop_sched_class = { + .next = &rt_sched_class, + + .enqueue_task = enqueue_task_stop, + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + + .check_preempt_curr = check_preempt_curr_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_stop, +#endif + + .set_curr_task = set_curr_task_stop, + .task_tick = task_tick_stop, + + .get_rr_interval = get_rr_interval_stop, + + .prio_changed = prio_changed_stop, + .switched_to = switched_to_stop, + + /* no .task_new for stop tasks */ +}; diff --git a/kernel/signal.c b/kernel/signal.c index 906ae5a..919562c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) @@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); @@ -2212,6 +2215,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); #endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitely for the right codes here. + */ + if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c deleted file mode 100644 index e45c436..0000000 --- a/kernel/slow-work-debugfs.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Slow work debugging - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/fs.h> -#include <linux/time.h> -#include <linux/seq_file.h> -#include "slow-work.h" - -#define ITERATOR_SHIFT (BITS_PER_LONG - 4) -#define ITERATOR_SELECTOR (0xfUL << ITERATOR_SHIFT) -#define ITERATOR_COUNTER (~ITERATOR_SELECTOR) - -void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m) -{ - seq_puts(m, "Slow-work: New thread"); -} - -/* - * Render the time mark field on a work item into a 5-char time with units plus - * a space - */ -static void slow_work_print_mark(struct seq_file *m, struct slow_work *work) -{ - struct timespec now, diff; - - now = CURRENT_TIME; - diff = timespec_sub(now, work->mark); - - if (diff.tv_sec < 0) - seq_puts(m, " -ve "); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000) - seq_printf(m, "%3luns ", diff.tv_nsec); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000) - seq_printf(m, "%3luus ", diff.tv_nsec / 1000); - else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000) - seq_printf(m, "%3lums ", diff.tv_nsec / 1000000); - else if (diff.tv_sec <= 1) - seq_puts(m, " 1s "); - else if (diff.tv_sec < 60) - seq_printf(m, "%4lus ", diff.tv_sec); - else if (diff.tv_sec < 60 * 60) - seq_printf(m, "%4lum ", diff.tv_sec / 60); - else if (diff.tv_sec < 60 * 60 * 24) - seq_printf(m, "%4luh ", diff.tv_sec / 3600); - else - seq_puts(m, "exces "); -} - -/* - * Describe a slow work item for debugfs - */ -static int slow_work_runqueue_show(struct seq_file *m, void *v) -{ - struct slow_work *work; - struct list_head *p = v; - unsigned long id; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "THR PID ITEM ADDR FL MARK DESC\n"); - return 0; - case 2: - seq_puts(m, "=== ===== ================ == ===== ==========\n"); - return 0; - - case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1: - id = (unsigned long) v - 3; - - read_lock(&slow_work_execs_lock); - work = slow_work_execs[id]; - if (work) { - smp_read_barrier_depends(); - - seq_printf(m, "%3lu %5d %16p %2lx ", - id, slow_work_pids[id], work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - } - read_unlock(&slow_work_execs_lock); - return 0; - - default: - work = list_entry(p, struct slow_work, link); - seq_printf(m, "%3s - %16p %2lx ", - work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq", - work, work->flags); - slow_work_print_mark(m, work); - - if (work->ops->desc) - work->ops->desc(work, m); - seq_putc(m, '\n'); - return 0; - } -} - -/* - * map the iterator to a work item - */ -static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos) -{ - struct list_head *p; - unsigned long count, id; - - switch (*_pos >> ITERATOR_SHIFT) { - case 0x0: - if (*_pos == 0) - *_pos = 1; - if (*_pos < 3) - return (void *)(unsigned long) *_pos; - if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT) - for (id = *_pos - 3; - id < SLOW_WORK_THREAD_LIMIT; - id++, (*_pos)++) - if (slow_work_execs[id]) - return (void *)(unsigned long) *_pos; - *_pos = 0x1UL << ITERATOR_SHIFT; - - case 0x1: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &slow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - - case 0x2: - count = *_pos & ITERATOR_COUNTER; - list_for_each(p, &vslow_work_queue) { - if (count == 0) - return p; - count--; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos) -{ - spin_lock_irq(&slow_work_queue_lock); - return slow_work_runqueue_index(m, _pos); -} - -/* - * move to the next line - */ -static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos) -{ - struct list_head *p = v; - unsigned long selector = *_pos >> ITERATOR_SHIFT; - - (*_pos)++; - switch (selector) { - case 0x0: - return slow_work_runqueue_index(m, _pos); - - case 0x1: - if (*_pos >> ITERATOR_SHIFT == 0x1) { - p = p->next; - if (p != &slow_work_queue) - return p; - } - *_pos = 0x2UL << ITERATOR_SHIFT; - p = &vslow_work_queue; - - case 0x2: - if (*_pos >> ITERATOR_SHIFT == 0x2) { - p = p->next; - if (p != &vslow_work_queue) - return p; - } - *_pos = 0x3UL << ITERATOR_SHIFT; - - default: - return NULL; - } -} - -/* - * clean up after reading - */ -static void slow_work_runqueue_stop(struct seq_file *m, void *v) -{ - spin_unlock_irq(&slow_work_queue_lock); -} - -static const struct seq_operations slow_work_runqueue_ops = { - .start = slow_work_runqueue_start, - .stop = slow_work_runqueue_stop, - .next = slow_work_runqueue_next, - .show = slow_work_runqueue_show, -}; - -/* - * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents - */ -static int slow_work_runqueue_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &slow_work_runqueue_ops); -} - -const struct file_operations slow_work_runqueue_fops = { - .owner = THIS_MODULE, - .open = slow_work_runqueue_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; diff --git a/kernel/slow-work.c b/kernel/slow-work.c deleted file mode 100644 index 7d3f4fa..0000000 --- a/kernel/slow-work.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* Worker thread pool for slow items, such as filesystem lookups or mkdirs - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - * - * See Documentation/slow-work.txt - */ - -#include <linux/module.h> -#include <linux/slow-work.h> -#include <linux/kthread.h> -#include <linux/freezer.h> -#include <linux/wait.h> -#include <linux/debugfs.h> -#include "slow-work.h" - -static void slow_work_cull_timeout(unsigned long); -static void slow_work_oom_timeout(unsigned long); - -#ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -static int slow_work_max_threads_sysctl(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -#endif - -/* - * The pool of threads has at least min threads in it as long as someone is - * using the facility, and may have as many as max. - * - * A portion of the pool may be processing very slow operations. - */ -static unsigned slow_work_min_threads = 2; -static unsigned slow_work_max_threads = 4; -static unsigned vslow_work_proportion = 50; /* % of threads that may process - * very slow work */ - -#ifdef CONFIG_SYSCTL -static const int slow_work_min_min_threads = 2; -static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT; -static const int slow_work_min_vslow = 1; -static const int slow_work_max_vslow = 99; - -ctl_table slow_work_sysctls[] = { - { - .procname = "min-threads", - .data = &slow_work_min_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_min_threads_sysctl, - .extra1 = (void *) &slow_work_min_min_threads, - .extra2 = &slow_work_max_threads, - }, - { - .procname = "max-threads", - .data = &slow_work_max_threads, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = slow_work_max_threads_sysctl, - .extra1 = &slow_work_min_threads, - .extra2 = (void *) &slow_work_max_max_threads, - }, - { - .procname = "vslow-percentage", - .data = &vslow_work_proportion, - .maxlen = sizeof(unsigned), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = (void *) &slow_work_min_vslow, - .extra2 = (void *) &slow_work_max_vslow, - }, - {} -}; -#endif - -/* - * The active state of the thread pool - */ -static atomic_t slow_work_thread_count; -static atomic_t vslow_work_executing_count; - -static bool slow_work_may_not_start_new_thread; -static bool slow_work_cull; /* cull a thread due to lack of activity */ -static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); -static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); -static struct slow_work slow_work_new_thread; /* new thread starter */ - -/* - * slow work ID allocation (use slow_work_queue_lock) - */ -static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - -/* - * Unregistration tracking to prevent put_ref() from disappearing during module - * unload - */ -#ifdef CONFIG_MODULES -static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT]; -static struct module *slow_work_unreg_module; -static struct slow_work *slow_work_unreg_work_item; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq); -static DEFINE_MUTEX(slow_work_unreg_sync_lock); - -static void slow_work_set_thread_processing(int id, struct slow_work *work) -{ - if (work) - slow_work_thread_processing[id] = work->owner; -} -static void slow_work_done_thread_processing(int id, struct slow_work *work) -{ - struct module *module = slow_work_thread_processing[id]; - - slow_work_thread_processing[id] = NULL; - smp_mb(); - if (slow_work_unreg_work_item == work || - slow_work_unreg_module == module) - wake_up_all(&slow_work_unreg_wq); -} -static void slow_work_clear_thread_processing(int id) -{ - slow_work_thread_processing[id] = NULL; -} -#else -static void slow_work_set_thread_processing(int id, struct slow_work *work) {} -static void slow_work_done_thread_processing(int id, struct slow_work *work) {} -static void slow_work_clear_thread_processing(int id) {} -#endif - -/* - * Data for tracking currently executing items for indication through /proc - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT]; -pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT]; -DEFINE_RWLOCK(slow_work_execs_lock); -#endif - -/* - * The queues of work items and the lock governing access to them. These are - * shared between all the CPUs. It doesn't make sense to have per-CPU queues - * as the number of threads bears no relation to the number of CPUs. - * - * There are two queues of work items: one for slow work items, and one for - * very slow work items. - */ -LIST_HEAD(slow_work_queue); -LIST_HEAD(vslow_work_queue); -DEFINE_SPINLOCK(slow_work_queue_lock); - -/* - * The following are two wait queues that get pinged when a work item is placed - * on an empty queue. These allow work items that are hogging a thread by - * sleeping in a way that could be deferred to yield their thread and enqueue - * themselves. - */ -static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation); -static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation); - -/* - * The thread controls. A variable used to signal to the threads that they - * should exit when the queue is empty, a waitqueue used by the threads to wait - * for signals, and a completion set by the last thread to exit. - */ -static bool slow_work_threads_should_exit; -static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); -static DECLARE_COMPLETION(slow_work_last_thread_exited); - -/* - * The number of users of the thread pool and its lock. Whilst this is zero we - * have no threads hanging around, and when this reaches zero, we wait for all - * active or queued work items to complete and kill all the threads we do have. - */ -static int slow_work_user_count; -static DEFINE_MUTEX(slow_work_user_lock); - -static inline int slow_work_get_ref(struct slow_work *work) -{ - if (work->ops->get_ref) - return work->ops->get_ref(work); - - return 0; -} - -static inline void slow_work_put_ref(struct slow_work *work) -{ - if (work->ops->put_ref) - work->ops->put_ref(work); -} - -/* - * Calculate the maximum number of active threads in the pool that are - * permitted to process very slow work items. - * - * The answer is rounded up to at least 1, but may not equal or exceed the - * maximum number of the threads in the pool. This means we always have at - * least one thread that can process slow work items, and we always have at - * least one thread that won't get tied up doing so. - */ -static unsigned slow_work_calc_vsmax(void) -{ - unsigned vsmax; - - vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; - vsmax /= 100; - vsmax = max(vsmax, 1U); - return min(vsmax, slow_work_max_threads - 1); -} - -/* - * Attempt to execute stuff queued on a slow thread. Return true if we managed - * it, false if there was nothing to do. - */ -static noinline bool slow_work_execute(int id) -{ - struct slow_work *work = NULL; - unsigned vsmax; - bool very_slow; - - vsmax = slow_work_calc_vsmax(); - - /* see if we can schedule a new thread to be started if we're not - * keeping up with the work */ - if (!waitqueue_active(&slow_work_thread_wq) && - (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && - atomic_read(&slow_work_thread_count) < slow_work_max_threads && - !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - - /* find something to execute */ - spin_lock_irq(&slow_work_queue_lock); - if (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax) { - work = list_entry(vslow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - atomic_inc(&vslow_work_executing_count); - very_slow = true; - } else if (!list_empty(&slow_work_queue)) { - work = list_entry(slow_work_queue.next, - struct slow_work, link); - if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) - BUG(); - list_del_init(&work->link); - very_slow = false; - } else { - very_slow = false; /* avoid the compiler warning */ - } - - slow_work_set_thread_processing(id, work); - if (work) { - slow_work_mark_time(work); - slow_work_begin_exec(id, work); - } - - spin_unlock_irq(&slow_work_queue_lock); - - if (!work) - return false; - - if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) - BUG(); - - /* don't execute if the work is in the process of being cancelled */ - if (!test_bit(SLOW_WORK_CANCELLING, &work->flags)) - work->ops->execute(work); - - if (very_slow) - atomic_dec(&vslow_work_executing_count); - clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); - - /* wake up anyone waiting for this work to be complete */ - wake_up_bit(&work->flags, SLOW_WORK_EXECUTING); - - slow_work_end_exec(id, work); - - /* if someone tried to enqueue the item whilst we were executing it, - * then it'll be left unenqueued to avoid multiple threads trying to - * execute it simultaneously - * - * there is, however, a race between us testing the pending flag and - * getting the spinlock, and between the enqueuer setting the pending - * flag and getting the spinlock, so we use a deferral bit to tell us - * if the enqueuer got there first - */ - if (test_bit(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irq(&slow_work_queue_lock); - - if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && - test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) - goto auto_requeue; - - spin_unlock_irq(&slow_work_queue_lock); - } - - /* sort out the race between module unloading and put_ref() */ - slow_work_put_ref(work); - slow_work_done_thread_processing(id, work); - - return true; - -auto_requeue: - /* we must complete the enqueue operation - * - we transfer our ref on the item back to the appropriate queue - * - don't wake another thread up as we're awake already - */ - slow_work_mark_time(work); - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) - list_add_tail(&work->link, &vslow_work_queue); - else - list_add_tail(&work->link, &slow_work_queue); - spin_unlock_irq(&slow_work_queue_lock); - slow_work_clear_thread_processing(id); - return true; -} - -/** - * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work - * work: The work item under execution that wants to sleep - * _timeout: Scheduler sleep timeout - * - * Allow a requeueable work item to sleep on a slow-work processor thread until - * that thread is needed to do some other work or the sleep is interrupted by - * some other event. - * - * The caller must set up a wake up event before calling this and must have set - * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own - * condition before calling this function as no test is made here. - * - * False is returned if there is nothing on the queue; true is returned if the - * work item should be requeued - */ -bool slow_work_sleep_till_thread_needed(struct slow_work *work, - signed long *_timeout) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - - DEFINE_WAIT(wait); - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - if (!list_empty(queue)) - return true; - - add_wait_queue_exclusive(wfo_wq, &wait); - if (list_empty(queue)) - *_timeout = schedule_timeout(*_timeout); - finish_wait(wfo_wq, &wait); - - return !list_empty(queue); -} -EXPORT_SYMBOL(slow_work_sleep_till_thread_needed); - -/** - * slow_work_enqueue - Schedule a slow work item for processing - * @work: The work item to queue - * - * Schedule a slow work item for processing. If the item is already undergoing - * execution, this guarantees not to re-enter the execution routine until the - * first execution finishes. - * - * The item is pinned by this function as it retains a reference to it, managed - * through the item operations. The item is unpinned once it has been - * executed. - * - * An item may hog the thread that is running it for a relatively large amount - * of time, sufficient, for example, to perform several lookup, mkdir, create - * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. - * - * Conversely, if a number of items are awaiting processing, it may take some - * time before any given item is given attention. The number of threads in the - * pool may be increased to deal with demand, but only up to a limit. - * - * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in - * the very slow queue, from which only a portion of the threads will be - * allowed to pick items to execute. This ensures that very slow items won't - * overly block ones that are just ordinarily slow. - * - * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is - * attempted queued) - */ -int slow_work_enqueue(struct slow_work *work) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - unsigned long flags; - int ret; - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - /* when honouring an enqueue request, we only promise that we will run - * the work function in the future; we do not promise to run it once - * per enqueue request - * - * we use the PENDING bit to merge together repeat requests without - * having to disable IRQs and take the spinlock, whilst still - * maintaining our promise - */ - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags))) - goto cancelled; - - /* we promise that we will not attempt to execute the work - * function in more than one thread simultaneously - * - * this, however, leaves us with a problem if we're asked to - * enqueue the work whilst someone is executing the work - * function as simply queueing the work immediately means that - * another thread may try executing it whilst it is already - * under execution - * - * to deal with this, we set the ENQ_DEFERRED bit instead of - * enqueueing, and the thread currently executing the work - * function will enqueue the work item when the work function - * returns and it has cleared the EXECUTING bit - */ - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - } else { - ret = slow_work_get_ref(work); - if (ret < 0) - goto failed; - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - wake_up(&slow_work_thread_wq); - - /* if someone who could be requeued is sleeping on a - * thread, then ask them to yield their thread */ - if (work->link.prev == queue) - wake_up(wfo_wq); - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - return 0; - -cancelled: - ret = -ECANCELED; -failed: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(slow_work_enqueue); - -static int slow_work_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * slow_work_cancel - Cancel a slow work item - * @work: The work item to cancel - * - * This function will cancel a previously enqueued work item. If we cannot - * cancel the work item, it is guarenteed to have run when this function - * returns. - */ -void slow_work_cancel(struct slow_work *work) -{ - bool wait = true, put = false; - - set_bit(SLOW_WORK_CANCELLING, &work->flags); - smp_mb(); - - /* if the work item is a delayed work item with an active timer, we - * need to wait for the timer to finish _before_ getting the spinlock, - * lest we deadlock against the timer routine - * - * the timer routine will leave DELAYED set if it notices the - * CANCELLING flag in time - */ - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - del_timer_sync(&dwork->timer); - } - - spin_lock_irq(&slow_work_queue_lock); - - if (test_bit(SLOW_WORK_DELAYED, &work->flags)) { - /* the timer routine aborted or never happened, so we are left - * holding the timer's reference on the item and should just - * drop the pending flag and wait for any ongoing execution to - * finish */ - struct delayed_slow_work *dwork = - container_of(work, struct delayed_slow_work, work); - - BUG_ON(timer_pending(&dwork->timer)); - BUG_ON(!list_empty(&work->link)); - - clear_bit(SLOW_WORK_DELAYED, &work->flags); - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_bit(SLOW_WORK_PENDING, &work->flags) && - !list_empty(&work->link)) { - /* the link in the pending queue holds a reference on the item - * that we will need to release */ - list_del_init(&work->link); - wait = false; - put = true; - clear_bit(SLOW_WORK_PENDING, &work->flags); - - } else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) { - /* the executor is holding our only reference on the item, so - * we merely need to wait for it to finish executing */ - clear_bit(SLOW_WORK_PENDING, &work->flags); - } - - spin_unlock_irq(&slow_work_queue_lock); - - /* the EXECUTING flag is set by the executor whilst the spinlock is set - * and before the item is dequeued - so assuming the above doesn't - * actually dequeue it, simply waiting for the EXECUTING flag to be - * released here should be sufficient */ - if (wait) - wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait, - TASK_UNINTERRUPTIBLE); - - clear_bit(SLOW_WORK_CANCELLING, &work->flags); - if (put) - slow_work_put_ref(work); -} -EXPORT_SYMBOL(slow_work_cancel); - -/* - * Handle expiry of the delay timer, indicating that a delayed slow work item - * should now be queued if not cancelled - */ -static void delayed_slow_work_timer(unsigned long data) -{ - wait_queue_head_t *wfo_wq; - struct list_head *queue; - struct slow_work *work = (struct slow_work *) data; - unsigned long flags; - bool queued = false, put = false, first = false; - - if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) { - wfo_wq = &vslow_work_queue_waits_for_occupation; - queue = &vslow_work_queue; - } else { - wfo_wq = &slow_work_queue_waits_for_occupation; - queue = &slow_work_queue; - } - - spin_lock_irqsave(&slow_work_queue_lock, flags); - if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) { - clear_bit(SLOW_WORK_DELAYED, &work->flags); - - if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { - /* we discard the reference the timer was holding in - * favour of the one the executor holds */ - set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); - put = true; - } else { - slow_work_mark_time(work); - list_add_tail(&work->link, queue); - queued = true; - if (work->link.prev == queue) - first = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - if (put) - slow_work_put_ref(work); - if (first) - wake_up(wfo_wq); - if (queued) - wake_up(&slow_work_thread_wq); -} - -/** - * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing - * @dwork: The delayed work item to queue - * @delay: When to start executing the work, in jiffies from now - * - * This is similar to slow_work_enqueue(), but it adds a delay before the work - * is actually queued for processing. - * - * The item can have delayed processing requested on it whilst it is being - * executed. The delay will begin immediately, and if it expires before the - * item finishes executing, the item will be placed back on the queue when it - * has done executing. - */ -int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, - unsigned long delay) -{ - struct slow_work *work = &dwork->work; - unsigned long flags; - int ret; - - if (delay == 0) - return slow_work_enqueue(&dwork->work); - - BUG_ON(slow_work_user_count <= 0); - BUG_ON(!work); - BUG_ON(!work->ops); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - return -ECANCELED; - - if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (test_bit(SLOW_WORK_CANCELLING, &work->flags)) - goto cancelled; - - /* the timer holds a reference whilst it is pending */ - ret = slow_work_get_ref(work); - if (ret < 0) - goto cant_get_ref; - - if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags)) - BUG(); - dwork->timer.expires = jiffies + delay; - dwork->timer.data = (unsigned long) work; - dwork->timer.function = delayed_slow_work_timer; - add_timer(&dwork->timer); - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - } - - return 0; - -cancelled: - ret = -ECANCELED; -cant_get_ref: - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return ret; -} -EXPORT_SYMBOL(delayed_slow_work_enqueue); - -/* - * Schedule a cull of the thread pool at some time in the near future - */ -static void slow_work_schedule_cull(void) -{ - mod_timer(&slow_work_cull_timer, - round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); -} - -/* - * Worker thread culling algorithm - */ -static bool slow_work_cull_thread(void) -{ - unsigned long flags; - bool do_cull = false; - - spin_lock_irqsave(&slow_work_queue_lock, flags); - - if (slow_work_cull) { - slow_work_cull = false; - - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) { - slow_work_schedule_cull(); - do_cull = true; - } - } - - spin_unlock_irqrestore(&slow_work_queue_lock, flags); - return do_cull; -} - -/* - * Determine if there is slow work available for dispatch - */ -static inline bool slow_work_available(int vsmax) -{ - return !list_empty(&slow_work_queue) || - (!list_empty(&vslow_work_queue) && - atomic_read(&vslow_work_executing_count) < vsmax); -} - -/* - * Worker thread dispatcher - */ -static int slow_work_thread(void *_data) -{ - int vsmax, id; - - DEFINE_WAIT(wait); - - set_freezable(); - set_user_nice(current, -5); - - /* allocate ourselves an ID */ - spin_lock_irq(&slow_work_queue_lock); - id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT); - BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT); - __set_bit(id, slow_work_ids); - slow_work_set_thread_pid(id, current->pid); - spin_unlock_irq(&slow_work_queue_lock); - - sprintf(current->comm, "kslowd%03u", id); - - for (;;) { - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, - TASK_INTERRUPTIBLE); - if (!freezing(current) && - !slow_work_threads_should_exit && - !slow_work_available(vsmax) && - !slow_work_cull) - schedule(); - finish_wait(&slow_work_thread_wq, &wait); - - try_to_freeze(); - - vsmax = vslow_work_proportion; - vsmax *= atomic_read(&slow_work_thread_count); - vsmax /= 100; - - if (slow_work_available(vsmax) && slow_work_execute(id)) { - cond_resched(); - if (list_empty(&slow_work_queue) && - list_empty(&vslow_work_queue) && - atomic_read(&slow_work_thread_count) > - slow_work_min_threads) - slow_work_schedule_cull(); - continue; - } - - if (slow_work_threads_should_exit) - break; - - if (slow_work_cull && slow_work_cull_thread()) - break; - } - - spin_lock_irq(&slow_work_queue_lock); - slow_work_set_thread_pid(id, 0); - __clear_bit(id, slow_work_ids); - spin_unlock_irq(&slow_work_queue_lock); - - if (atomic_dec_and_test(&slow_work_thread_count)) - complete_and_exit(&slow_work_last_thread_exited, 0); - return 0; -} - -/* - * Handle thread cull timer expiration - */ -static void slow_work_cull_timeout(unsigned long data) -{ - slow_work_cull = true; - wake_up(&slow_work_thread_wq); -} - -/* - * Start a new slow work thread - */ -static void slow_work_new_thread_execute(struct slow_work *work) -{ - struct task_struct *p; - - if (slow_work_threads_should_exit) - return; - - if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) - return; - - if (!mutex_trylock(&slow_work_user_lock)) - return; - - slow_work_may_not_start_new_thread = true; - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) { - printk(KERN_DEBUG "Slow work thread pool: OOM\n"); - if (atomic_dec_and_test(&slow_work_thread_count)) - BUG(); /* we're running on a slow work thread... */ - mod_timer(&slow_work_oom_timer, - round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); - } else { - /* ratelimit the starting of new threads */ - mod_timer(&slow_work_oom_timer, jiffies + 1); - } - - mutex_unlock(&slow_work_user_lock); -} - -static const struct slow_work_ops slow_work_new_thread_ops = { - .owner = THIS_MODULE, - .execute = slow_work_new_thread_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = slow_work_new_thread_desc, -#endif -}; - -/* - * post-OOM new thread start suppression expiration - */ -static void slow_work_oom_timeout(unsigned long data) -{ - slow_work_may_not_start_new_thread = false; -} - -#ifdef CONFIG_SYSCTL -/* - * Handle adjustment of the minimum number of threads - */ -static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to start or stop threads */ - n = atomic_read(&slow_work_thread_count) - - slow_work_min_threads; - - if (n < 0 && !slow_work_may_not_start_new_thread) - slow_work_enqueue(&slow_work_new_thread); - else if (n > 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} - -/* - * Handle adjustment of the maximum number of threads - */ -static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - int n; - - if (ret == 0) { - mutex_lock(&slow_work_user_lock); - if (slow_work_user_count > 0) { - /* see if we need to stop threads */ - n = slow_work_max_threads - - atomic_read(&slow_work_thread_count); - - if (n < 0) - slow_work_schedule_cull(); - } - mutex_unlock(&slow_work_user_lock); - } - - return ret; -} -#endif /* CONFIG_SYSCTL */ - -/** - * slow_work_register_user - Register a user of the facility - * @module: The module about to make use of the facility - * - * Register a user of the facility, starting up the initial threads if there - * aren't any other users at this point. This will return 0 if successful, or - * an error if not. - */ -int slow_work_register_user(struct module *module) -{ - struct task_struct *p; - int loop; - - mutex_lock(&slow_work_user_lock); - - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); - init_completion(&slow_work_last_thread_exited); - - slow_work_threads_should_exit = false; - slow_work_init(&slow_work_new_thread, - &slow_work_new_thread_ops); - slow_work_may_not_start_new_thread = false; - slow_work_cull = false; - - /* start the minimum number of threads */ - for (loop = 0; loop < slow_work_min_threads; loop++) { - atomic_inc(&slow_work_thread_count); - p = kthread_run(slow_work_thread, NULL, "kslowd"); - if (IS_ERR(p)) - goto error; - } - printk(KERN_NOTICE "Slow work thread pool: Ready\n"); - } - - slow_work_user_count++; - mutex_unlock(&slow_work_user_lock); - return 0; - -error: - if (atomic_dec_and_test(&slow_work_thread_count)) - complete(&slow_work_last_thread_exited); - if (loop > 0) { - printk(KERN_ERR "Slow work thread pool:" - " Aborting startup on ENOMEM\n"); - slow_work_threads_should_exit = true; - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_ERR "Slow work thread pool: Aborted\n"); - } - mutex_unlock(&slow_work_user_lock); - return PTR_ERR(p); -} -EXPORT_SYMBOL(slow_work_register_user); - -/* - * wait for all outstanding items from the calling module to complete - * - note that more items may be queued whilst we're waiting - */ -static void slow_work_wait_for_items(struct module *module) -{ -#ifdef CONFIG_MODULES - DECLARE_WAITQUEUE(myself, current); - struct slow_work *work; - int loop; - - mutex_lock(&slow_work_unreg_sync_lock); - add_wait_queue(&slow_work_unreg_wq, &myself); - - for (;;) { - spin_lock_irq(&slow_work_queue_lock); - - /* first of all, we wait for the last queued item in each list - * to be processed */ - list_for_each_entry_reverse(work, &vslow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - list_for_each_entry_reverse(work, &slow_work_queue, link) { - if (work->owner == module) { - set_current_state(TASK_UNINTERRUPTIBLE); - slow_work_unreg_work_item = work; - goto do_wait; - } - } - - /* then we wait for the items being processed to finish */ - slow_work_unreg_module = module; - smp_mb(); - for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) { - if (slow_work_thread_processing[loop] == module) - goto do_wait; - } - spin_unlock_irq(&slow_work_queue_lock); - break; /* okay, we're done */ - - do_wait: - spin_unlock_irq(&slow_work_queue_lock); - schedule(); - slow_work_unreg_work_item = NULL; - slow_work_unreg_module = NULL; - } - - remove_wait_queue(&slow_work_unreg_wq, &myself); - mutex_unlock(&slow_work_unreg_sync_lock); -#endif /* CONFIG_MODULES */ -} - -/** - * slow_work_unregister_user - Unregister a user of the facility - * @module: The module whose items should be cleared - * - * Unregister a user of the facility, killing all the threads if this was the - * last one. - * - * This waits for all the work items belonging to the nominated module to go - * away before proceeding. - */ -void slow_work_unregister_user(struct module *module) -{ - /* first of all, wait for all outstanding items from the calling module - * to complete */ - if (module) - slow_work_wait_for_items(module); - - /* then we can actually go about shutting down the facility if need - * be */ - mutex_lock(&slow_work_user_lock); - - BUG_ON(slow_work_user_count <= 0); - - slow_work_user_count--; - if (slow_work_user_count == 0) { - printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); - slow_work_threads_should_exit = true; - del_timer_sync(&slow_work_cull_timer); - del_timer_sync(&slow_work_oom_timer); - wake_up_all(&slow_work_thread_wq); - wait_for_completion(&slow_work_last_thread_exited); - printk(KERN_NOTICE "Slow work thread pool:" - " Shut down complete\n"); - } - - mutex_unlock(&slow_work_user_lock); -} -EXPORT_SYMBOL(slow_work_unregister_user); - -/* - * Initialise the slow work facility - */ -static int __init init_slow_work(void) -{ - unsigned nr_cpus = num_possible_cpus(); - - if (slow_work_max_threads < nr_cpus) - slow_work_max_threads = nr_cpus; -#ifdef CONFIG_SYSCTL - if (slow_work_max_max_threads < nr_cpus * 2) - slow_work_max_max_threads = nr_cpus * 2; -#endif -#ifdef CONFIG_SLOW_WORK_DEBUG - { - struct dentry *dbdir; - - dbdir = debugfs_create_dir("slow_work", NULL); - if (dbdir && !IS_ERR(dbdir)) - debugfs_create_file("runqueue", S_IFREG | 0400, dbdir, - NULL, &slow_work_runqueue_fops); - } -#endif - return 0; -} - -subsys_initcall(init_slow_work); diff --git a/kernel/slow-work.h b/kernel/slow-work.h deleted file mode 100644 index a29ebd1..0000000 --- a/kernel/slow-work.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Slow work private definitions - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of - * things to do */ -#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after - * OOM */ - -#define SLOW_WORK_THREAD_LIMIT 255 /* abs maximum number of slow-work threads */ - -/* - * slow-work.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern struct slow_work *slow_work_execs[]; -extern pid_t slow_work_pids[]; -extern rwlock_t slow_work_execs_lock; -#endif - -extern struct list_head slow_work_queue; -extern struct list_head vslow_work_queue; -extern spinlock_t slow_work_queue_lock; - -/* - * slow-work-debugfs.c - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -extern const struct file_operations slow_work_runqueue_fops; - -extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); -#endif - -/* - * Helper functions - */ -static inline void slow_work_set_thread_pid(int id, pid_t pid) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_pids[id] = pid; -#endif -} - -static inline void slow_work_mark_time(struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - work->mark = CURRENT_TIME; -#endif -} - -static inline void slow_work_begin_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - slow_work_execs[id] = work; -#endif -} - -static inline void slow_work_end_exec(int id, struct slow_work *work) -{ -#ifdef CONFIG_SLOW_WORK_DEBUG - write_lock(&slow_work_execs_lock); - slow_work_execs[id] = NULL; - write_unlock(&slow_work_execs_lock); -#endif -} diff --git a/kernel/smp.c b/kernel/smp.c index 75c970c..ed6aacf 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -365,9 +365,10 @@ call: EXPORT_SYMBOL_GPL(smp_call_function_any); /** - * __smp_call_function_single(): Run a function on another CPU + * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside @@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); void __smp_call_function_single(int cpu, struct call_single_data *data, int wait) { - csd_lock(data); + unsigned int this_cpu; + unsigned long flags; + this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can @@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() && !oops_in_progress); - generic_exec_single(cpu, data, wait); + if (cpu == this_cpu) { + local_irq_save(flags); + data->func(data->info); + local_irq_restore(flags); + } else { + csd_lock(data); + generic_exec_single(cpu, data, wait); + } + put_cpu(); } /** diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b..fc97888 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,11 +77,21 @@ void wakeup_softirqd(void) } /* + * preempt_count and SOFTIRQ_OFFSET usage: + * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving + * softirq processing. + * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable. + * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip) +static void __local_bh_disable(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) * We must manually increment preempt_count here and manually * call the trace_preempt_off later. */ - preempt_count() += SOFTIRQ_OFFSET; + preempt_count() += cnt; /* * Were softirqs turned off above: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == SOFTIRQ_OFFSET) + if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip) +static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - add_preempt_count(SOFTIRQ_OFFSET); + add_preempt_count(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); +static void __local_bh_enable(unsigned int cnt) +{ + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == cnt) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(cnt); +} + /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); - WARN_ON_ONCE(!irqs_disabled()); - - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); - sub_preempt_count(SOFTIRQ_OFFSET); + __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); @@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) /* * Are softirqs going to be turned on now: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) trace_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); + sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); @@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -245,7 +262,7 @@ restart: lockdep_softirq_exit(); account_system_vtime(current); - _local_bh_enable(); + __local_bh_enable(SOFTIRQ_OFFSET); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -279,10 +296,16 @@ void irq_enter(void) rcu_irq_enter(); if (idle_cpu(cpu) && !in_interrupt()) { - __irq_enter(); + /* + * Prevent raise_softirq from needlessly waking up ksoftirqd + * here, as softirq will be serviced on return from interrupt. + */ + local_bh_disable(); tick_check_idle(cpu); - } else - __irq_enter(); + _local_bh_enable(); + } + + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); + current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { @@ -886,17 +910,14 @@ int __init __weak early_irq_init(void) return 0; } +#ifdef CONFIG_GENERIC_HARDIRQS int __init __weak arch_probe_nr_irqs(void) { - return 0; + return NR_IRQS_LEGACY; } int __init __weak arch_early_irq_init(void) { return 0; } - -int __weak arch_init_chip_data(struct irq_desc *desc, int node) -{ - return 0; -} +#endif diff --git a/kernel/softlockup.c b/kernel/softlockup.c deleted file mode 100644 index 4b493f6..0000000 --- a/kernel/softlockup.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Detect Soft Lockups - * - * started by Ingo Molnar, Copyright (C) 2005, 2006 Red Hat, Inc. - * - * this code detects soft lockups: incidents in where on a CPU - * the kernel does not reschedule for 10 seconds or more. - */ -#include <linux/mm.h> -#include <linux/cpu.h> -#include <linux/nmi.h> -#include <linux/init.h> -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/lockdep.h> -#include <linux/notifier.h> -#include <linux/module.h> -#include <linux/sysctl.h> - -#include <asm/irq_regs.h> - -static DEFINE_SPINLOCK(print_lock); - -static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ -static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ -static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); -static DEFINE_PER_CPU(bool, softlock_touch_sync); - -static int __read_mostly did_panic; -int __read_mostly softlockup_thresh = 60; - -/* - * Should we panic (and reboot, if panic_timeout= is set) when a - * soft-lockup occurs: - */ -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - -static int -softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = softlock_panic, -}; - -/* - * Returns seconds, approximately. We don't need nanosecond - * resolution, and we don't need to waste time with a big divide when - * 2^30ns == 1.074s. - */ -static unsigned long get_timestamp(int this_cpu) -{ - return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ -} - -static void __touch_softlockup_watchdog(void) -{ - int this_cpu = raw_smp_processor_id(); - - __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu); -} - -void touch_softlockup_watchdog(void) -{ - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} -EXPORT_SYMBOL(touch_softlockup_watchdog); - -void touch_softlockup_watchdog_sync(void) -{ - __raw_get_cpu_var(softlock_touch_sync) = true; - __raw_get_cpu_var(softlockup_touch_ts) = 0; -} - -void touch_all_softlockup_watchdogs(void) -{ - int cpu; - - /* Cause each CPU to re-update its timestamp rather than complain */ - for_each_online_cpu(cpu) - per_cpu(softlockup_touch_ts, cpu) = 0; -} -EXPORT_SYMBOL(touch_all_softlockup_watchdogs); - -int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} - -/* - * This callback runs from the timer interrupt, and checks - * whether the watchdog thread has hung or not: - */ -void softlockup_tick(void) -{ - int this_cpu = smp_processor_id(); - unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu); - unsigned long print_ts; - struct pt_regs *regs = get_irq_regs(); - unsigned long now; - - /* Is detection switched off? */ - if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) { - /* Be sure we don't false trigger if switched back on */ - if (touch_ts) - per_cpu(softlockup_touch_ts, this_cpu) = 0; - return; - } - - if (touch_ts == 0) { - if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { - /* - * If the time stamp was touched atomically - * make sure the scheduler tick is up to date. - */ - per_cpu(softlock_touch_sync, this_cpu) = false; - sched_clock_tick(); - } - __touch_softlockup_watchdog(); - return; - } - - print_ts = per_cpu(softlockup_print_ts, this_cpu); - - /* report at most once a second */ - if (print_ts == touch_ts || did_panic) - return; - - /* do not print during early bootup: */ - if (unlikely(system_state != SYSTEM_RUNNING)) { - __touch_softlockup_watchdog(); - return; - } - - now = get_timestamp(this_cpu); - - /* - * Wake up the high-prio watchdog task twice per - * threshold timespan. - */ - if (time_after(now - softlockup_thresh/2, touch_ts)) - wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); - - /* Warn about unreasonable delays: */ - if (time_before_eq(now - softlockup_thresh, touch_ts)) - return; - - per_cpu(softlockup_print_ts, this_cpu) = touch_ts; - - spin_lock(&print_lock); - printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", - this_cpu, now - touch_ts, - current->comm, task_pid_nr(current)); - print_modules(); - print_irqtrace_events(current); - if (regs) - show_regs(regs); - else - dump_stack(); - spin_unlock(&print_lock); - - if (softlockup_panic) - panic("softlockup: hung tasks"); -} - -/* - * The watchdog thread - runs every second and touches the timestamp. - */ -static int watchdog(void *__bind_cpu) -{ - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - - sched_setscheduler(current, SCHED_FIFO, ¶m); - - /* initialize timestamp */ - __touch_softlockup_watchdog(); - - set_current_state(TASK_INTERRUPTIBLE); - /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in softlockup_tick(). - */ - while (!kthread_should_stop()) { - __touch_softlockup_watchdog(); - schedule(); - - if (kthread_should_stop()) - break; - - set_current_state(TASK_INTERRUPTIBLE); - } - __set_current_state(TASK_RUNNING); - - return 0; -} - -/* - * Create/destroy watchdog threads as CPUs come and go: - */ -static int __cpuinit -cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct task_struct *p; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - BUG_ON(per_cpu(softlockup_watchdog, hotcpu)); - p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); - if (IS_ERR(p)) { - printk(KERN_ERR "watchdog for %i failed\n", hotcpu); - return NOTIFY_BAD; - } - per_cpu(softlockup_touch_ts, hotcpu) = 0; - per_cpu(softlockup_watchdog, hotcpu) = p; - kthread_bind(p, hotcpu); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - wake_up_process(per_cpu(softlockup_watchdog, hotcpu)); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!per_cpu(softlockup_watchdog, hotcpu)) - break; - /* Unbind so it can run. Fall thru. */ - kthread_bind(per_cpu(softlockup_watchdog, hotcpu), - cpumask_any(cpu_online_mask)); - case CPU_DEAD: - case CPU_DEAD_FROZEN: - p = per_cpu(softlockup_watchdog, hotcpu); - per_cpu(softlockup_watchdog, hotcpu) = NULL; - kthread_stop(p); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata cpu_nfb = { - .notifier_call = cpu_callback -}; - -static int __initdata nosoftlockup; - -static int __init nosoftlockup_setup(char *str) -{ - nosoftlockup = 1; - return 1; -} -__setup("nosoftlockup", nosoftlockup_setup); - -static int __init spawn_softlockup_task(void) -{ - void *cpu = (void *)(long)smp_processor_id(); - int err; - - if (nosoftlockup) - return 0; - - err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - if (err == NOTIFY_BAD) { - BUG(); - return 1; - } - cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); - register_cpu_notifier(&cpu_nfb); - - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - - return 0; -} -early_initcall(spawn_softlockup_task); diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3..c71e075 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) int __init_srcu_struct(struct srcu_struct *sp, const char *name, struct lock_class_key *key) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 70f8d90..090c288 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -35,9 +35,9 @@ struct cpu_stop_done { /* the actual stopper, one per every possible cpu, enabled on online cpus */ struct cpu_stopper { spinlock_t lock; + bool enabled; /* is this stopper enabled? */ struct list_head works; /* list of pending works */ struct task_struct *thread; /* stopper thread */ - bool enabled; /* is this stopper enabled? */ }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); @@ -287,11 +287,12 @@ repeat: goto repeat; } +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct task_struct *p; @@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, cpu); if (IS_ERR(p)) return NOTIFY_BAD; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); get_task_struct(p); + kthread_bind(p, cpu); + sched_set_stop_task(cpu, p); stopper->thread = p; break; case CPU_ONLINE: - kthread_bind(stopper->thread, cpu); /* strictly unnecessary, as first user will wake it */ wake_up_process(stopper->thread); /* mark enabled */ @@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, { struct cpu_stop_work *work; + sched_set_stop_task(cpu, NULL); /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ diff --git a/kernel/sys.c b/kernel/sys.c index e83ddbb..7f5a0cd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) pgid = pid; if (pgid < 0) return -EINVAL; + rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); return err; } @@ -1236,15 +1238,14 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) { - if (resource >= RLIM_NLIMITS) - return -EINVAL; - else { - struct rlimit value; - task_lock(current->group_leader); - value = current->signal->rlim[resource]; - task_unlock(current->group_leader); - return copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; - } + struct rlimit value; + int ret; + + ret = do_prlimit(current, resource, NULL, &value); + if (!ret) + ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + + return ret; } #ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT @@ -1272,44 +1273,89 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, #endif -SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +static inline bool rlim64_is_infinity(__u64 rlim64) { - struct rlimit new_rlim, *old_rlim; - int retval; +#if BITS_PER_LONG < 64 + return rlim64 >= ULONG_MAX; +#else + return rlim64 == RLIM64_INFINITY; +#endif +} + +static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) +{ + if (rlim->rlim_cur == RLIM_INFINITY) + rlim64->rlim_cur = RLIM64_INFINITY; + else + rlim64->rlim_cur = rlim->rlim_cur; + if (rlim->rlim_max == RLIM_INFINITY) + rlim64->rlim_max = RLIM64_INFINITY; + else + rlim64->rlim_max = rlim->rlim_max; +} + +static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) +{ + if (rlim64_is_infinity(rlim64->rlim_cur)) + rlim->rlim_cur = RLIM_INFINITY; + else + rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; + if (rlim64_is_infinity(rlim64->rlim_max)) + rlim->rlim_max = RLIM_INFINITY; + else + rlim->rlim_max = (unsigned long)rlim64->rlim_max; +} + +/* make sure you are allowed to change @tsk limits before calling this */ +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; if (resource >= RLIM_NLIMITS) return -EINVAL; - if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) - return -EFAULT; - if (new_rlim.rlim_cur > new_rlim.rlim_max) - return -EINVAL; - old_rlim = current->signal->rlim + resource; - if ((new_rlim.rlim_max > old_rlim->rlim_max) && - !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) - return -EPERM; - - retval = security_task_setrlimit(resource, &new_rlim); - if (retval) - return retval; - - if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) { - /* - * The caller is asking for an immediate RLIMIT_CPU - * expiry. But we use the zero value to mean "it was - * never set". So let's cheat and make it one second - * instead - */ - new_rlim.rlim_cur = 1; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; } - task_lock(current->group_leader); - *old_rlim = new_rlim; - task_unlock(current->group_leader); - - if (resource != RLIMIT_CPU) + /* protect tsk->signal and tsk->sighand from disappearing */ + read_lock(&tasklist_lock); + if (!tsk->sighand) { + retval = -ESRCH; goto out; + } + + rlim = tsk->signal->rlim + resource; + task_lock(tsk->group_leader); + if (new_rlim) { + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, + resource, new_rlim); + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim->rlim_cur = 1; + } + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); /* * RLIMIT_CPU handling. Note that the kernel fails to return an error @@ -1317,14 +1363,84 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) * very long-standing error, and fixing it now risks breakage of * applications, so we live with it */ - if (new_rlim.rlim_cur == RLIM_INFINITY) - goto out; - - update_rlimit_cpu(new_rlim.rlim_cur); + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY) + update_rlimit_cpu(tsk, new_rlim->rlim_cur); out: + read_unlock(&tasklist_lock); + return retval; +} + +/* rcu lock must be held */ +static int check_prlimit_permission(struct task_struct *task) +{ + const struct cred *cred = current_cred(), *tcred; + + tcred = __task_cred(task); + if ((cred->uid != tcred->euid || + cred->uid != tcred->suid || + cred->uid != tcred->uid || + cred->gid != tcred->egid || + cred->gid != tcred->sgid || + cred->gid != tcred->gid) && + !capable(CAP_SYS_RESOURCE)) { + return -EPERM; + } + return 0; } +SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, + const struct rlimit64 __user *, new_rlim, + struct rlimit64 __user *, old_rlim) +{ + struct rlimit64 old64, new64; + struct rlimit old, new; + struct task_struct *tsk; + int ret; + + if (new_rlim) { + if (copy_from_user(&new64, new_rlim, sizeof(new64))) + return -EFAULT; + rlim64_to_rlim(&new64, &new); + } + + rcu_read_lock(); + tsk = pid ? find_task_by_vpid(pid) : current; + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + ret = check_prlimit_permission(tsk); + if (ret) { + rcu_read_unlock(); + return ret; + } + get_task_struct(tsk); + rcu_read_unlock(); + + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + + if (!ret && old_rlim) { + rlim_to_rlim64(&old, &old64); + if (copy_to_user(old_rlim, &old64, sizeof(old64))) + ret = -EFAULT; + } + + put_task_struct(tsk); + return ret; +} + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_prlimit(current, resource, &new_rlim, NULL); +} + /* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 70f2ea7..c782fe9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -50,6 +50,7 @@ cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recv); cond_syscall(compat_sys_recvfrom); cond_syscall(compat_sys_recvmmsg); cond_syscall(sys_socketcall); @@ -181,3 +182,7 @@ cond_syscall(sys_eventfd2); /* performance counters: */ cond_syscall(sys_perf_event_open); + +/* fanotify! */ +cond_syscall(sys_fanotify_init); +cond_syscall(sys_fanotify_mark); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d24f761..3a45c22 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,16 +44,17 @@ #include <linux/times.h> #include <linux/limits.h> #include <linux/dcache.h> +#include <linux/dnotify.h> #include <linux/syscalls.h> #include <linux/vmstat.h> #include <linux/nfs_fs.h> #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> -#include <linux/slow-work.h> #include <linux/perf_event.h> #include <linux/kprobes.h> #include <linux/pipe_fs_i.h> +#include <linux/oom.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -76,15 +77,16 @@ #include <scsi/sg.h> #endif +#ifdef CONFIG_LOCKUP_DETECTOR +#include <linux/nmi.h> +#endif + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -extern int sysctl_panic_on_oom; -extern int sysctl_oom_kill_allocating_task; -extern int sysctl_oom_dump_tasks; extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; @@ -106,7 +108,7 @@ extern int blk_iopoll_enabled; #endif /* Constants used for minimum and maximum */ -#ifdef CONFIG_DETECT_SOFTLOCKUP +#ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; static int neg_one = -1; #endif @@ -130,6 +132,9 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; +#ifdef CONFIG_INOTIFY_USER +#include <linux/inotify.h> +#endif #ifdef CONFIG_SPARC #include <asm/system.h> #endif @@ -206,9 +211,6 @@ static struct ctl_table fs_table[]; static struct ctl_table debug_table[]; static struct ctl_table dev_table[]; extern struct ctl_table random_table[]; -#ifdef CONFIG_INOTIFY_USER -extern struct ctl_table inotify_table[]; -#endif #ifdef CONFIG_EPOLL extern struct ctl_table epoll_table[]; #endif @@ -562,7 +564,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif -#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) +#ifdef CONFIG_HOTPLUG { .procname = "hotplug", .data = &uevent_helper, @@ -710,7 +712,34 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog_enabled, + }, + { + .procname = "watchdog_thresh", + .data = &softlockup_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog_thresh, + .extra1 = &neg_one, + .extra2 = &sixty, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR) { .procname = "unknown_nmi_panic", .data = &unknown_nmi_panic, @@ -813,26 +842,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_SOFTLOCKUP - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - .procname = "softlockup_thresh", - .data = &softlockup_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dosoftlockup_thresh, - .extra1 = &neg_one, - .extra2 = &sixty, - }, -#endif #ifdef CONFIG_DETECT_HUNG_TASK { .procname = "hung_task_panic", @@ -906,13 +915,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_SLOW_WORK - { - .procname = "slow-work", - .mode = 0555, - .child = slow_work_sysctls, - }, -#endif #ifdef CONFIG_PERF_EVENTS { .procname = "perf_event_paranoid", @@ -1711,10 +1713,7 @@ static __init int sysctl_init(void) { sysctl_set_parent(NULL, root_table); #ifdef CONFIG_SYSCTL_SYSCALL_CHECK - { - int err; - err = sysctl_check_table(current->nsproxy, root_table); - } + sysctl_check_table(current->nsproxy, root_table); #endif return 0; } @@ -2486,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int kbuf[left] = 0; } - for (; left && vleft--; i++, min++, max++, first=0) { + for (; left && vleft--; i++, first = 0) { unsigned long val; if (write) { diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index 04cdcf7..10b90d8 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c @@ -143,15 +143,6 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) if (!table->maxlen) set_fail(&fail, table, "No maxlen"); } - if ((table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (table->maxlen > sizeof (unsigned long)) { - if (!table->extra1) - set_fail(&fail, table, "No min"); - if (!table->extra2) - set_fail(&fail, table, "No max"); - } - } #ifdef CONFIG_PROC_SYSCTL if (table->procname && !table->proc_handler) set_fail(&fail, table, "No proc_handler"); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f10451..f8b11a2 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -115,7 +115,9 @@ static int test_kprobes(void) int ret; struct kprobe *kps[2] = {&kp, &kp2}; - kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + kp.addr = NULL; + kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -210,7 +212,9 @@ static int test_jprobes(void) int ret; struct jprobe *jps[2] = {&jp, &jp2}; - jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + jp.kp.addr = NULL; + jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -323,7 +327,9 @@ static int test_kretprobes(void) int ret; struct kretprobe *rps[2] = {&rp, &rp2}; - rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + rp.kp.addr = NULL; + rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " diff --git a/kernel/time.c b/kernel/time.c index 848b1c2..ba9b338 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -300,22 +300,6 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran) } EXPORT_SYMBOL(timespec_trunc); -#ifndef CONFIG_GENERIC_TIME -/* - * Simulate gettimeofday using do_gettimeofday which only allows a timeval - * and therefore only yields usec accuracy - */ -void getnstimeofday(struct timespec *tv) -{ - struct timeval x; - - do_gettimeofday(&x); - tv->tv_sec = x.tv_sec; - tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; -} -EXPORT_SYMBOL_GPL(getnstimeofday); -#endif - /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 95ed429..f06a8a3 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -6,7 +6,7 @@ config TICK_ONESHOT config NO_HZ bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables a tickless system: timer interrupts will @@ -15,7 +15,7 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME && GENERIC_CLOCKEVENTS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f08e99c..c18d7ef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -531,7 +531,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) return max_nsecs - (max_nsecs >> 5); } -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET /** * clocksource_select - Select the best clocksource available @@ -577,7 +577,7 @@ static void clocksource_select(void) } } -#else /* CONFIG_GENERIC_TIME */ +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ static inline void clocksource_select(void) { } @@ -639,19 +639,18 @@ static void clocksource_enqueue(struct clocksource *cs) #define MAX_UPDATE_LENGTH 5 /* Seconds */ /** - * __clocksource_register_scale - Used to install new clocksources + * __clocksource_updatefreq_scale - Used update clocksource with new freq * @t: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * - * Returns -EBUSY if registration fails, zero otherwise. + * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_register_hz() or clocksource_register_khz helper functions. + * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. */ -int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* * Ideally we want to use some of the limits used in * clocksource_max_deferment, to provide a more informed @@ -662,7 +661,27 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC/scale, MAX_UPDATE_LENGTH*scale); cs->max_idle_ns = clocksource_max_deferment(cs); +} +EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. + */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* Intialize mult/shift and max_idle_ns */ + __clocksource_updatefreq_scale(cs, scale, freq); + /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); clocksource_select(); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c631168..d232189 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -149,10 +149,18 @@ static void ntp_update_offset(long offset) time_reftime = get_seconds(); offset64 = offset; - freq_adj = (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + freq_adj = ntp_update_offset_fll(offset64, secs); - freq_adj += ntp_update_offset_fll(offset64, secs); + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b3bafd5..48b2761 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have * periodic mode. We read dev->next_event first and add to it - * when the event alrady expired. clockevents_program_event() + * when the event already expired. clockevents_program_event() * sets dev->next_event only when the event is really * programmed to the device. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 783fbad..3e216e0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now) * Updates the per cpu time idle statistics counters */ static void -update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - if (nr_iowait_cpu() > 0) + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { ktime_t now; now = ktime_get(); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } @@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->iowait_sleeptime); } @@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates @@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { + arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { @@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -780,7 +774,6 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); - u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -790,10 +783,6 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); - hrtimer_add_expires_ns(&ts->sched_timer, offset); for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index caf8d4d..49010d8 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -153,8 +153,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); * - wall_to_monotonic is no longer the boot time, getboottime must be * used instead. */ -struct timespec xtime __attribute__ ((aligned (16))); -struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); static struct timespec total_sleep_time; /* @@ -170,11 +170,10 @@ void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } -#ifdef CONFIG_GENERIC_TIME - /** * timekeeping_forward_now - update clock to the current time * @@ -328,7 +327,8 @@ int do_settimeofday(struct timespec *tv) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -376,52 +376,6 @@ void timekeeping_notify(struct clocksource *clock) tick_clock_notify(); } -#else /* GENERIC_TIME */ - -static inline void timekeeping_forward_now(void) { } - -/** - * ktime_get - get the monotonic time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get(void) -{ - struct timespec now; - - ktime_get_ts(&now); - - return timespec_to_ktime(now); -} -EXPORT_SYMBOL_GPL(ktime_get); - -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts: pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. - */ -void ktime_get_ts(struct timespec *ts) -{ - struct timespec tomono; - unsigned long seq; - - do { - seq = read_seqbegin(&xtime_lock); - getnstimeofday(ts); - tomono = wall_to_monotonic; - - } while (read_seqretry(&xtime_lock, seq)); - - set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, - ts->tv_nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); - -#endif /* !GENERIC_TIME */ - /** * ktime_get_real - get the real (wall-) time in ktime_t format * @@ -579,9 +533,9 @@ static int timekeeping_resume(struct sys_device *dev) if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); - xtime = timespec_add_safe(xtime, ts); + xtime = timespec_add(xtime, ts); wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); - total_sleep_time = timespec_add_safe(total_sleep_time, ts); + total_sleep_time = timespec_add(total_sleep_time, ts); } /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); @@ -736,6 +690,7 @@ static void timekeeping_adjust(s64 offset) static cycle_t logarithmic_accumulation(cycle_t offset, int shift) { u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + u64 raw_nsecs; /* If the offset is smaller then a shifted interval, do nothing */ if (offset < timekeeper.cycle_interval<<shift) @@ -752,12 +707,15 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) second_overflow(); } - /* Accumulate into raw time */ - raw_time.tv_nsec += timekeeper.raw_interval << shift;; - while (raw_time.tv_nsec >= NSEC_PER_SEC) { - raw_time.tv_nsec -= NSEC_PER_SEC; - raw_time.tv_sec++; + /* Accumulate raw time */ + raw_nsecs = timekeeper.raw_interval << shift; + raw_nsecs += raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; + raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); + raw_time.tv_sec += raw_secs; } + raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; @@ -784,10 +742,11 @@ void update_wall_time(void) return; clock = timekeeper.clock; -#ifdef CONFIG_GENERIC_TIME - offset = (clock->read(clock) - clock->cycle_last) & clock->mask; -#else + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = timekeeper.cycle_interval; +#else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; @@ -856,7 +815,8 @@ void update_wall_time(void) } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); } /** @@ -887,7 +847,7 @@ EXPORT_SYMBOL_GPL(getboottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add_safe(*ts, total_sleep_time); + *ts = timespec_add(*ts, total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); @@ -902,6 +862,11 @@ struct timespec __current_kernel_time(void) return xtime; } +struct timespec __get_wall_to_monotonic(void) +{ + return wall_to_monotonic; +} + struct timespec current_kernel_time(void) { struct timespec now; diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8..68a9ae7 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_event.h> +#include <linux/irq_work.h> #include <linux/sched.h> #include <linux/slab.h> @@ -90,8 +90,13 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* * Note that all tvec_bases are 2 byte aligned and lower bit of - * base in timer_list is guaranteed to be zero. Use the LSB for - * the new flag to indicate whether the timer is deferrable + * base in timer_list is guaranteed to be zero. Use the LSB to + * indicate whether the timer is deferrable. + * + * A deferrable timer will work normally when the system is busy, but + * will not cause a CPU to come out of idle just to service it; instead, + * the timer will be serviced when the CPU eventually wakes up with a + * subsequent non-deferrable timer. */ #define TBASE_DEFERRABLE_FLAG (0x1) @@ -321,6 +326,7 @@ EXPORT_SYMBOL_GPL(round_jiffies_up_relative); /** * set_timer_slack - set the allowed slack for a timer + * @timer: the timer to be modified * @slack_hz: the amount of time (in jiffies) allowed for rounding * * Set the amount of time, in jiffies, that a certain timer has @@ -577,6 +583,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -679,12 +698,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); @@ -1264,7 +1279,10 @@ void update_process_times(int user_tick) run_local_timers(); rcu_check_callbacks(cpu, user_tick); printk_tick(); - perf_event_do_pending(); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_run(); +#endif scheduler_tick(); run_posix_cpu_timers(p); } @@ -1289,7 +1307,6 @@ void run_local_timers(void) { hrtimer_run_queues(); raise_softirq(TIMER_SOFTIRQ); - softlockup_tick(); } /* @@ -1750,3 +1767,25 @@ unsigned long msleep_interruptible(unsigned int msecs) } EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8b1797c..e04b8bc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? + config TRACER_MAX_TRACE bool @@ -121,7 +126,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER + select FRAME_POINTER if (!ARM_UNWIND) select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER @@ -153,7 +158,7 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -175,7 +180,7 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on GENERIC_TIME + depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPT select GENERIC_TRACER select TRACER_MAX_TRACE @@ -194,15 +199,6 @@ config PREEMPT_TRACER enabled. This option and the irqs-off timing option can be used together or separately.) -config SYSPROF_TRACER - bool "Sysprof Tracer" - depends on X86 - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer provides the trace needed by the 'Sysprof' userspace - tool. - config SCHED_TRACER bool "Scheduling Latency Tracer" select GENERIC_TRACER @@ -229,23 +225,6 @@ config FTRACE_SYSCALLS help Basic tracer to catch the syscall entry and exit events. -config BOOT_TRACER - bool "Trace boot initcalls" - select GENERIC_TRACER - select CONTEXT_SWITCH_TRACER - help - This tracer helps developers to optimize boot times: it records - the timings of the initcalls and traces key events and the identity - of tasks that can cause boot delays, such as context-switches. - - Its aim is to be parsed by the scripts/bootgraph.pl tool to - produce pretty graphics about boot inefficiencies, giving a visual - representation of the delays during initcalls - but the raw - /debug/tracing/trace text output is readable too. - - You must pass in initcall_debug and ftrace=initcall to the kernel - command line to enable this on bootup. - config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -325,28 +304,6 @@ config BRANCH_TRACER Say N if unsure. -config KSYM_TRACER - bool "Trace read and write access on kernel memory locations" - depends on HAVE_HW_BREAKPOINT - select TRACING - help - This tracer helps find read and write operations on any given kernel - symbol i.e. /proc/kallsyms. - -config PROFILE_KSYM_TRACER - bool "Profile all kernel memory accesses on 'watched' variables" - depends on KSYM_TRACER - help - This tracer profiles kernel accesses on variables watched through the - ksym tracer ftrace plugin. Depending upon the hardware, all read - and write operations on kernel variables can be monitored for - accesses. - - The results will be displayed in: - /debugfs/tracing/profile_ksym - - Say N if unsure. - config STACK_TRACER bool "Trace max stack" depends on HAVE_FUNCTION_TRACER @@ -371,37 +328,6 @@ config STACK_TRACER Say N if unsure. -config KMEMTRACE - bool "Trace SLAB allocations" - select GENERIC_TRACER - help - kmemtrace provides tracing for slab allocator functions, such as - kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected - data is then fed to the userspace application in order to analyse - allocation hotspots, internal fragmentation and so on, making it - possible to see how well an allocator performs, as well as debug - and profile kernel code. - - This requires an userspace application to use. See - Documentation/trace/kmemtrace.txt for more information. - - Saying Y will make the kernel somewhat larger and slower. However, - if you disable kmemtrace at run-time or boot-time, the performance - impact is minimal (depending on the arch the kernel is built for). - - If unsure, say N. - -config WORKQUEUE_TRACER - bool "Trace workqueues" - select GENERIC_TRACER - help - The workqueue tracer provides some statistical information - about each cpu workqueue thread such as the number of the - works inserted and executed since their creation. It can help - to evaluate the amount of work each of them has to perform. - For example it can help a developer to decide whether he should - choose a per-cpu workqueue instead of a singlethreaded one. - config BLK_DEV_IO_TRACE bool "Support for tracing block IO actions" depends on SYSFS diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index ffb1a5b..53f3381 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o -obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o @@ -38,10 +37,8 @@ obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o -obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o -obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o ifeq ($(CONFIG_BLOCK),y) @@ -55,7 +52,9 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o obj-$(CONFIG_EVENT_TRACING) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 638711c..959f8d6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -169,9 +169,12 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) }; +#define BLK_TC_HARDBARRIER BLK_TC_BARRIER +#define BLK_TC_RAHEAD BLK_TC_AHEAD + /* The ilog2() calls fall out because they're constant */ -#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \ - (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name)) +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) /* * The worker for the various blk_add_trace*() types. Fills out a @@ -194,9 +197,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, return; what |= ddir_act[rw & WRITE]; - what |= MASK_TC_BIT(rw, BARRIER); - what |= MASK_TC_BIT(rw, SYNCIO); - what |= MASK_TC_BIT(rw, AHEAD); + what |= MASK_TC_BIT(rw, HARDBARRIER); + what |= MASK_TC_BIT(rw, SYNC); + what |= MASK_TC_BIT(rw, RAHEAD); what |= MASK_TC_BIT(rw, META); what |= MASK_TC_BIT(rw, DISCARD); @@ -549,6 +552,41 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, } EXPORT_SYMBOL_GPL(blk_trace_setup); +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + memcpy(&buts.name, &cbuts.name, 32); + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, 32)) { + blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + int blk_trace_startstop(struct request_queue *q, int start) { int ret; @@ -601,6 +639,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; + lock_kernel(); mutex_lock(&bdev->bd_mutex); switch (cmd) { @@ -608,6 +647,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) bdevname(bdev, b); ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) + case BLKTRACESETUP32: + bdevname(bdev, b); + ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); + break; +#endif case BLKTRACESTART: start = 1; case BLKTRACESTOP: @@ -622,6 +667,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) } mutex_unlock(&bdev->bd_mutex); + unlock_kernel(); return ret; } @@ -661,10 +707,13 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (likely(!bt)) return; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; - if (blk_pc_request(rq)) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, what, rq->errors, rq->cmd_len, rq->cmd); @@ -925,7 +974,7 @@ void blk_add_driver_data(struct request_queue *q, if (likely(!bt)) return; - if (blk_pc_request(rq)) + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, BLK_TA_DRV_DATA, rq->errors, len, data); else @@ -1730,7 +1779,7 @@ void blk_dump_cmd(char *buf, struct request *rq) int len = rq->cmd_len; unsigned char *cmd = rq->cmd; - if (!blk_pc_request(rq)) { + if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { buf[0] = '\0'; return; } @@ -1755,21 +1804,23 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) if (rw & WRITE) rwbs[i++] = 'W'; - else if (rw & 1 << BIO_RW_DISCARD) + else if (rw & REQ_DISCARD) rwbs[i++] = 'D'; else if (bytes) rwbs[i++] = 'R'; else rwbs[i++] = 'N'; - if (rw & 1 << BIO_RW_AHEAD) + if (rw & REQ_RAHEAD) rwbs[i++] = 'A'; - if (rw & 1 << BIO_RW_BARRIER) + if (rw & REQ_HARDBARRIER) rwbs[i++] = 'B'; - if (rw & 1 << BIO_RW_SYNCIO) + if (rw & REQ_SYNC) rwbs[i++] = 'S'; - if (rw & 1 << BIO_RW_META) + if (rw & REQ_META) rwbs[i++] = 'M'; + if (rw & REQ_SECURE) + rwbs[i++] = 'E'; rwbs[i] = '\0'; } @@ -1779,8 +1830,11 @@ void blk_fill_rwbs_rq(char *rwbs, struct request *rq) int rw = rq->cmd_flags & 0x03; int bytes; - if (blk_discard_rq(rq)) - rw |= (1 << BIO_RW_DISCARD); + if (rq->cmd_flags & REQ_DISCARD) + rw |= REQ_DISCARD; + + if (rq->cmd_flags & REQ_SECURE) + rw |= REQ_SECURE; bytes = blk_rq_bytes(rq); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6d2cb14..ebd80d5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; unsigned long long stddev; #endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); @@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) do_div(stddev, (rec->counter - 1) * 1000); } - mutex_lock(&mutex); trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); @@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) trace_seq_puts(&s, " "); trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); - mutex_unlock(&mutex); #endif seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); - return 0; + return ret; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) @@ -877,10 +884,8 @@ enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), }; static int ftrace_filtered; @@ -1219,8 +1224,6 @@ static void ftrace_shutdown(int command) static void ftrace_startup_sysctl(void) { - int command = FTRACE_ENABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; @@ -1228,23 +1231,17 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_ENABLE_CALLS); } static void ftrace_shutdown_sysctl(void) { - int command = FTRACE_DISABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_DISABLE_CALLS); } static cycle_t ftrace_update_time; @@ -1361,24 +1358,29 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - struct trace_parser parser; + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1401,7 +1403,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) @@ -1410,26 +1417,32 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_HASH)) - *pos = 0; - - iter->flags |= FTRACE_ITER_HASH; + if (iter->func_pos > *pos) + return NULL; iter->hidx = 0; - for (l = 0; l <= *pos; ) { - p = t_hash_next(m, p, &l); + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); if (!p) break; } - return p; + if (!p) + return NULL; + + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1450,12 +1463,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1484,7 +1498,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return t_hash_start(m, pos); + + iter->func_pos = *pos; + iter->func = rec; + + return iter; +} + +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); } static void *t_start(struct seq_file *m, loff_t *pos) @@ -1495,6 +1522,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) mutex_lock(&ftrace_lock); /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all * functions are enabled. @@ -1503,12 +1536,19 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; return iter; } if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { @@ -1517,10 +1557,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p && iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - return p; + return NULL; + } + + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1531,16 +1575,18 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; @@ -1592,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } @@ -1883,7 +1929,6 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) struct hlist_head *hhd; struct hlist_node *n; unsigned long key; - int resched; key = hash_long(ip, FTRACE_HASH_BITS); @@ -1897,12 +1942,12 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) * period. This syncs the hash iteration and freeing of items * on the hash. rcu_read_lock is too dangerous here. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); hlist_for_each_entry_rcu(entry, n, hhd, node) { if (entry->ip == ip) entry->ops->func(ip, parent_ip, &entry->data); } - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_probe_ops __read_mostly = diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c deleted file mode 100644 index bbfc1bb..0000000 --- a/kernel/trace/kmemtrace.c +++ /dev/null @@ -1,529 +0,0 @@ -/* - * Memory allocator tracing - * - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi> - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - */ - -#include <linux/tracepoint.h> -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/dcache.h> -#include <linux/fs.h> - -#include <linux/kmemtrace.h> - -#include "trace_output.h" -#include "trace.h" - -/* Select an alternative, minimalistic output than the original one */ -#define TRACE_KMEM_OPT_MINIMAL 0x1 - -static struct tracer_opt kmem_opts[] = { - /* Default disable the minimalistic output */ - { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) }, - { } -}; - -static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts -}; - -static struct trace_array *kmemtrace_array; - -/* Trace allocations */ -static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ftrace_event_call *call = &event_kmem_alloc; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_alloc_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_ALLOC; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static inline void kmemtrace_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ftrace_event_call *call = &event_kmem_free; - struct trace_array *tr = kmemtrace_array; - struct kmemtrace_free_entry *entry; - struct ring_buffer_event *event; - - event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); - if (!event) - return; - entry = ring_buffer_event_data(event); - tracing_generic_entry_update(&entry->ent, 0, 0); - - entry->ent.type = TRACE_KMEM_FREE; - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); - - trace_wake_up(); -} - -static void kmemtrace_kmalloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmem_cache_alloc(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, -1); -} - -static void kmemtrace_kmalloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void kmemtrace_kmem_cache_alloc_node(void *ignore, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, - bytes_req, bytes_alloc, gfp_flags, node); -} - -static void -kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); -} - -static void kmemtrace_kmem_cache_free(void *ignore, - unsigned long call_site, const void *ptr) -{ - kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); -} - -static int kmemtrace_start_probes(void) -{ - int err; - - err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - if (err) - return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - if (err) - return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - if (err) - return err; - err = register_trace_kfree(kmemtrace_kfree, NULL); - if (err) - return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); - - return err; -} - -static void kmemtrace_stop_probes(void) -{ - unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); - unregister_trace_kfree(kmemtrace_kfree, NULL); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); -} - -static int kmem_trace_init(struct trace_array *tr) -{ - kmemtrace_array = tr; - - tracing_reset_online_cpus(tr); - - kmemtrace_start_probes(); - - return 0; -} - -static void kmem_trace_reset(struct trace_array *tr) -{ - kmemtrace_stop_probes(); -} - -static void kmemtrace_headers(struct seq_file *s) -{ - /* Don't need headers for the original kmemtrace output */ - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return; - - seq_printf(s, "#\n"); - seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS " - " POINTER NODE CALLER\n"); - seq_printf(s, "# FREE | | | | " - " | | | |\n"); - seq_printf(s, "# |\n\n"); -} - -/* - * The following functions give the original output from kmemtrace, - * plus the origin CPU, since reordering occurs in-kernel now. - */ - -#define KMEMTRACE_USER_ALLOC 0 -#define KMEMTRACE_USER_FREE 1 - -struct kmemtrace_user_event { - u8 event_id; - u8 type_id; - u16 event_size; - u32 cpu; - u64 timestamp; - unsigned long call_site; - unsigned long ptr; -}; - -struct kmemtrace_user_event_alloc { - size_t bytes_req; - size_t bytes_alloc; - unsigned gfp_flags; - int node; -}; - -static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr, - (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc, - (unsigned long)entry->gfp_flags, entry->node); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - int ret; - - trace_assign_type(entry, iter->ent); - - ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n", - entry->type_id, (void *)entry->call_site, - (unsigned long)entry->ptr); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_alloc_entry *entry; - struct kmemtrace_user_event *ev; - struct kmemtrace_user_event_alloc *ev_alloc; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_ALLOC; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); - if (!ev_alloc) - return TRACE_TYPE_PARTIAL_LINE; - - ev_alloc->bytes_req = entry->bytes_req; - ev_alloc->bytes_alloc = entry->bytes_alloc; - ev_alloc->gfp_flags = entry->gfp_flags; - ev_alloc->node = entry->node; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - struct trace_seq *s = &iter->seq; - struct kmemtrace_free_entry *entry; - struct kmemtrace_user_event *ev; - - trace_assign_type(entry, iter->ent); - - ev = trace_seq_reserve(s, sizeof(*ev)); - if (!ev) - return TRACE_TYPE_PARTIAL_LINE; - - ev->event_id = KMEMTRACE_USER_FREE; - ev->type_id = entry->type_id; - ev->event_size = sizeof(*ev); - ev->cpu = iter->cpu; - ev->timestamp = iter->ts; - ev->call_site = entry->call_site; - ev->ptr = (unsigned long)entry->ptr; - - return TRACE_TYPE_HANDLED; -} - -/* The two other following provide a more minimalistic output */ -static enum print_line_t -kmemtrace_print_alloc_compress(struct trace_iterator *iter) -{ - struct kmemtrace_alloc_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Alloc entry */ - ret = trace_seq_printf(s, " + "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Requested */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_req); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Allocated */ - ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Flags - * TODO: would be better to see the name of the GFP flag names - */ - ret = trace_seq_printf(s, "%08x ", entry->gfp_flags); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Node and call site*/ - ret = trace_seq_printf(s, "%4d %pf\n", entry->node, - (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -kmemtrace_print_free_compress(struct trace_iterator *iter) -{ - struct kmemtrace_free_entry *entry; - struct trace_seq *s = &iter->seq; - int ret; - - trace_assign_type(entry, iter->ent); - - /* Free entry */ - ret = trace_seq_printf(s, " - "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Type */ - switch (entry->type_id) { - case KMEMTRACE_TYPE_KMALLOC: - ret = trace_seq_printf(s, "K "); - break; - case KMEMTRACE_TYPE_CACHE: - ret = trace_seq_printf(s, "C "); - break; - case KMEMTRACE_TYPE_PAGES: - ret = trace_seq_printf(s, "P "); - break; - default: - ret = trace_seq_printf(s, "? "); - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip requested/allocated/flags */ - ret = trace_seq_printf(s, " "); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Pointer to allocated */ - ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - /* Skip node and print call site*/ - ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)) - return TRACE_TYPE_UNHANDLED; - - switch (entry->type) { - case TRACE_KMEM_ALLOC: - return kmemtrace_print_alloc_compress(iter); - case TRACE_KMEM_FREE: - return kmemtrace_print_free_compress(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -static struct trace_event_functions kmem_trace_alloc_funcs = { - .trace = kmemtrace_print_alloc, - .binary = kmemtrace_print_alloc_user, -}; - -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, - .funcs = &kmem_trace_alloc_funcs, -}; - -static struct trace_event_functions kmem_trace_free_funcs = { - .trace = kmemtrace_print_free, - .binary = kmemtrace_print_free_user, -}; - -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, - .funcs = &kmem_trace_free_funcs, -}; - -static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags -}; - -void kmemtrace_init(void) -{ - /* earliest opportunity to start kmem tracing */ -} - -static int __init init_kmem_tracer(void) -{ - if (!register_ftrace_event(&kmem_trace_alloc)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (!register_ftrace_event(&kmem_trace_free)) { - pr_warning("Warning: could not register kmem events\n"); - return 1; - } - - if (register_tracer(&kmem_tracer) != 0) { - pr_warning("Warning: could not register the kmem tracer\n"); - return 1; - } - - return 0; -} -device_initcall(init_kmem_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1da7b6e..c5a632a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta) #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) /* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) int ring_buffer_print_page_header(struct trace_seq *s) { @@ -443,6 +443,7 @@ int ring_buffer_print_page_header(struct trace_seq *s) */ struct ring_buffer_per_cpu { int cpu; + atomic_t record_disabled; struct ring_buffer *buffer; spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; @@ -462,7 +463,6 @@ struct ring_buffer_per_cpu { unsigned long read; u64 write_stamp; u64 read_stamp; - atomic_t record_disabled; }; struct ring_buffer { @@ -2242,8 +2242,6 @@ static void trace_recursive_unlock(void) #endif -static DEFINE_PER_CPU(int, rb_need_resched); - /** * ring_buffer_lock_reserve - reserve a part of the buffer * @buffer: the ring buffer to reserve from @@ -2264,13 +2262,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out_nocheck; @@ -2295,21 +2293,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (!event) goto out; - /* - * Need to store resched state on this cpu. - * Only the first needs to. - */ - - if (preempt_count() == 1) - per_cpu(rb_need_resched, cpu) = resched; - return event; out: trace_recursive_unlock(); out_nocheck: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return NULL; } EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); @@ -2355,13 +2345,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return 0; } @@ -2469,13 +2453,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, trace_recursive_unlock(); - /* - * Only the last preempt count needs to restore preemption. - */ - if (preempt_count() == 1) - ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); - else - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); } EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); @@ -2501,12 +2479,12 @@ int ring_buffer_write(struct ring_buffer *buffer, struct ring_buffer_event *event; void *body; int ret = -EBUSY; - int cpu, resched; + int cpu; if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); if (atomic_read(&buffer->record_disabled)) goto out; @@ -2536,7 +2514,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return ret; } @@ -2628,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -2636,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - - cpu_buffer->read; - return ret; + return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -2706,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += (local_read(&cpu_buffer->entries) - - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; + entries += rb_num_of_entries(cpu_buffer); } return entries; @@ -3007,13 +2994,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. @@ -3868,6 +3853,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, rpos = reader->read; pos += size; + if (rpos >= commit) + break; + event = rb_reader_event(cpu_buffer); size = rb_event_length(event); } while (len > size); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 086d363..001bcd2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -101,10 +101,7 @@ static inline void ftrace_enable_cpu(void) preempt_enable(); } -static cpumask_var_t __read_mostly tracing_buffer_mask; - -#define for_each_tracing_cpu(cpu) \ - for_each_cpu(cpu, tracing_buffer_mask) +cpumask_var_t __read_mostly tracing_buffer_mask; /* * ftrace_dump_on_oops - variable to dump ftrace buffer on oops @@ -344,7 +341,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD; static int trace_stop_count; static DEFINE_SPINLOCK(tracing_start_lock); @@ -428,6 +425,7 @@ static const char *trace_options[] = { "latency-format", "sleep-time", "graph-time", + "record-cmd", NULL }; @@ -659,6 +657,10 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } arch_spin_lock(&ftrace_max_lock); tr->buffer = max_tr.buffer; @@ -685,6 +687,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); ftrace_disable_cpu(); @@ -729,18 +736,11 @@ __acquires(kernel_lock) return -1; } - if (strlen(type->name) > MAX_TRACER_SIZE) { + if (strlen(type->name) >= MAX_TRACER_SIZE) { pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); return -1; } - /* - * When this gets called we hold the BKL which means that - * preemption is disabled. Various trace selftests however - * need to disable and enable preemption for successful tests. - * So we drop the BKL here and grab it after the tests again. - */ - unlock_kernel(); mutex_lock(&trace_types_lock); tracing_selftest_running = true; @@ -822,7 +822,6 @@ __acquires(kernel_lock) #endif out_unlock: - lock_kernel(); return ret; } @@ -1331,61 +1330,6 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags) #endif /* CONFIG_STACKTRACE */ -static void -ftrace_trace_special(void *__tr, - unsigned long arg1, unsigned long arg2, unsigned long arg3, - int pc) -{ - struct ftrace_event_call *call = &event_special; - struct ring_buffer_event *event; - struct trace_array *tr = __tr; - struct ring_buffer *buffer = tr->buffer; - struct special_entry *entry; - - event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL, - sizeof(*entry), 0, pc); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->arg1 = arg1; - entry->arg2 = arg2; - entry->arg3 = arg3; - - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count()); -} - -void -ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) -{ - struct trace_array *tr = &global_trace; - struct trace_array_cpu *data; - unsigned long flags; - int cpu; - int pc; - - if (tracing_disabled) - return; - - pc = preempt_count(); - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = tr->data[cpu]; - - if (likely(atomic_inc_return(&data->disabled) == 1)) - ftrace_trace_special(tr, arg1, arg2, arg3, pc); - - atomic_dec(&data->disabled); - local_irq_restore(flags); -} - /** * trace_vbprintk - write binary msg to tracing buffer * @@ -1404,7 +1348,6 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct bprint_entry *entry; unsigned long flags; int disable; - int resched; int cpu, len = 0, size, pc; if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -1414,7 +1357,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) pause_graph_tracing(); pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -1452,7 +1395,7 @@ out_unlock: out: atomic_dec_return(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); unpause_graph_tracing(); return len; @@ -1539,11 +1482,6 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) } EXPORT_SYMBOL_GPL(trace_vprintk); -enum trace_file_type { - TRACE_FILE_LAT_FMT = 1, - TRACE_FILE_ANNOTATE = 2, -}; - static void trace_iterator_increment(struct trace_iterator *iter) { /* Don't allow ftrace to trace into the ring buffers */ @@ -1641,7 +1579,7 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, } /* Find the next real entry, and increment the iterator to the next entry */ -static void *find_next_entry_inc(struct trace_iterator *iter) +void *trace_find_next_entry_inc(struct trace_iterator *iter) { iter->ent = __find_next_entry(iter, &iter->cpu, &iter->lost_events, &iter->ts); @@ -1676,19 +1614,19 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) return NULL; if (iter->idx < 0) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); else ent = iter; while (ent && iter->idx < i) - ent = find_next_entry_inc(iter); + ent = trace_find_next_entry_inc(iter); iter->pos = *pos; return ent; } -static void tracing_iter_reset(struct trace_iterator *iter, int cpu) +void tracing_iter_reset(struct trace_iterator *iter, int cpu) { struct trace_array *tr = iter->tr; struct ring_buffer_event *event; @@ -2049,7 +1987,7 @@ int trace_empty(struct trace_iterator *iter) } /* Called with trace_event_read_lock() held. */ -static enum print_line_t print_trace_line(struct trace_iterator *iter) +enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -2258,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; @@ -2394,6 +2332,7 @@ static const struct file_operations show_traces_fops = { .open = show_traces_open, .read = seq_read, .release = seq_release, + .llseek = seq_lseek, }; /* @@ -2487,6 +2426,7 @@ static const struct file_operations tracing_cpumask_fops = { .open = tracing_open_generic, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .llseek = generic_file_llseek, }; static int tracing_trace_options_show(struct seq_file *m, void *v) @@ -2562,6 +2502,9 @@ static void set_tracer_flags(unsigned int mask, int enabled) trace_flags |= mask; else trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); } static ssize_t @@ -2653,6 +2596,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_readme_fops = { .open = tracing_open_generic, .read = tracing_readme_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2703,6 +2647,7 @@ tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, static const struct file_operations tracing_saved_cmdlines_fops = { .open = tracing_open_generic, .read = tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, }; static ssize_t @@ -2798,6 +2743,9 @@ static int tracing_resize_ring_buffer(unsigned long size) if (ret < 0) return ret; + if (!current_trace->use_max_tr) + goto out; + ret = ring_buffer_resize(max_tr.buffer, size); if (ret < 0) { int r; @@ -2825,11 +2773,14 @@ static int tracing_resize_ring_buffer(unsigned long size) return ret; } + max_tr.entries = size; + out: global_trace.entries = size; return ret; } + /** * tracing_update_buffers - used by tracing facility to expand ring buffers * @@ -2890,12 +2841,26 @@ static int tracing_set_tracer(const char *buf) trace_branch_disable(); if (current_trace && current_trace->reset) current_trace->reset(tr); - + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } destroy_trace_option_files(topts); current_trace = t; topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } if (t->init) { ret = tracer_init(t, tr); @@ -3032,6 +2997,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (iter->trace->pipe_open) iter->trace->pipe_open(iter); + nonseekable_open(inode, filp); out: mutex_unlock(&trace_types_lock); return ret; @@ -3211,7 +3177,7 @@ waitagain: trace_event_read_lock(); trace_access_lock(iter->cpu_file); - while (find_next_entry_inc(iter) != NULL) { + while (trace_find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3294,7 +3260,7 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(iter); rem -= count; - if (!find_next_entry_inc(iter)) { + if (!trace_find_next_entry_inc(iter)) { rem = 0; iter->ent = NULL; break; @@ -3350,7 +3316,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, if (ret <= 0) goto out_err; - if (!iter->ent && !find_next_entry_inc(iter)) { + if (!iter->ent && !trace_find_next_entry_inc(iter)) { ret = -EFAULT; goto out_err; } @@ -3477,7 +3443,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } tracing_start(); - max_tr.entries = global_trace.entries; mutex_unlock(&trace_types_lock); return cnt; @@ -3498,6 +3463,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { char *buf; + size_t written; if (tracing_disabled) return -EINVAL; @@ -3519,11 +3485,15 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, } else buf[cnt] = '\0'; - cnt = mark_printk("%s", buf); + written = mark_printk("%s", buf); kfree(buf); - *fpos += cnt; + *fpos += written; - return cnt; + /* don't tell userspace we wrote more - it might confuse them */ + if (written > cnt) + written = cnt; + + return written; } static int tracing_clock_show(struct seq_file *m, void *v) @@ -3590,18 +3560,21 @@ static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_ctrl_fops = { .open = tracing_open_generic, .read = tracing_ctrl_read, .write = tracing_ctrl_write, + .llseek = generic_file_llseek, }; static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, .read = tracing_set_trace_read, .write = tracing_set_trace_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_pipe_fops = { @@ -3610,17 +3583,20 @@ static const struct file_operations tracing_pipe_fops = { .read = tracing_read_pipe, .splice_read = tracing_splice_read_pipe, .release = tracing_release_pipe, + .llseek = no_llseek, }; static const struct file_operations tracing_entries_fops = { .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, + .llseek = generic_file_llseek, }; static const struct file_operations tracing_mark_fops = { .open = tracing_open_generic, .write = tracing_mark_write, + .llseek = generic_file_llseek, }; static const struct file_operations trace_clock_fops = { @@ -3926,6 +3902,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, static const struct file_operations tracing_stats_fops = { .open = tracing_open_generic, .read = tracing_stats_read, + .llseek = generic_file_llseek, }; #ifdef CONFIG_DYNAMIC_FTRACE @@ -3962,6 +3939,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf, static const struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, .read = tracing_read_dyn_info, + .llseek = generic_file_llseek, }; #endif @@ -4115,6 +4093,7 @@ static const struct file_operations trace_options_fops = { .open = tracing_open_generic, .read = trace_options_read, .write = trace_options_write, + .llseek = generic_file_llseek, }; static ssize_t @@ -4166,6 +4145,7 @@ static const struct file_operations trace_options_core_fops = { .open = tracing_open_generic, .read = trace_options_core_read, .write = trace_options_core_write, + .llseek = generic_file_llseek, }; struct dentry *trace_create_file(const char *name, @@ -4355,9 +4335,6 @@ static __init int tracer_init_debugfs(void) trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_SYSPROF_TRACER - init_tracer_sysprof_debugfs(d_tracer); -#endif create_trace_options_dir(); @@ -4414,7 +4391,7 @@ static struct notifier_block trace_die_notifier = { */ #define KERN_TRACE KERN_EMERG -static void +void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ @@ -4429,6 +4406,13 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + static void __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { @@ -4454,8 +4438,10 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (disable_tracing) ftrace_kill(); + trace_init_global_iter(&iter); + for_each_tracing_cpu(cpu) { - atomic_inc(&global_trace.data[cpu]->disabled); + atomic_inc(&iter.tr->data[cpu]->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -4504,7 +4490,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (find_next_entry_inc(&iter) != NULL) { + if (trace_find_next_entry_inc(&iter) != NULL) { int ret; ret = print_trace_line(&iter); @@ -4526,7 +4512,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&global_trace.data[cpu]->disabled); + atomic_dec(&iter.tr->data[cpu]->disabled); } tracing_on(); } @@ -4575,16 +4561,14 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(ring_buf_size, - TRACE_BUFFER_FLAGS); + max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } - max_tr.entries = ring_buffer_size(max_tr.buffer); - WARN_ON(max_tr.entries != global_trace.entries); + max_tr.entries = 1; #endif /* Allocate the first page for all buffers */ @@ -4597,9 +4581,6 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); current_trace = &nop_trace; -#ifdef CONFIG_BOOT_TRACER - register_tracer(&boot_tracer); -#endif /* All seems OK, enable tracing */ tracing_disabled = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2cd9639..9021f8c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,10 +9,7 @@ #include <linux/mmiotrace.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> -#include <trace/boot.h> -#include <linux/kmemtrace.h> #include <linux/hw_breakpoint.h> - #include <linux/trace_seq.h> #include <linux/ftrace_event.h> @@ -25,30 +22,17 @@ enum trace_type { TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, - TRACE_SPECIAL, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, - TRACE_BOOT_CALL, - TRACE_BOOT_RET, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, - TRACE_KMEM_ALLOC, - TRACE_KMEM_FREE, TRACE_BLK, - TRACE_KSYM, __TRACE_LAST_TYPE, }; -enum kmemtrace_type_id { - KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ - KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ - KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ -}; - -extern struct tracer boot_tracer; #undef __field #define __field(type, item) type item; @@ -204,23 +188,15 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ - IF_ASSIGN(var, ent, struct special_entry, 0); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ - IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ - IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ - IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ - TRACE_KMEM_ALLOC); \ - IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ - TRACE_KMEM_FREE); \ - IF_ASSIGN(var, ent, struct ksym_trace_entry, TRACE_KSYM);\ __ftrace_bad_type(); \ } while (0) @@ -298,6 +274,7 @@ struct tracer { struct tracer *next; int print_max; struct tracer_flags *flags; + int use_max_tr; }; @@ -318,7 +295,6 @@ struct dentry *trace_create_file(const char *name, const struct file_operations *fops); struct dentry *tracing_init_dentry(void); -void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; @@ -338,6 +314,14 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); +int trace_empty(struct trace_iterator *iter); + +void *trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); @@ -355,15 +339,14 @@ void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, struct task_struct *cur, unsigned long flags, int pc); -void trace_special(struct trace_array *tr, - struct trace_array_cpu *data, - unsigned long arg1, - unsigned long arg2, - unsigned long arg3, int pc); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -380,8 +363,15 @@ void tracing_start_sched_switch_record(void); int register_tracer(struct tracer *type); void unregister_tracer(struct tracer *type); int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; -extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -452,12 +442,8 @@ extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_sysprof(struct tracer *trace, - struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); -extern int trace_selftest_startup_ksym(struct tracer *trace, - struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); @@ -471,6 +457,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); extern unsigned long trace_flags; @@ -617,6 +605,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x20000, TRACE_ITER_SLEEP_TIME = 0x40000, TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, }; /* @@ -628,54 +617,6 @@ enum trace_iterator_flags { extern struct tracer nop_trace; -/** - * ftrace_preempt_disable - disable preemption scheduler safe - * - * When tracing can happen inside the scheduler, there exists - * cases that the tracing might happen before the need_resched - * flag is checked. If this happens and the tracer calls - * preempt_enable (after a disable), a schedule might take place - * causing an infinite recursion. - * - * To prevent this, we read the need_resched flag before - * disabling preemption. When we want to enable preemption we - * check the flag, if it is set, then we call preempt_enable_no_resched. - * Otherwise, we call preempt_enable. - * - * The rational for doing the above is that if need_resched is set - * and we have yet to reschedule, we are either in an atomic location - * (where we do not need to check for scheduling) or we are inside - * the scheduler and do not want to resched. - */ -static inline int ftrace_preempt_disable(void) -{ - int resched; - - resched = need_resched(); - preempt_disable_notrace(); - - return resched; -} - -/** - * ftrace_preempt_enable - enable preemption scheduler safe - * @resched: the return value from ftrace_preempt_disable - * - * This is a scheduler safe way to enable preemption and not miss - * any preemption checks. The disabled saved the state of preemption. - * If resched is set, then we are either inside an atomic or - * are inside the scheduler (we would have already scheduled - * otherwise). In this case, we do not want to call normal - * preempt_enable, but preempt_enable_no_resched instead. - */ -static inline void ftrace_preempt_enable(int resched) -{ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); -} - #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); @@ -766,6 +707,8 @@ struct filter_pred { int pop_n; }; +extern struct list_head ftrace_common_fields; + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, @@ -795,6 +738,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +extern void trace_event_enable_cmd_record(bool enable); + extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c deleted file mode 100644 index c21d5f3..0000000 --- a/kernel/trace/trace_boot.c +++ /dev/null @@ -1,185 +0,0 @@ -/* - * ring buffer based initcalls tracer - * - * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> - * - */ - -#include <linux/init.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/kallsyms.h> -#include <linux/time.h> - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *boot_trace; -static bool pre_initcalls_finished; - -/* Tells the boot tracer that the pre_smp_initcalls are finished. - * So we are ready . - * It doesn't enable sched events tracing however. - * You have to call enable_boot_trace to do so. - */ -void start_boot_trace(void) -{ - pre_initcalls_finished = true; -} - -void enable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_start_sched_switch_record(); -} - -void disable_boot_trace(void) -{ - if (boot_trace && pre_initcalls_finished) - tracing_stop_sched_switch_record(); -} - -static int boot_trace_init(struct trace_array *tr) -{ - boot_trace = tr; - - if (!tr) - return 0; - - tracing_reset_online_cpus(tr); - - tracing_sched_switch_assign_trace(tr); - return 0; -} - -static enum print_line_t -initcall_call_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_call *field; - struct boot_trace_call *call; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - call = &field->boot_call; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", - (unsigned long)ts, nsec_rem, call->func, call->caller); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t -initcall_ret_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct trace_boot_ret *field; - struct boot_trace_ret *init_ret; - u64 ts; - unsigned long nsec_rem; - int ret; - - trace_assign_type(field, entry); - init_ret = &field->boot_ret; - ts = iter->ts; - nsec_rem = do_div(ts, NSEC_PER_SEC); - - ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " - "returned %d after %llu msecs\n", - (unsigned long) ts, - nsec_rem, - init_ret->func, init_ret->result, init_ret->duration); - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - else - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t initcall_print_line(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - - switch (entry->type) { - case TRACE_BOOT_CALL: - return initcall_call_print_line(iter); - case TRACE_BOOT_RET: - return initcall_ret_print_line(iter); - default: - return TRACE_TYPE_UNHANDLED; - } -} - -struct tracer boot_tracer __read_mostly = -{ - .name = "initcall", - .init = boot_trace_init, - .reset = tracing_reset_online_cpus, - .print_line = initcall_print_line, -}; - -void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_call; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_call *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - /* Get its name now since this function could - * disappear because it is in the .init section. - */ - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_call = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) -{ - struct ftrace_event_call *call = &event_boot_ret; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_boot_ret *entry; - struct trace_array *tr = boot_trace; - - if (!tr || !pre_initcalls_finished) - return; - - sprint_symbol(bt->func, (unsigned long)fn); - preempt_disable(); - - buffer = tr->buffer; - event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->boot_ret = *bt; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8..685a67d 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -32,16 +32,15 @@ u64 notrace trace_clock_local(void) { u64 clock; - int resched; /* * sched_clock() is an architecture implemented, fast, scalable, * lockless clock. It is not guaranteed to be coherent across * CPUs, nor across CPU idle events. */ - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); clock = sched_clock(); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); return clock; } @@ -56,7 +55,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index dc008c1..e3dfeca 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -151,23 +151,6 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ); /* - * Special (free-form) trace entry: - */ -FTRACE_ENTRY(special, special_entry, - - TRACE_SPECIAL, - - F_STRUCT( - __field( unsigned long, arg1 ) - __field( unsigned long, arg2 ) - __field( unsigned long, arg3 ) - ), - - F_printk("(%08lx) (%08lx) (%08lx)", - __entry->arg1, __entry->arg2, __entry->arg3) -); - -/* * Stack-trace entry: */ @@ -271,33 +254,6 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, __entry->map_id, __entry->opcode) ); -FTRACE_ENTRY(boot_call, trace_boot_call, - - TRACE_BOOT_CALL, - - F_STRUCT( - __field_struct( struct boot_trace_call, boot_call ) - __field_desc( pid_t, boot_call, caller ) - __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN) - ), - - F_printk("%d %s", __entry->caller, __entry->func) -); - -FTRACE_ENTRY(boot_ret, trace_boot_ret, - - TRACE_BOOT_RET, - - F_STRUCT( - __field_struct( struct boot_trace_ret, boot_ret ) - __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN) - __field_desc( int, boot_ret, result ) - __field_desc( unsigned long, boot_ret, duration ) - ), - - F_printk("%s %d %lx", - __entry->func, __entry->result, __entry->duration) -); #define TRACE_FUNC_SIZE 30 #define TRACE_FILE_SIZE 20 @@ -318,53 +274,3 @@ FTRACE_ENTRY(branch, trace_branch, __entry->func, __entry->file, __entry->correct) ); -FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, - - TRACE_KMEM_ALLOC, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" - " flags:%x node:%d", - __entry->type_id, __entry->call_site, __entry->ptr, - __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, __entry->node) -); - -FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, - - TRACE_KMEM_FREE, - - F_STRUCT( - __field( enum kmemtrace_type_id, type_id ) - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - F_printk("type:%u call_site:%lx ptr:%p", - __entry->type_id, __entry->call_site, __entry->ptr) -); - -FTRACE_ENTRY(ksym_trace, ksym_trace_entry, - - TRACE_KSYM, - - F_STRUCT( - __field( unsigned long, ip ) - __field( unsigned char, type ) - __array( char , cmd, TASK_COMM_LEN ) - __field( unsigned long, addr ) - ), - - F_printk("ip: %pF type: %d ksym_name: %pS cmd: %s", - (void *)__entry->ip, (unsigned int)__entry->type, - (void *)__entry->addr, __entry->cmd) -); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a2b73f..39c059c 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,9 +9,7 @@ #include <linux/kprobes.h> #include "trace.h" -EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); - -static char *perf_trace_buf[4]; +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -26,7 +24,7 @@ static int total_ref_count; static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - struct hlist_head *list; + struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; @@ -44,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, tp_event->perf_events = list; if (!total_ref_count) { - char *buf; + char __percpu *buf; int i; - for (i = 0; i < 4; i++) { - buf = (char *)alloc_percpu(perf_trace_t); + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -56,13 +54,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - if (tp_event->class->reg) - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); - else - ret = tracepoint_probe_register(tp_event->name, - tp_event->class->perf_probe, - tp_event); - + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); if (ret) goto fail; @@ -73,7 +65,7 @@ fail: if (!total_ref_count) { int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } @@ -96,11 +88,11 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && - (tp_event->class->perf_probe || - tp_event->class->reg) && + tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); break; } } @@ -109,22 +101,26 @@ int perf_trace_init(struct perf_event *p_event) return ret; } -int perf_trace_enable(struct perf_event *p_event) +int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; struct hlist_head *list; - list = tp_event->perf_events; - if (WARN_ON_ONCE(!list)) + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; - list = this_cpu_ptr(list); + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; } -void perf_trace_disable(struct perf_event *p_event) +void perf_trace_del(struct perf_event *p_event, int flags) { hlist_del_rcu(&p_event->hlist_entry); } @@ -138,29 +134,25 @@ void perf_trace_destroy(struct perf_event *p_event) if (--tp_event->perf_refcount > 0) goto out; - if (tp_event->class->reg) - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - else - tracepoint_probe_unregister(tp_event->name, - tp_event->class->perf_probe, - tp_event); + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); /* - * Ensure our callback won't be called anymore. See - * tracepoint_probe_unregister() and __DO_TRACE(). + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. */ - synchronize_sched(); + tracepoint_synchronize_unregister(); free_percpu(tp_event->perf_events); tp_event->perf_events = NULL; if (!--total_ref_count) { - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } } out: + module_put(tp_event->mod); mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53cffc0..398c0e8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -28,6 +28,7 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); struct list_head * trace_get_fields(struct ftrace_event_call *event_call) @@ -37,15 +38,11 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } -int trace_define_field(struct ftrace_event_call *call, const char *type, - const char *name, int offset, int size, int is_signed, - int filter_type) +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) { struct ftrace_event_field *field; - struct list_head *head; - - if (WARN_ON(!call->class)) - return 0; field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) @@ -68,7 +65,6 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - head = trace_get_fields(call); list_add(&field->link, head); return 0; @@ -80,17 +76,32 @@ err: return -ENOMEM; } + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} EXPORT_SYMBOL_GPL(trace_define_field); #define __common_field(type, item) \ - ret = trace_define_field(call, #type, "common_" #item, \ - offsetof(typeof(ent), item), \ - sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER); \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; -static int trace_define_common_fields(struct ftrace_event_call *call) +static int trace_define_common_fields(void) { int ret; struct trace_entry ent; @@ -130,6 +141,55 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + static int ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { @@ -139,24 +199,20 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->flags & TRACE_EVENT_FL_ENABLED) { call->flags &= ~TRACE_EVENT_FL_ENABLED; - tracing_stop_cmdline_record(); - if (call->class->reg) - call->class->reg(call, TRACE_REG_UNREGISTER); - else - tracepoint_probe_unregister(call->name, - call->class->probe, - call); + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + call->class->reg(call, TRACE_REG_UNREGISTER); } break; case 1: if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { - tracing_start_cmdline_record(); - if (call->class->reg) - ret = call->class->reg(call, TRACE_REG_REGISTER); - else - ret = tracepoint_probe_register(call->name, - call->class->probe, - call); + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } + ret = call->class->reg(call, TRACE_REG_REGISTER); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " @@ -194,8 +250,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (match && @@ -321,7 +376,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && (call->class->probe || call->class->reg)) + if (call->class && call->class->reg) return call; } @@ -474,8 +529,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->class || - (!call->class->probe && !call->class->reg)) + if (!call->name || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system) != 0) @@ -544,85 +598,146 @@ out: return ret; } -static ssize_t -event_format_read(struct file *filp, char __user *ubuf, size_t cnt, - loff_t *ppos) +enum { + FORMAT_HEADER = 1, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; - struct list_head *head; - struct trace_seq *s; - int common_field_count = 5; - char *buf; - int r = 0; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); - if (*ppos) - return 0; + (*pos)++; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; + switch ((unsigned long)v) { + case FORMAT_HEADER: + if (unlikely(list_empty(common_head))) + return NULL; - trace_seq_init(s); + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; - trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->event.type); - trace_seq_printf(s, "format:\n"); + case FORMAT_FIELD_SEPERATOR: + if (unlikely(list_empty(head))) + return NULL; - head = trace_get_fields(call); - list_for_each_entry_reverse(field, head, link) { - /* - * Smartly shows the array type(except dynamic array). - * Normal: - * field:TYPE VAR - * If TYPE := TYPE[LEN], it is shown: - * field:TYPE VAR[LEN] - */ - const char *array_descriptor = strchr(field->type, '['); + field = list_entry(head->prev, struct ftrace_event_field, link); + return field; - if (!strncmp(field->type, "__data_loc", 10)) - array_descriptor = NULL; + case FORMAT_PRINTFMT: + /* all done */ + return NULL; + } - if (!array_descriptor) { - r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" - "\tsize:%u;\tsigned:%d;\n", - field->type, field->name, field->offset, - field->size, !!field->is_signed); - } else { - r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" - "\tsize:%u;\tsigned:%d;\n", - (int)(array_descriptor - field->type), - field->type, field->name, - array_descriptor, field->offset, - field->size, !!field->is_signed); - } + field = v; + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) + return (void *)FORMAT_PRINTFMT; - if (--common_field_count == 0) - r = trace_seq_printf(s, "\n"); + field = list_entry(field->link.prev, struct ftrace_event_field, link); - if (!r) - break; - } + return field; +} - if (r) - r = trace_seq_printf(s, "\nprint fmt: %s\n", - call->print_fmt); +static void *f_start(struct seq_file *m, loff_t *pos) +{ + loff_t l = 0; + void *p; - if (!r) { - /* - * ug! The format output is bigger than a PAGE!! - */ - buf = "FORMAT TOO BIG\n"; - r = simple_read_from_buffer(ubuf, cnt, ppos, - buf, strlen(buf)); - goto out; + /* Start by showing the header */ + if (!*pos) + return (void *)FORMAT_HEADER; + + p = (void *)FORMAT_HEADER; + do { + p = f_next(m, p, &l); + } while (p && l < *pos); + + return p; +} + +static int f_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + const char *array_descriptor; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "ID: %d\n", call->event.type); + seq_printf(m, "format:\n"); + return 0; + + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; } - r = simple_read_from_buffer(ubuf, cnt, ppos, - s->buffer, s->len); - out: - kfree(s); - return r; + field = v; + + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + + return 0; +} + +static void f_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct ftrace_event_call *call = inode->i_private; + struct seq_file *m; + int ret; + + ret = seq_open(file, &trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = call; + + return 0; } static ssize_t @@ -820,8 +935,10 @@ static const struct file_operations ftrace_enable_fops = { }; static const struct file_operations ftrace_event_format_fops = { - .open = tracing_open_generic, - .read = event_format_read, + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; static const struct file_operations ftrace_event_id_fops = { @@ -963,35 +1080,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->class->probe || call->class->reg) + if (call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); #ifdef CONFIG_PERF_EVENTS - if (call->event.type && (call->class->perf_probe || call->class->reg)) + if (call->event.type && call->class->reg) trace_create_file("id", 0444, call->dir, call, id); #endif - if (call->class->define_fields) { - /* - * Other events may have the same class. Only update - * the fields if they are not already defined. - */ - head = trace_get_fields(call); - if (list_empty(head)) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->class->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; - } + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; } - trace_create_file("filter", 0644, call->dir, call, - filter); } + trace_create_file("filter", 0644, call->dir, call, + filter); trace_create_file("format", 0444, call->dir, call, format); @@ -999,11 +1112,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return 0; } -static int __trace_add_event_call(struct ftrace_event_call *call) +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *d_events; int ret; + /* The linker may leave blanks */ if (!call->name) return -EINVAL; @@ -1011,8 +1130,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "events/%s\n", call->name); + pr_warning("Could not initialize trace events/%s\n", + call->name); return ret; } } @@ -1021,11 +1140,10 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!d_events) return -ENOENT; - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, - &ftrace_enable_fops, &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(call, d_events, id, enable, filter, format); if (!ret) list_add(&call->list, &ftrace_events); + call->mod = mod; return ret; } @@ -1035,7 +1153,10 @@ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); mutex_unlock(&event_mutex); return ret; } @@ -1152,8 +1273,6 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; - struct dentry *d_events; - int ret; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; @@ -1161,38 +1280,14 @@ static void trace_module_add_events(struct module *mod) if (start == end) return; - d_events = event_trace_events_dir(); - if (!d_events) + file_ops = trace_create_file_ops(mod); + if (!file_ops) return; for_each_event(call, start, end) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - /* - * This module has events, create file ops for this module - * if not already done. - */ - if (!file_ops) { - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - } - call->mod = mod; - ret = event_create_dir(call, d_events, + __trace_add_event_call(call, mod, &file_ops->id, &file_ops->enable, &file_ops->filter, &file_ops->format); - if (!ret) - list_add(&call->list, &ftrace_events); } } @@ -1319,25 +1414,14 @@ static __init int event_trace_init(void) trace_create_file("enable", 0644, d_events, NULL, &ftrace_system_enable_fops); + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { - /* The linker may leave blanks */ - if (!call->name) - continue; - if (call->class->raw_init) { - ret = call->class->raw_init(call); - if (ret < 0) { - if (ret != -ENOSYS) - pr_warning("Could not initialize trace " - "point events/%s\n", call->name); - continue; - } - } - ret = event_create_dir(call, d_events, &ftrace_event_id_fops, + __trace_add_event_call(call, NULL, &ftrace_event_id_fops, &ftrace_enable_fops, &ftrace_event_filter_fops, &ftrace_event_format_fops); - if (!ret) - list_add(&call->list, &ftrace_events); } while (true) { @@ -1524,12 +1608,11 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) struct ftrace_entry *entry; unsigned long flags; long disabled; - int resched; int cpu; int pc; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); @@ -1551,7 +1634,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __initdata = diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 57bb1bb..36d4010 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -497,12 +497,10 @@ void print_subsystem_event_filter(struct event_subsystem *system, } static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) +__find_event_field(struct list_head *head, char *name) { struct ftrace_event_field *field; - struct list_head *head; - head = trace_get_fields(call); list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; @@ -511,6 +509,20 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static void filter_free_pred(struct filter_pred *pred) { if (!pred) @@ -627,9 +639,6 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -646,9 +655,6 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; @@ -1251,9 +1257,6 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->class || !call->class->define_fields) - continue; - if (strcmp(call->class->system, system->name) != 0) continue; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 8536e2a..4ba44de 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -125,12 +125,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #include "trace_entries.h" -static int ftrace_raw_init_event(struct ftrace_event_call *call) -{ - INIT_LIST_HEAD(&call->class->fields); - return 0; -} - #undef __entry #define __entry REC @@ -158,7 +152,7 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ - .raw_init = ftrace_raw_init_event, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ }; \ \ struct ftrace_event_call __used \ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index b3f3776..16aee4d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -54,14 +54,14 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; long disabled; - int cpu, resched; + int cpu; int pc; if (unlikely(!ftrace_function_enabled)) return; pc = preempt_count(); - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -71,7 +71,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) trace_function(tr, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static void diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 79f4bac..76b0598 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,15 +15,19 @@ #include "trace.h" #include "trace_output.h" +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + struct fgraph_cpu_data { pid_t last_pid; int depth; + int depth_irq; int ignore; unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { - struct fgraph_cpu_data *cpu_data; + struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ struct ftrace_graph_ent_entry ent; @@ -41,6 +45,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -55,13 +60,15 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, /* Display absolute time of an entry */ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts }; @@ -204,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, return 1; } +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = graph_array; @@ -218,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. */ - if (!(trace->depth || ftrace_graph_addr(trace->func))) + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) return 0; local_irq_save(flags); @@ -246,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -507,7 +551,15 @@ get_return_for_leaf(struct trace_iterator *iter, * if the output fails. */ data->ent = *curr; - data->ret = *next; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. + */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; } } @@ -641,7 +693,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem); + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; @@ -846,6 +900,108 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return 0; } +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just extered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) @@ -856,6 +1012,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; @@ -893,6 +1052,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; int i; + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + if (data) { struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; @@ -1045,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1108,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1140,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1181,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ @@ -1201,9 +1391,12 @@ void graph_trace_open(struct trace_iterator *iter) pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + *pid = -1; *depth = 0; *ignore = 0; + *depth_irq = -1; } iter->private = data; @@ -1226,6 +1419,14 @@ void graph_trace_close(struct trace_iterator *iter) } } +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + static struct trace_event_functions graph_functions = { .trace = print_graph_function_event, }; @@ -1252,6 +1453,7 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, + .set_flag = func_graph_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 6fd486e..5cf8c60 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return 0; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - -static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else @@ -649,6 +607,7 @@ static struct tracer irqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_irqsoff(trace) register_tracer(&trace) #else @@ -681,6 +640,7 @@ static struct tracer preemptoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptoff(trace) register_tracer(&trace) #else @@ -715,6 +675,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = #endif .open = irqsoff_trace_open, .close = irqsoff_trace_close, + .use_max_tr = 1, }; # define register_preemptirqsoff(trace) register_tracer(&trace) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 0000000..7b8ecd7 --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,136 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel <jason.wessel@windriver.com> + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> + * + */ +#include <linux/init.h> +#include <linux/kgdb.h> +#include <linux/kdb.h> +#include <linux/ftrace.h> + +#include "../debug/kdb/kdb_private.h" +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); + if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f52b5f5..544301d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,6 +30,8 @@ #include <linux/ptrace.h> #include <linux/perf_event.h> #include <linux/stringify.h> +#include <linux/limits.h> +#include <linux/uaccess.h> #include <asm/bitsperlong.h> #include "trace.h" @@ -38,6 +40,7 @@ #define MAX_TRACE_ARGS 128 #define MAX_ARGSTR_LEN 63 #define MAX_EVENT_NAME_LEN 64 +#define MAX_STRING_SIZE PATH_MAX #define KPROBE_EVENT_SYSTEM "kprobes" /* Reserved field names */ @@ -58,14 +61,16 @@ const char *reserved_field_names[] = { }; /* Printing function type */ -typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *, + void *); #define PRINT_TYPE_FUNC_NAME(type) print_type_##type #define PRINT_TYPE_FMT_NAME(type) print_type_format_##type /* Printing in basic type function template */ #define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ - const char *name, void *data)\ + const char *name, \ + void *data, void *ent)\ { \ return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ } \ @@ -80,6 +85,49 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) +/* data_rloc: data relative location, compatible with u32 */ +#define make_data_rloc(len, roffs) \ + (((u32)(len) << 16) | ((u32)(roffs) & 0xffff)) +#define get_rloc_len(dl) ((u32)(dl) >> 16) +#define get_rloc_offs(dl) ((u32)(dl) & 0xffff) + +static inline void *get_rloc_data(u32 *dl) +{ + return (u8 *)dl + get_rloc_offs(*dl); +} + +/* For data_loc conversion */ +static inline void *get_loc_data(u32 *dl, void *ent) +{ + return (u8 *)ent + get_rloc_offs(*dl); +} + +/* + * Convert data_rloc to data_loc: + * data_rloc stores the offset from data_rloc itself, but data_loc + * stores the offset from event entry. + */ +#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs)) + +/* For defining macros, define string/string_size types */ +typedef u32 string; +typedef u32 string_size; + +/* Print type function for string type */ +static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, + const char *name, + void *data, void *ent) +{ + int len = *(u32 *)data >> 16; + + if (!len) + return trace_seq_printf(s, " %s=(fault)", name); + else + return trace_seq_printf(s, " %s=\"%s\"", name, + (const char *)get_loc_data(data, ent)); +} +static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\""; + /* Data fetch function type */ typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); @@ -94,32 +142,38 @@ static __kprobes void call_fetch(struct fetch_param *fprm, return fprm->fn(regs, fprm->data, dest); } -#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +#define FETCH_FUNC_NAME(method, type) fetch_##method##_##type /* * Define macro for basic types - we don't need to define s* types, because * we have to care only about bitwidth at recording time. */ -#define DEFINE_BASIC_FETCH_FUNCS(kind) \ -DEFINE_FETCH_##kind(u8) \ -DEFINE_FETCH_##kind(u16) \ -DEFINE_FETCH_##kind(u32) \ -DEFINE_FETCH_##kind(u64) - -#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ - ((FETCH_FUNC_NAME(kind, u8) == fn) || \ - (FETCH_FUNC_NAME(kind, u16) == fn) || \ - (FETCH_FUNC_NAME(kind, u32) == fn) || \ - (FETCH_FUNC_NAME(kind, u64) == fn)) +#define DEFINE_BASIC_FETCH_FUNCS(method) \ +DEFINE_FETCH_##method(u8) \ +DEFINE_FETCH_##method(u16) \ +DEFINE_FETCH_##method(u32) \ +DEFINE_FETCH_##method(u64) + +#define CHECK_FETCH_FUNCS(method, fn) \ + (((FETCH_FUNC_NAME(method, u8) == fn) || \ + (FETCH_FUNC_NAME(method, u16) == fn) || \ + (FETCH_FUNC_NAME(method, u32) == fn) || \ + (FETCH_FUNC_NAME(method, u64) == fn) || \ + (FETCH_FUNC_NAME(method, string) == fn) || \ + (FETCH_FUNC_NAME(method, string_size) == fn)) \ + && (fn != NULL)) /* Data fetch function templates */ #define DEFINE_FETCH_reg(type) \ static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ - void *offset, void *dest) \ + void *offset, void *dest) \ { \ *(type *)dest = (type)regs_get_register(regs, \ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(reg) +/* No string on the register */ +#define fetch_reg_string NULL +#define fetch_reg_string_size NULL #define DEFINE_FETCH_stack(type) \ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ @@ -129,6 +183,9 @@ static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ (unsigned int)((unsigned long)offset)); \ } DEFINE_BASIC_FETCH_FUNCS(stack) +/* No string on the stack entry */ +#define fetch_stack_string NULL +#define fetch_stack_string_size NULL #define DEFINE_FETCH_retval(type) \ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ @@ -137,6 +194,9 @@ static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ *(type *)dest = (type)regs_return_value(regs); \ } DEFINE_BASIC_FETCH_FUNCS(retval) +/* No string on the retval */ +#define fetch_retval_string NULL +#define fetch_retval_string_size NULL #define DEFINE_FETCH_memory(type) \ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ @@ -149,6 +209,62 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ *(type *)dest = retval; \ } DEFINE_BASIC_FETCH_FUNCS(memory) +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max + * length and relative data location. + */ +static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, + void *addr, void *dest) +{ + long ret; + int maxlen = get_rloc_len(*(u32 *)dest); + u8 *dst = get_rloc_data(dest); + u8 *src = addr; + mm_segment_t old_fs = get_fs(); + if (!maxlen) + return; + /* + * Try to get string again, since the string can be changed while + * probing. + */ + set_fs(KERNEL_DS); + pagefault_disable(); + do + ret = __copy_from_user_inatomic(dst++, src++, 1); + while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen); + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) { /* Failed to fetch string */ + ((u8 *)get_rloc_data(dest))[0] = '\0'; + *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); + } else + *(u32 *)dest = make_data_rloc(src - (u8 *)addr, + get_rloc_offs(*(u32 *)dest)); +} +/* Return the length of string -- including null terminal byte */ +static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs, + void *addr, void *dest) +{ + int ret, len = 0; + u8 c; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + pagefault_disable(); + do { + ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + pagefault_enable(); + set_fs(old_fs); + + if (ret < 0) /* Failed to check the length */ + *(u32 *)dest = 0; + else + *(u32 *)dest = len; +} /* Memory fetching by symbol */ struct symbol_cache { @@ -203,6 +319,8 @@ static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(symbol) +DEFINE_FETCH_symbol(string) +DEFINE_FETCH_symbol(string_size) /* Dereference memory access function */ struct deref_fetch_param { @@ -224,12 +342,14 @@ static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ *(type *)dest = 0; \ } DEFINE_BASIC_FETCH_FUNCS(deref) +DEFINE_FETCH_deref(string) +DEFINE_FETCH_deref(string_size) static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + if (CHECK_FETCH_FUNCS(deref, data->orig.fn)) free_deref_fetch_param(data->orig.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) + else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } @@ -240,23 +360,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) #define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) #define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) -#define ASSIGN_FETCH_FUNC(kind, type) \ - .kind = FETCH_FUNC_NAME(kind, type) - -#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ - {.name = #ptype, \ - .size = sizeof(ftype), \ - .is_signed = sign, \ - .print = PRINT_TYPE_FUNC_NAME(ptype), \ - .fmt = PRINT_TYPE_FMT_NAME(ptype), \ -ASSIGN_FETCH_FUNC(reg, ftype), \ -ASSIGN_FETCH_FUNC(stack, ftype), \ -ASSIGN_FETCH_FUNC(retval, ftype), \ -ASSIGN_FETCH_FUNC(memory, ftype), \ -ASSIGN_FETCH_FUNC(symbol, ftype), \ -ASSIGN_FETCH_FUNC(deref, ftype), \ +/* Fetch types */ +enum { + FETCH_MTD_reg = 0, + FETCH_MTD_stack, + FETCH_MTD_retval, + FETCH_MTD_memory, + FETCH_MTD_symbol, + FETCH_MTD_deref, + FETCH_MTD_END, +}; + +#define ASSIGN_FETCH_FUNC(method, type) \ + [FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type) + +#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype) \ + {.name = _name, \ + .size = _size, \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ + .fmttype = _fmttype, \ + .fetch = { \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } \ } +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + __ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype) + +#define FETCH_TYPE_STRING 0 +#define FETCH_TYPE_STRSIZE 1 + /* Fetch type information table */ static const struct fetch_type { const char *name; /* Name of type */ @@ -264,14 +404,16 @@ static const struct fetch_type { int is_signed; /* Signed flag */ print_type_func_t print; /* Print functions */ const char *fmt; /* Fromat string */ + const char *fmttype; /* Name in format file */ /* Fetch functions */ - fetch_func_t reg; - fetch_func_t stack; - fetch_func_t retval; - fetch_func_t memory; - fetch_func_t symbol; - fetch_func_t deref; + fetch_func_t fetch[FETCH_MTD_END]; } fetch_type_table[] = { + /* Special types */ + [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, + sizeof(u32), 1, "__data_loc char[]"), + [FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32, + string_size, sizeof(u32), 0, "u32"), + /* Basic types */ ASSIGN_FETCH_TYPE(u8, u8, 0), ASSIGN_FETCH_TYPE(u16, u16, 0), ASSIGN_FETCH_TYPE(u32, u32, 0), @@ -302,12 +444,28 @@ static __kprobes void fetch_stack_address(struct pt_regs *regs, *(unsigned long *)dest = kernel_stack_pointer(regs); } +static fetch_func_t get_fetch_size_function(const struct fetch_type *type, + fetch_func_t orig_fn) +{ + int i; + + if (type != &fetch_type_table[FETCH_TYPE_STRING]) + return NULL; /* Only string type needs size function */ + for (i = 0; i < FETCH_MTD_END; i++) + if (type->fetch[i] == orig_fn) + return fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i]; + + WARN_ON(1); /* This should not happen */ + return NULL; +} + /** * Kprobe event core functions */ struct probe_arg { struct fetch_param fetch; + struct fetch_param fetch_size; unsigned int offset; /* Offset from argument entry */ const char *name; /* Name of this argument */ const char *comm; /* Command of this argument */ @@ -356,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Check the name is good for event/group */ -static int check_event_name(const char *name) +/* Check the name is good for event/group/fields */ +static int is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') return 0; @@ -399,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, else tp->rp.kp.pre_handler = kprobe_dispatcher; - if (!event || !check_event_name(event)) { + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } @@ -409,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, if (!tp->call.name) goto error; - if (!group || !check_event_name(group)) { + if (!group || !is_good_name(group)) { ret = -EINVAL; goto error; } @@ -429,9 +587,9 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn)) free_deref_fetch_param(arg->fetch.data); - else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) + else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); kfree(arg->name); kfree(arg->comm); @@ -548,7 +706,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (strcmp(arg, "retval") == 0) { if (is_return) - f->fn = t->retval; + f->fn = t->fetch[FETCH_MTD_retval]; else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { @@ -562,7 +720,7 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - f->fn = t->stack; + f->fn = t->fetch[FETCH_MTD_stack]; f->data = (void *)param; } } else @@ -588,7 +746,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, case '%': /* named register */ ret = regs_query_register_offset(arg + 1); if (ret >= 0) { - f->fn = t->reg; + f->fn = t->fetch[FETCH_MTD_reg]; f->data = (void *)(unsigned long)ret; ret = 0; } @@ -598,7 +756,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - f->fn = t->memory; + f->fn = t->fetch[FETCH_MTD_memory]; f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); @@ -606,7 +764,7 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, break; f->data = alloc_symbol_cache(arg + 1, offset); if (f->data) - f->fn = t->symbol; + f->fn = t->fetch[FETCH_MTD_symbol]; } break; case '+': /* deref memory */ @@ -636,14 +794,17 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t, if (ret) kfree(dprm); else { - f->fn = t->deref; + f->fn = t->fetch[FETCH_MTD_deref]; f->data = (void *)dprm; } } break; } - if (!ret && !f->fn) + if (!ret && !f->fn) { /* Parsed, but do not find fetch method */ + pr_info("%s type has no corresponding fetch method.\n", + t->name); ret = -EINVAL; + } return ret; } @@ -652,6 +813,7 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, struct probe_arg *parg, int is_return) { const char *t; + int ret; if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); @@ -674,7 +836,13 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp, } parg->offset = tp->size; tp->size += parg->type->size; - return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); + if (ret >= 0) { + parg->fetch_size.fn = get_fetch_size_function(parg->type, + parg->fetch.fn); + parg->fetch_size.data = parg->fetch.data; + } + return ret; } /* Return 1 if name is reserved or already used by another argument */ @@ -715,7 +883,7 @@ static int create_trace_probe(int argc, char **argv) int i, ret = 0; int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *group = NULL; - char *arg, *tmp; + char *arg; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -757,14 +925,17 @@ static int create_trace_probe(int argc, char **argv) pr_info("Delete command needs an event name.\n"); return -EINVAL; } + mutex_lock(&probe_lock); tp = find_probe_event(event, group); if (!tp) { + mutex_unlock(&probe_lock); pr_info("Event %s/%s doesn't exist.\n", group, event); return -ENOENT; } /* delete an event */ unregister_trace_probe(tp); free_trace_probe(tp); + mutex_unlock(&probe_lock); return 0; } @@ -821,26 +992,36 @@ static int create_trace_probe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Increment count for freeing args in error case */ + tp->nr_args++; + /* Parse argument name */ arg = strchr(argv[i], '='); - if (arg) + if (arg) { *arg++ = '\0'; - else + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + } else { arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + tp->args[i].name = kstrdup(buf, GFP_KERNEL); + } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { - pr_info("Failed to allocate argument%d name '%s'.\n", - i, argv[i]); + pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } - tmp = strchr(tp->args[i].name, ':'); - if (tmp) - *tmp = '_'; /* convert : to _ */ + + if (!is_good_name(tp->args[i].name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, tp->args[i].name); + ret = -EINVAL; + goto error; + } if (conflict_field_name(tp->args[i].name, tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " + pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; goto error; @@ -849,12 +1030,9 @@ static int create_trace_probe(int argc, char **argv) /* Parse fetch argument */ ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { - pr_info("Parse error at argument%d. (%d)\n", i, ret); - kfree(tp->args[i].name); + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; } - - tp->nr_args++; } ret = register_trace_probe(tp); @@ -1043,6 +1221,54 @@ static const struct file_operations kprobe_profile_ops = { .release = seq_release, }; +/* Sum up total data length for dynamic arraies (strings) */ +static __kprobes int __get_data_size(struct trace_probe *tp, + struct pt_regs *regs) +{ + int i, ret = 0; + u32 len; + + for (i = 0; i < tp->nr_args; i++) + if (unlikely(tp->args[i].fetch_size.fn)) { + call_fetch(&tp->args[i].fetch_size, regs, &len); + ret += len; + } + + return ret; +} + +/* Store the value of each argument */ +static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp, + struct pt_regs *regs, + u8 *data, int maxlen) +{ + int i; + u32 end = tp->size; + u32 *dl; /* Data (relative) location */ + + for (i = 0; i < tp->nr_args; i++) { + if (unlikely(tp->args[i].fetch_size.fn)) { + /* + * First, we set the relative location and + * maximum data length to *dl + */ + dl = (u32 *)(data + tp->args[i].offset); + *dl = make_data_rloc(maxlen, end - tp->args[i].offset); + /* Then try to fetch string or dynamic array data */ + call_fetch(&tp->args[i].fetch, regs, dl); + /* Reduce maximum length */ + end += get_rloc_len(*dl); + maxlen -= get_rloc_len(*dl); + /* Trick here, convert data_rloc to data_loc */ + *dl = convert_rloc_to_loc(*dl, + ent_size + tp->args[i].offset); + } else + /* Just fetching data normally */ + call_fetch(&tp->args[i].fetch, regs, + data + tp->args[i].offset); + } +} + /* Kprobe handler */ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { @@ -1050,8 +1276,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, dsize, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -1060,7 +1285,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1069,9 +1295,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) entry = ring_buffer_event_data(event); entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1085,15 +1309,15 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; - int size, i, pc; + int size, pc, dsize; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; local_save_flags(irq_flags); pc = preempt_count(); - size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + size = sizeof(*entry) + tp->size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, irq_flags, pc); @@ -1103,9 +1327,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, entry = ring_buffer_event_data(event); entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1137,7 +1359,7 @@ print_kprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1179,7 +1401,7 @@ print_kretprobe_event(struct trace_iterator *iter, int flags, data = (u8 *)&field[1]; for (i = 0; i < tp->nr_args; i++) if (!tp->args[i].type->print(s, tp->args[i].name, - data + tp->args[i].offset)) + data + tp->args[i].offset, field)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1214,11 +1436,6 @@ static void probe_event_disable(struct ftrace_event_call *call) } } -static int probe_event_raw_init(struct ftrace_event_call *event_call) -{ - return 0; -} - #undef DEFINE_FIELD #define DEFINE_FIELD(type, item, name, is_signed) \ do { \ @@ -1239,7 +1456,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1261,7 +1478,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); /* Set argument names as fields */ for (i = 0; i < tp->nr_args; i++) { - ret = trace_define_field(event_call, tp->args[i].type->name, + ret = trace_define_field(event_call, tp->args[i].type->fmttype, tp->args[i].name, sizeof(field) + tp->args[i].offset, tp->args[i].type->size, @@ -1301,8 +1518,13 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", - tp->args[i].name); + if (strcmp(tp->args[i].type->name, "string") == 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", __get_str(%s)", + tp->args[i].name); + else + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); } #undef LEN_OR_ZERO @@ -1339,11 +1561,11 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1355,9 +1577,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, return; entry->ip = (unsigned long)kp->addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); @@ -1371,11 +1592,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; + int size, __size, dsize; int rctx; - __size = sizeof(*entry) + tp->size; + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1388,9 +1609,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; - data = (u8 *)&entry[1]; - for (i = 0; i < tp->nr_args; i++) - call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); @@ -1486,15 +1705,12 @@ static int register_probe_event(struct trace_probe *tp) int ret; /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); if (probe_is_return(tp)) { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kretprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kretprobe_event_define_fields; } else { - INIT_LIST_HEAD(&call->class->fields); call->event.funcs = &kprobe_funcs; - call->class->raw_init = probe_event_raw_init; call->class->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c deleted file mode 100644 index 8eaf007..0000000 --- a/kernel/trace/trace_ksym.c +++ /dev/null @@ -1,508 +0,0 @@ -/* - * trace_ksym.c - Kernel Symbol Tracer - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2009 - */ - -#include <linux/kallsyms.h> -#include <linux/uaccess.h> -#include <linux/debugfs.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/fs.h> - -#include "trace_output.h" -#include "trace.h" - -#include <linux/hw_breakpoint.h> -#include <asm/hw_breakpoint.h> - -#include <asm/atomic.h> - -#define KSYM_TRACER_OP_LEN 3 /* rw- */ - -struct trace_ksym { - struct perf_event **ksym_hbp; - struct perf_event_attr attr; -#ifdef CONFIG_PROFILE_KSYM_TRACER - atomic64_t counter; -#endif - struct hlist_node ksym_hlist; -}; - -static struct trace_array *ksym_trace_array; - -static unsigned int ksym_tracing_enabled; - -static HLIST_HEAD(ksym_filter_head); - -static DEFINE_MUTEX(ksym_tracer_mutex); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - -#define MAX_UL_INT 0xffffffff - -void ksym_collect_stats(unsigned long hbp_hit_addr) -{ - struct hlist_node *node; - struct trace_ksym *entry; - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == hbp_hit_addr) { - atomic64_inc(&entry->counter); - break; - } - } - rcu_read_unlock(); -} -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -void ksym_hbp_handler(struct perf_event *hbp, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct ring_buffer_event *event; - struct ksym_trace_entry *entry; - struct ring_buffer *buffer; - int pc; - - if (!ksym_tracing_enabled) - return; - - buffer = ksym_trace_array->buffer; - - pc = preempt_count(); - - event = trace_buffer_lock_reserve(buffer, TRACE_KSYM, - sizeof(*entry), 0, pc); - if (!event) - return; - - entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(regs); - entry->type = hw_breakpoint_type(hbp); - entry->addr = hw_breakpoint_addr(hbp); - strlcpy(entry->cmd, current->comm, TASK_COMM_LEN); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - ksym_collect_stats(hw_breakpoint_addr(hbp)); -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - - trace_buffer_unlock_commit(buffer, event, 0, pc); -} - -/* Valid access types are represented as - * - * rw- : Set Read/Write Access Breakpoint - * -w- : Set Write Access Breakpoint - * --- : Clear Breakpoints - * --x : Set Execution Break points (Not available yet) - * - */ -static int ksym_trace_get_access_type(char *str) -{ - int access = 0; - - if (str[0] == 'r') - access |= HW_BREAKPOINT_R; - - if (str[1] == 'w') - access |= HW_BREAKPOINT_W; - - if (str[2] == 'x') - access |= HW_BREAKPOINT_X; - - switch (access) { - case HW_BREAKPOINT_R: - case HW_BREAKPOINT_W: - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - return access; - default: - return -EINVAL; - } -} - -/* - * There can be several possible malformed requests and we attempt to capture - * all of them. We enumerate some of the rules - * 1. We will not allow kernel symbols with ':' since it is used as a delimiter. - * i.e. multiple ':' symbols disallowed. Possible uses are of the form - * <module>:<ksym_name>:<op>. - * 2. No delimiter symbol ':' in the input string - * 3. Spurious operator symbols or symbols not in their respective positions - * 4. <ksym_name>:--- i.e. clear breakpoint request when ksym_name not in file - * 5. Kernel symbol not a part of /proc/kallsyms - * 6. Duplicate requests - */ -static int parse_ksym_trace_str(char *input_string, char **ksymname, - unsigned long *addr) -{ - int ret; - - *ksymname = strsep(&input_string, ":"); - *addr = kallsyms_lookup_name(*ksymname); - - /* Check for malformed request: (2), (1) and (5) */ - if ((!input_string) || - (strlen(input_string) != KSYM_TRACER_OP_LEN) || - (*addr == 0)) - return -EINVAL;; - - ret = ksym_trace_get_access_type(input_string); - - return ret; -} - -int process_new_ksym_entry(char *ksymname, int op, unsigned long addr) -{ - struct trace_ksym *entry; - int ret = -ENOMEM; - - entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); - if (!entry) - return -ENOMEM; - - hw_breakpoint_init(&entry->attr); - - entry->attr.bp_type = op; - entry->attr.bp_addr = addr; - entry->attr.bp_len = HW_BREAKPOINT_LEN_4; - - entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - - if (IS_ERR(entry->ksym_hbp)) { - ret = PTR_ERR(entry->ksym_hbp); - if (ret == -ENOSPC) { - printk(KERN_ERR "ksym_tracer: Maximum limit reached." - " No new requests for tracing can be accepted now.\n"); - } else { - printk(KERN_INFO "ksym_tracer request failed. Try again" - " later!!\n"); - } - goto err; - } - - hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); - - return 0; - -err: - kfree(entry); - - return ret; -} - -static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - struct trace_seq *s; - ssize_t cnt = 0; - int ret; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - trace_seq_init(s); - - mutex_lock(&ksym_tracer_mutex); - - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - ret = trace_seq_printf(s, "%pS:", - (void *)(unsigned long)entry->attr.bp_addr); - if (entry->attr.bp_type == HW_BREAKPOINT_R) - ret = trace_seq_puts(s, "r--\n"); - else if (entry->attr.bp_type == HW_BREAKPOINT_W) - ret = trace_seq_puts(s, "-w-\n"); - else if (entry->attr.bp_type == (HW_BREAKPOINT_W | HW_BREAKPOINT_R)) - ret = trace_seq_puts(s, "rw-\n"); - WARN_ON_ONCE(!ret); - } - - cnt = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); - - mutex_unlock(&ksym_tracer_mutex); - - kfree(s); - - return cnt; -} - -static void __ksym_trace_reset(void) -{ - struct trace_ksym *entry; - struct hlist_node *node, *node1; - - mutex_lock(&ksym_tracer_mutex); - hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, - ksym_hlist) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - } - mutex_unlock(&ksym_tracer_mutex); -} - -static ssize_t ksym_trace_filter_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct trace_ksym *entry; - struct hlist_node *node; - char *buf, *input_string, *ksymname = NULL; - unsigned long ksym_addr = 0; - int ret, op, changed = 0; - - buf = kzalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(buf, buffer, count)) - goto out; - - buf[count] = '\0'; - input_string = strstrip(buf); - - /* - * Clear all breakpoints if: - * 1: echo > ksym_trace_filter - * 2: echo 0 > ksym_trace_filter - * 3: echo "*:---" > ksym_trace_filter - */ - if (!input_string[0] || !strcmp(input_string, "0") || - !strcmp(input_string, "*:---")) { - __ksym_trace_reset(); - ret = 0; - goto out; - } - - ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); - if (ret < 0) - goto out; - - mutex_lock(&ksym_tracer_mutex); - - ret = -EINVAL; - hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { - if (entry->attr.bp_addr == ksym_addr) { - /* Check for malformed request: (6) */ - if (entry->attr.bp_type != op) - changed = 1; - else - goto out_unlock; - break; - } - } - if (changed) { - unregister_wide_hw_breakpoint(entry->ksym_hbp); - entry->attr.bp_type = op; - ret = 0; - if (op > 0) { - entry->ksym_hbp = - register_wide_hw_breakpoint(&entry->attr, - ksym_hbp_handler); - if (IS_ERR(entry->ksym_hbp)) - ret = PTR_ERR(entry->ksym_hbp); - else - goto out_unlock; - } - /* Error or "symbol:---" case: drop it */ - hlist_del_rcu(&(entry->ksym_hlist)); - synchronize_rcu(); - kfree(entry); - goto out_unlock; - } else { - /* Check for malformed request: (4) */ - if (op) - ret = process_new_ksym_entry(ksymname, op, ksym_addr); - } -out_unlock: - mutex_unlock(&ksym_tracer_mutex); -out: - kfree(buf); - return !ret ? count : ret; -} - -static const struct file_operations ksym_tracing_fops = { - .open = tracing_open_generic, - .read = ksym_trace_filter_read, - .write = ksym_trace_filter_write, -}; - -static void ksym_trace_reset(struct trace_array *tr) -{ - ksym_tracing_enabled = 0; - __ksym_trace_reset(); -} - -static int ksym_trace_init(struct trace_array *tr) -{ - int cpu, ret = 0; - - for_each_online_cpu(cpu) - tracing_reset(tr, cpu); - ksym_tracing_enabled = 1; - ksym_trace_array = tr; - - return ret; -} - -static void ksym_trace_print_header(struct seq_file *m) -{ - seq_puts(m, - "# TASK-PID CPU# Symbol " - "Type Function\n"); - seq_puts(m, - "# | | | " - " | |\n"); -} - -static enum print_line_t ksym_trace_output(struct trace_iterator *iter) -{ - struct trace_entry *entry = iter->ent; - struct trace_seq *s = &iter->seq; - struct ksym_trace_entry *field; - char str[KSYM_SYMBOL_LEN]; - int ret; - - if (entry->type != TRACE_KSYM) - return TRACE_TYPE_UNHANDLED; - - trace_assign_type(field, entry); - - ret = trace_seq_printf(s, "%11s-%-5d [%03d] %pS", field->cmd, - entry->pid, iter->cpu, (char *)field->addr); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - switch (field->type) { - case HW_BREAKPOINT_R: - ret = trace_seq_printf(s, " R "); - break; - case HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - ret = trace_seq_printf(s, " RW "); - break; - default: - return TRACE_TYPE_PARTIAL_LINE; - } - - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - sprint_symbol(str, field->ip); - ret = trace_seq_printf(s, "%s\n", str); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -struct tracer ksym_tracer __read_mostly = -{ - .name = "ksym_tracer", - .init = ksym_trace_init, - .reset = ksym_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_ksym, -#endif - .print_header = ksym_trace_print_header, - .print_line = ksym_trace_output -}; - -#ifdef CONFIG_PROFILE_KSYM_TRACER -static int ksym_profile_show(struct seq_file *m, void *v) -{ - struct hlist_node *node; - struct trace_ksym *entry; - int access_type = 0; - char fn_name[KSYM_NAME_LEN]; - - seq_puts(m, " Access Type "); - seq_puts(m, " Symbol Counter\n"); - seq_puts(m, " ----------- "); - seq_puts(m, " ------ -------\n"); - - rcu_read_lock(); - hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { - - access_type = entry->attr.bp_type; - - switch (access_type) { - case HW_BREAKPOINT_R: - seq_puts(m, " R "); - break; - case HW_BREAKPOINT_W: - seq_puts(m, " W "); - break; - case HW_BREAKPOINT_R | HW_BREAKPOINT_W: - seq_puts(m, " RW "); - break; - default: - seq_puts(m, " NA "); - } - - if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0) - seq_printf(m, " %-36s", fn_name); - else - seq_printf(m, " %-36s", "<NA>"); - seq_printf(m, " %15llu\n", - (unsigned long long)atomic64_read(&entry->counter)); - } - rcu_read_unlock(); - - return 0; -} - -static int ksym_profile_open(struct inode *node, struct file *file) -{ - return single_open(file, ksym_profile_show, NULL); -} - -static const struct file_operations ksym_profile_fops = { - .open = ksym_profile_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif /* CONFIG_PROFILE_KSYM_TRACER */ - -__init static int init_ksym_trace(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("ksym_trace_filter", 0644, d_tracer, - NULL, &ksym_tracing_fops); - -#ifdef CONFIG_PROFILE_KSYM_TRACER - trace_create_file("ksym_profile", 0444, d_tracer, - NULL, &ksym_profile_fops); -#endif - - return register_tracer(&ksym_tracer); -} -device_initcall(init_ksym_trace); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 57c1b45..02272ba 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -16,9 +16,6 @@ DECLARE_RWSEM(trace_event_mutex); -DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); -EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); - static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -1069,65 +1066,6 @@ static struct trace_event trace_wake_event = { .funcs = &trace_wake_funcs, }; -/* TRACE_SPECIAL */ -static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - - trace_assign_type(field, iter->ent); - - if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n", - field->arg1, - field->arg2, - field->arg3)) - return TRACE_TYPE_PARTIAL_LINE; - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_HEX_FIELD_RET(s, field->arg1); - SEQ_PUT_HEX_FIELD_RET(s, field->arg2); - SEQ_PUT_HEX_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags, struct trace_event *event) -{ - struct special_entry *field; - struct trace_seq *s = &iter->seq; - - trace_assign_type(field, iter->ent); - - SEQ_PUT_FIELD_RET(s, field->arg1); - SEQ_PUT_FIELD_RET(s, field->arg2); - SEQ_PUT_FIELD_RET(s, field->arg3); - - return TRACE_TYPE_HANDLED; -} - -static struct trace_event_functions trace_special_funcs = { - .trace = trace_special_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, -}; - -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, - .funcs = &trace_special_funcs, -}; - /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, @@ -1161,9 +1099,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_stack_event = { @@ -1194,9 +1129,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, - .raw = trace_special_print, - .hex = trace_special_hex, - .binary = trace_special_bin, }; static struct trace_event trace_user_stack_event = { @@ -1314,7 +1246,6 @@ static struct trace_event *events[] __initdata = { &trace_fn_event, &trace_ctx_event, &trace_wake_event, - &trace_special_event, &trace_stack_event, &trace_user_stack_event, &trace_bprint_event, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 0e73bc2..7319559 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,50 +31,99 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER + /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; - int resched; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); - resched = ftrace_preempt_disable(); + *pc = preempt_count(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -83,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -153,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -253,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. */ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -304,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -321,7 +516,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -380,9 +575,16 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = 1, }; static struct tracer wakeup_rt_tracer __read_mostly = @@ -394,9 +596,16 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = 1, }; __init static int init_wakeup_tracer(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 250e7f9..155a415 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -13,11 +13,9 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_WAKE: case TRACE_STACK: case TRACE_PRINT: - case TRACE_SPECIAL: case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: - case TRACE_KSYM: return 1; } return 0; @@ -691,38 +689,6 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr } #endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -#ifdef CONFIG_SYSPROF_TRACER -int -trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - /* Sleep for a 1/10 of a second */ - msleep(100); - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - if (!ret && !count) { - printk(KERN_CONT ".. no entries found .."); - ret = -1; - } - - return ret; -} -#endif /* CONFIG_SYSPROF_TRACER */ - #ifdef CONFIG_BRANCH_TRACER int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) @@ -755,56 +721,3 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) } #endif /* CONFIG_BRANCH_TRACER */ -#ifdef CONFIG_KSYM_TRACER -static int ksym_selftest_dummy; - -int -trace_selftest_startup_ksym(struct tracer *trace, struct trace_array *tr) -{ - unsigned long count; - int ret; - - /* start the tracing */ - ret = tracer_init(trace, tr); - if (ret) { - warn_failed_init_tracer(trace, ret); - return ret; - } - - ksym_selftest_dummy = 0; - /* Register the read-write tracing request */ - - ret = process_new_ksym_entry("ksym_selftest_dummy", - HW_BREAKPOINT_R | HW_BREAKPOINT_W, - (unsigned long)(&ksym_selftest_dummy)); - - if (ret < 0) { - printk(KERN_CONT "ksym_trace read-write startup test failed\n"); - goto ret_path; - } - /* Perform a read and a write operation over the dummy variable to - * trigger the tracer - */ - if (ksym_selftest_dummy == 0) - ksym_selftest_dummy++; - - /* stop the tracing. */ - tracing_stop(); - /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); - trace->reset(tr); - tracing_start(); - - /* read & write operations - one each is performed on the dummy variable - * triggering two entries in the trace buffer - */ - if (!ret && count != 2) { - printk(KERN_CONT "Ksym tracer startup test failed"); - ret = -1; - } - -ret_path: - return ret; -} -#endif /* CONFIG_KSYM_TRACER */ - diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index f4bc9b2..a6b7e0e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -110,12 +110,12 @@ static inline void check_stack(void) static void stack_trace_call(unsigned long ip, unsigned long parent_ip) { - int cpu, resched; + int cpu; if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = ftrace_preempt_disable(); + preempt_disable_notrace(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -127,7 +127,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - ftrace_preempt_enable(resched); + preempt_enable_notrace(); } static struct ftrace_ops trace_ops __read_mostly = @@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pF\n", (void *)addr); + return seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 34e3580..bac752f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,6 +23,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); +/* All syscall exit events have the same fields */ +static LIST_HEAD(syscall_exit_fields); + static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,9 +37,7 @@ syscall_get_enter_fields(struct ftrace_event_call *call) static struct list_head * syscall_get_exit_fields(struct ftrace_event_call *call) { - struct syscall_metadata *entry = call->data; - - return &entry->exit_fields; + return &syscall_exit_fields; } struct trace_event_functions enter_syscall_print_funcs = { diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c deleted file mode 100644 index a7974a5..0000000 --- a/kernel/trace/trace_sysprof.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * trace stack traces - * - * Copyright (C) 2004-2008, Soeren Sandmann - * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com> - * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> - */ -#include <linux/kallsyms.h> -#include <linux/debugfs.h> -#include <linux/hrtimer.h> -#include <linux/uaccess.h> -#include <linux/ftrace.h> -#include <linux/module.h> -#include <linux/irq.h> -#include <linux/fs.h> - -#include <asm/stacktrace.h> - -#include "trace.h" - -static struct trace_array *sysprof_trace; -static int __read_mostly tracer_enabled; - -/* - * 1 msec sample interval by default: - */ -static unsigned long sample_period = 1000000; -static const unsigned int sample_max_depth = 512; - -static DEFINE_MUTEX(sample_timer_lock); -/* - * Per CPU hrtimers that do the profiling: - */ -static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer); - -struct stack_frame { - const void __user *next_fp; - unsigned long return_address; -}; - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - int ret; - - if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) - return 0; - - ret = 1; - pagefault_disable(); - if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) - ret = 0; - pagefault_enable(); - - return ret; -} - -struct backtrace_info { - struct trace_array_cpu *data; - struct trace_array *tr; - int pos; -}; - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - /* Don't bother with IRQ stacks for now */ - return -1; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct backtrace_info *info = data; - - if (info->pos < sample_max_depth && reliable) { - __trace_special(info->tr, info->data, 1, addr, 0); - - info->pos++; - } -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; - -static int -trace_kernel(struct pt_regs *regs, struct trace_array *tr, - struct trace_array_cpu *data) -{ - struct backtrace_info info; - unsigned long bp; - char *stack; - - info.tr = tr; - info.data = data; - info.pos = 1; - - __trace_special(info.tr, info.data, 1, regs->ip, 0); - - stack = ((char *)regs + sizeof(struct pt_regs)); -#ifdef CONFIG_FRAME_POINTER - bp = regs->bp; -#else - bp = 0; -#endif - - dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info); - - return info.pos; -} - -static void timer_notify(struct pt_regs *regs, int cpu) -{ - struct trace_array_cpu *data; - struct stack_frame frame; - struct trace_array *tr; - const void __user *fp; - int is_user; - int i; - - if (!regs) - return; - - tr = sysprof_trace; - data = tr->data[cpu]; - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - __trace_special(tr, data, 0, 0, current->pid); - - if (!is_user) - i = trace_kernel(regs, tr, data); - else - i = 0; - - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm && i < sample_max_depth) { - regs = (struct pt_regs *)current->thread.sp0 - 1; - - fp = (void __user *)regs->bp; - - __trace_special(tr, data, 2, regs->ip, 0); - - while (i < sample_max_depth) { - frame.next_fp = NULL; - frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) - break; - if ((unsigned long)fp < regs->sp) - break; - - __trace_special(tr, data, 2, frame.return_address, - (unsigned long)fp); - fp = frame.next_fp; - - i++; - } - - } - - /* - * Special trace entry if we overflow the max depth: - */ - if (i == sample_max_depth) - __trace_special(tr, data, -1, -1, -1); - - __trace_special(tr, data, 3, current->pid, i); -} - -static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer) -{ - /* trace here */ - timer_notify(get_irq_regs(), smp_processor_id()); - - hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); - - return HRTIMER_RESTART; -} - -static void start_stack_timer(void *unused) -{ - struct hrtimer *hrtimer = &__get_cpu_var(stack_trace_hrtimer); - - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hrtimer->function = stack_trace_timer_fn; - - hrtimer_start(hrtimer, ns_to_ktime(sample_period), - HRTIMER_MODE_REL_PINNED); -} - -static void start_stack_timers(void) -{ - on_each_cpu(start_stack_timer, NULL, 1); -} - -static void stop_stack_timer(int cpu) -{ - struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu); - - hrtimer_cancel(hrtimer); -} - -static void stop_stack_timers(void) -{ - int cpu; - - for_each_online_cpu(cpu) - stop_stack_timer(cpu); -} - -static void stop_stack_trace(struct trace_array *tr) -{ - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - tracer_enabled = 0; - mutex_unlock(&sample_timer_lock); -} - -static int stack_trace_init(struct trace_array *tr) -{ - sysprof_trace = tr; - - tracing_start_cmdline_record(); - - mutex_lock(&sample_timer_lock); - start_stack_timers(); - tracer_enabled = 1; - mutex_unlock(&sample_timer_lock); - return 0; -} - -static void stack_trace_reset(struct trace_array *tr) -{ - tracing_stop_cmdline_record(); - stop_stack_trace(tr); -} - -static struct tracer stack_trace __read_mostly = -{ - .name = "sysprof", - .init = stack_trace_init, - .reset = stack_trace_reset, -#ifdef CONFIG_FTRACE_SELFTEST - .selftest = trace_selftest_startup_sysprof, -#endif -}; - -__init static int init_stack_trace(void) -{ - return register_tracer(&stack_trace); -} -device_initcall(init_stack_trace); - -#define MAX_LONG_DIGITS 22 - -static ssize_t -sysprof_sample_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - int r; - - r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -sysprof_sample_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[MAX_LONG_DIGITS]; - unsigned long val; - - if (cnt > MAX_LONG_DIGITS-1) - cnt = MAX_LONG_DIGITS-1; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - val = simple_strtoul(buf, NULL, 10); - /* - * Enforce a minimum sample period of 100 usecs: - */ - if (val < 100) - val = 100; - - mutex_lock(&sample_timer_lock); - stop_stack_timers(); - sample_period = val * 1000; - start_stack_timers(); - mutex_unlock(&sample_timer_lock); - - return cnt; -} - -static const struct file_operations sysprof_sample_fops = { - .read = sysprof_sample_read, - .write = sysprof_sample_write, -}; - -void init_tracer_sysprof_debugfs(struct dentry *d_tracer) -{ - - trace_create_file("sysprof_sample_period", 0644, - d_tracer, NULL, &sysprof_sample_fops); -} diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc379..209b379 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; @@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - return 0; no_creation: diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3ec..e95ee7f 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,6 +25,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/jump_label.h> extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - elem->state = active; + if (!elem->state && active) { + jump_label_enable(&elem->state); + elem->state = active; + } else if (elem->state && !active) { + jump_label_disable(&elem->state); + elem->state = active; + } } /* @@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) if (elem->unregfunc && elem->state) elem->unregfunc(); - elem->state = 0; + if (elem->state) { + jump_label_disable(&elem->state); + elem->state = 0; + } rcu_assign_pointer(elem->funcs, NULL); } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b2d70d3..2591583 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 0000000..bafba68 --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,566 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents in where on a CPU + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/lockdep.h> +#include <linux/notifier.h> +#include <linux/module.h> +#include <linux/sysctl.h> + +#include <asm/irq_regs.h> +#include <linux/perf_event.h> + +int watchdog_enabled; +int __read_mostly softlockup_thresh = 60; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +static int __initdata no_watchdog; + + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int hardlockup_panic; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + no_watchdog = 1; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert softlockup_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return softlockup_thresh / 5 * NSEC_PER_SEC; +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = smp_processor_id(); + + __get_cpu_var(watchdog_touch_ts) = get_timestamp(this_cpu); +} + +void touch_softlockup_watchdog(void) +{ + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? + * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void touch_nmi_watchdog(void) +{ + if (watchdog_enabled) { + unsigned cpu; + + for_each_present_cpu(cpu) { + if (per_cpu(watchdog_nmi_touch, cpu) != true) + per_cpu(watchdog_nmi_touch, cpu) = true; + } + } + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +#endif + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* watchdog detector functions */ +static int is_hardlockup(void) +{ + unsigned long hrint = __get_cpu_var(hrtimer_interrupts); + + if (__get_cpu_var(hrtimer_interrupts_saved) == hrint) + return 1; + + __get_cpu_var(hrtimer_interrupts_saved) = hrint; + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts) +{ + unsigned long now = get_timestamp(smp_processor_id()); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + softlockup_thresh)) + return now - touch_ts; + + return 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__get_cpu_var(watchdog_nmi_touch) == true) { + __get_cpu_var(watchdog_nmi_touch) = false; + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__get_cpu_var(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __get_cpu_var(hard_watchdog_warn) = true; + return; + } + + __get_cpu_var(hard_watchdog_warn) = false; + return; +} +static void watchdog_interrupt_count(void) +{ + __get_cpu_var(hrtimer_interrupts)++; +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__get_cpu_var(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(__get_cpu_var(softlockup_touch_sync))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + __get_cpu_var(softlockup_touch_sync) = false; + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts); + if (unlikely(duration)) { + /* only warn once */ + if (__get_cpu_var(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + smp_processor_id(), duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __get_cpu_var(soft_watchdog_warn) = true; + } else + __get_cpu_var(soft_watchdog_warn) = false; + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *unused) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per second to reset the softlockup timestamp. + * If this gets delayed for more than 60 seconds then the + * debug-printout triggers in watchdog_timer_fn(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? */ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* prepare/enable/disable routines */ +static int watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; + + return 0; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + int err; + + /* enable the perf event */ + err = watchdog_nmi_enable(cpu); + if (err) + return err; + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + return PTR_ERR(p); + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; + + return 0; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + int result = 0; + + for_each_online_cpu(cpu) + result += watchdog_enable(cpu); + + if (result) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + if (no_watchdog) + return; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog + */ + +int proc_dowatchdog_enabled(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, buffer, length, ppos); + + if (watchdog_enabled) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + return 0; +} + +int proc_dowatchdog_thresh(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = watchdog_prepare_cpu(hotcpu); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + err = watchdog_enable(hotcpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return notifier_from_errno(err); +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static int __init spawn_watchdog_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + if (no_watchdog) + return 0; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(notifier_to_errno(err)); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + return 0; +} +early_initcall(spawn_watchdog_task); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 327d2de..f77afd9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,19 +1,26 @@ /* - * linux/kernel/workqueue.c + * kernel/workqueue.c - generic async execution with shared worker pool * - * Generic mechanism for defining kernel helper threads for running - * arbitrary tasks in process context. + * Copyright (C) 2002 Ingo Molnar * - * Started by Ingo Molnar, Copyright (C) 2002 + * Derived from the taskqueue/keventd code by: + * David Woodhouse <dwmw2@infradead.org> + * Andrew Morton + * Kai Petzke <wpp@marie.physik.tu-berlin.de> + * Theodore Ts'o <tytso@mit.edu> * - * Derived from the taskqueue/keventd code by: + * Made to use alloc_percpu by Christoph Lameter. * - * David Woodhouse <dwmw2@infradead.org> - * Andrew Morton - * Kai Petzke <wpp@marie.physik.tu-berlin.de> - * Theodore Ts'o <tytso@mit.edu> + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> * - * Made to use alloc_percpu by Christoph Lameter. + * This is the generic async execution mechanism. Work items as are + * executed in process context. The worker pool is shared and + * automatically managed. There is one worker pool for each CPU and + * one extra for works which are better served by workers which are + * not bound to any specific CPU. + * + * Please read Documentation/workqueue.txt for details. */ #include <linux/module.h> @@ -33,41 +40,291 @@ #include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> +#include <linux/idr.h> + #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#include "workqueue_sched.h" + +enum { + /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + + /* worker flags */ + WORKER_STARTED = 1 << 0, /* started */ + WORKER_DIE = 1 << 1, /* die die die */ + WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ + + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ + BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, + BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100, /* call for help after 10ms */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breath after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only on emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, +}; + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only for + * everyone else. + * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * + * L: gcwq->lock protected. Access with gcwq->lock held. + * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * If GCWQ_DISASSOCIATED is set, it's identical to L. + * + * F: wq->flush_mutex protected. + * + * W: workqueue_lock protected. + */ + +struct global_cwq; + /* - * The per-CPU workqueue (if single thread, we always use the first - * possible cpu). + * The poor guys doing the actual heavy lifting. All on-duty workers + * are either serving the manager role, on idle list or on busy hash. */ -struct cpu_workqueue_struct { +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; - spinlock_t lock; + struct work_struct *current_work; /* L: work being processed */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct global_cwq *gcwq; /* I: the associated gcwq */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ +}; - struct list_head worklist; - wait_queue_head_t more_work; - struct work_struct *current_work; +/* + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. + */ +struct global_cwq { + spinlock_t lock; /* the gcwq lock */ + struct list_head worklist; /* L: list of pending works */ + unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ - struct workqueue_struct *wq; - struct task_struct *thread; -} ____cacheline_aligned; + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + /* workers are chained either in the idle_list or busy_hash */ + struct list_head idle_list; /* X: list of idle workers */ + struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* L: hash of busy workers */ + + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + + struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ +} ____cacheline_aligned_in_smp; + +/* + * The per-CPU workqueue. The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. + */ +struct cpu_workqueue_struct { + struct global_cwq *gcwq; /* I: the associated gcwq */ + struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* L: max active works */ + struct list_head delayed_works; /* L: delayed works */ +}; + +/* + * Structure used to wait for workqueue flush. + */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + +/* + * All cpumasks are assumed to be always set on UP and thus can't be + * used to determine whether there's something to be done. + */ +#ifdef CONFIG_SMP +typedef cpumask_var_t mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) \ + cpumask_test_and_set_cpu((cpu), (mask)) +#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) +#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) +#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) +#define free_mayday_mask(mask) free_cpumask_var((mask)) +#else +typedef unsigned long mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) +#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) +#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) +#define alloc_mayday_mask(maskp, gfp) true +#define free_mayday_mask(mask) do { } while (0) +#endif /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: */ struct workqueue_struct { - struct cpu_workqueue_struct *cpu_wq; - struct list_head list; - const char *name; - int singlethread; - int freezeable; /* Freeze threads during suspend */ - int rt; + unsigned int flags; /* I: WQ_* flags */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct *single; + unsigned long v; + } cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + + mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + + int saved_max_active; /* W: saved cwq max_active */ + const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); + +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +/* + * CPU iterators + * + * An extra gcwq is defined for an invalid cpu number + * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any + * specific CPU. The following iterators are similar to + * for_each_*_cpu() iterators but also considers the unbound gcwq. + * + * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * WORK_CPU_UNBOUND for unbound workqueues + */ +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + +#ifdef CONFIG_LOCKDEP +/** + * in_workqueue_context() - in context of specified workqueue? + * @wq: the workqueue of interest + * + * Checks lockdep state to see if the current task is executing from + * within a workqueue item. This function exists only if lockdep is + * enabled. + */ +int in_workqueue_context(struct workqueue_struct *wq) +{ + return lock_is_held(&wq->lockdep_map); +} +#endif + #ifdef CONFIG_DEBUG_OBJECTS_WORK static struct debug_obj_descr work_debug_descr; @@ -107,7 +364,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) { + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { debug_object_init(work, &work_debug_descr); debug_object_activate(work, &work_debug_descr); return 0; @@ -181,94 +438,582 @@ static inline void debug_work_deactivate(struct work_struct *work) { } /* Serializes the accesses to the list of workqueues. */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ -static int singlethread_cpu __read_mostly; -static const struct cpumask *cpu_singlethread_map __read_mostly; /* - * _cpu_down() first removes CPU from cpu_online_map, then CPU_DEAD - * flushes cwq->worklist. This means that flush_workqueue/wait_on_work - * which comes in between can't use for_each_online_cpu(). We could - * use cpu_possible_map, the cpumask below is more a documentation - * than optimization. + * The almighty global cpu workqueues. nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. */ -static cpumask_var_t cpu_populated_map __read_mostly; +static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); + +/* + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. + */ +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ + +static int worker_thread(void *__worker); + +static struct global_cwq *get_gcwq(unsigned int cpu) +{ + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; +} -/* If it's single threaded, it isn't in the list of workqueues. */ -static inline int is_wq_single_threaded(struct workqueue_struct *wq) +static atomic_t *get_gcwq_nr_running(unsigned int cpu) { - return wq->singlethread; + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; +} + +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) +{ + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#else + return wq->cpu_wq.single; +#endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; } -static const struct cpumask *wq_cpu_map(struct workqueue_struct *wq) +static unsigned int work_color_to_flags(int color) { - return is_wq_single_threaded(wq) - ? cpu_singlethread_map : cpu_populated_map; + return color << WORK_STRUCT_COLOR_SHIFT; } -static -struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu) +static int get_work_color(struct work_struct *work) { - if (unlikely(is_wq_single_threaded(wq))) - cpu = singlethread_cpu; - return per_cpu_ptr(wq->cpu_wq, cpu); + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; } /* - * Set the workqueue on which a work item is to be run - * - Must *only* be called if the pending flag is set + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. + * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. */ -static inline void set_wq_data(struct work_struct *work, - struct cpu_workqueue_struct *cwq) +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) { - unsigned long new; - BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} - new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING); - new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); - atomic_long_set(&work->data, new); +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +} + +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + unsigned int cpu; + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == WORK_CPU_NONE) + return NULL; + + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); + return get_gcwq(cpu); +} + +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; } /* - * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + * Need to wake up a worker? Called from anything but currently + * running workers. */ -static inline void clear_wq_data(struct work_struct *work) +static bool need_more_worker(struct global_cwq *gcwq) { - unsigned long flags = *work_data_bits(work) & - (1UL << WORK_STRUCT_STATIC); - atomic_long_set(&work->data, flags); + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); } -static inline -struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) { - return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); + return gcwq->nr_idle; } +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1; +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? */ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; + + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; +} + +/* + * Wake up functions. + */ + +/* Return the first worker. Safe with preemption disabled */ +static struct worker *first_worker(struct global_cwq *gcwq) +{ + if (unlikely(list_empty(&gcwq->idle_list))) + return NULL; + + return list_first_entry(&gcwq->idle_list, struct worker, entry); +} + +/** + * wake_up_worker - wake up an idle worker + * @gcwq: gcwq to wake worker for + * + * Wake up the first idle worker of @gcwq. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void wake_up_worker(struct global_cwq *gcwq) +{ + struct worker *worker = first_worker(gcwq); + + if (likely(worker)) + wake_up_process(worker->task); +} + +/** + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (likely(!(worker->flags & WORKER_NOT_RUNNING))) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. + */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (unlikely(worker->flags & WORKER_NOT_RUNNING)) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * @wakeup: wakeup an idle worker if necessary + * + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags, + bool wakeup) +{ + struct global_cwq *gcwq = worker->gcwq; + + WARN_ON_ONCE(worker->task != current); + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. + */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + + worker->flags |= flags; +} + +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* if transitioning out of NOT_RUNNING, increment nr_running */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); +} + +/** + * busy_worker_head - return the busy hash head for a work + * @gcwq: gcwq of interest + * @work: work to be hashed + * + * Return hash head of @gcwq for @work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to the hash head. + */ +static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, + struct work_struct *work) +{ + const int base_shift = ilog2(sizeof(struct work_struct)); + unsigned long v = (unsigned long)work; + + /* simple shift and fold hash, do we need something better? */ + v >>= base_shift; + v += v >> BUSY_WORKER_HASH_ORDER; + v &= BUSY_WORKER_HASH_MASK; + + return &gcwq->busy_hash[v]; +} + +/** + * __find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @bwh: hash head as returned by busy_worker_head() + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. @bwh should be + * the hash head obtained by calling busy_worker_head() with the same + * work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, + struct hlist_head *bwh, + struct work_struct *work) +{ + struct worker *worker; + struct hlist_node *tmp; + + hlist_for_each_entry(worker, tmp, bwh, hentry) + if (worker->current_work == work) + return worker; + return NULL; +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. This function is + * identical to __find_worker_executing_work() except that this + * function calculates @bwh itself. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct global_cwq *gcwq, + struct work_struct *work) +{ + return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), + work); +} + +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to inserstion position. + */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + +/** + * insert_work - insert a work into gcwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work which belongs to @cwq into @gcwq after @head. + * @extra_flags is or'd to work_struct flags. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work, struct list_head *head) + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) { - trace_workqueue_insertion(cwq->thread, work); + struct global_cwq *gcwq = cwq->gcwq; + + /* we own @work, set data and link */ + set_work_cwq(work, cwq, extra_flags); - set_wq_data(work, cwq); /* * Ensure that we get the right work->data if we see the * result of list_add() below, see try_to_grab_pending(). */ smp_wmb(); + list_add_tail(&work->entry, head); - wake_up(&cwq->more_work); + + /* + * Ensure either worker_sched_deactivated() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. + */ + smp_mb(); + + if (__need_more_worker(gcwq)) + wake_up_worker(gcwq); } -static void __queue_work(struct cpu_workqueue_struct *cwq, +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct work_struct *work) { + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; + unsigned int work_flags; unsigned long flags; debug_work_activate(work); - spin_lock_irqsave(&cwq->lock, flags); - insert_work(cwq, work, &cwq->worklist); - spin_unlock_irqrestore(&cwq->lock, flags); + + if (WARN_ON_ONCE(wq->flags & WQ_DYING)) + return; + + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *last_gcwq; + + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. + */ + gcwq = get_gcwq(cpu); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); + + if (likely(cwq->nr_active < cwq->max_active)) { + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else { + work_flags |= WORK_STRUCT_DELAYED; + worklist = &cwq->delayed_works; + } + + insert_work(cwq, work, worklist, work_flags); + + spin_unlock_irqrestore(&gcwq->lock, flags); } /** @@ -308,9 +1053,8 @@ queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) { int ret = 0; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { - BUG_ON(!list_empty(&work->entry)); - __queue_work(wq_per_cpu(wq, cpu), work); + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); ret = 1; } return ret; @@ -320,10 +1064,9 @@ EXPORT_SYMBOL_GPL(queue_work_on); static void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; - struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work); - struct workqueue_struct *wq = cwq->wq; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); - __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work); + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); } /** @@ -360,14 +1103,31 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct timer_list *timer = &dwork->timer; struct work_struct *work = &dwork->work; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + unsigned int lcpu; + BUG_ON(timer_pending(timer)); BUG_ON(!list_empty(&work->entry)); timer_stats_timer_set_start_info(&dwork->timer); - /* This stores cwq for the moment, for the timer_fn */ - set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id())); + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. + */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + timer->expires = jiffies + delay; timer->data = (unsigned long)dwork; timer->function = delayed_work_timer_fn; @@ -382,80 +1142,888 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -static void run_workqueue(struct cpu_workqueue_struct *cwq) +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_enter_idle(struct worker *worker) { - spin_lock_irq(&cwq->lock); - while (!list_empty(&cwq->worklist)) { - struct work_struct *work = list_entry(cwq->worklist.next, - struct work_struct, entry); - work_func_t f = work->func; -#ifdef CONFIG_LOCKDEP + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(worker->flags & WORKER_IDLE); + BUG_ON(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev)); + + /* can't use worker_set_flags(), also called from start_worker() */ + worker->flags |= WORKER_IDLE; + gcwq->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &gcwq->idle_list); + + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); + + /* sanity check nr_running */ + WARN_ON_ONCE(gcwq->nr_workers == gcwq->nr_idle && + atomic_read(get_gcwq_nr_running(gcwq->cpu))); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(!(worker->flags & WORKER_IDLE)); + worker_clr_flags(worker, WORKER_IDLE); + gcwq->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +__acquires(&gcwq->lock) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { /* - * It is permissible to free the struct work_struct - * from inside the function that is called from it, - * this we need to take into account for lockdep too. - * To avoid bogus "held lock freed" warnings as well - * as problems when looking into work->lockdep_map, - * make a copy and use that here. + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. */ - struct lockdep_map lockdep_map = work->lockdep_map; -#endif - trace_workqueue_execution(cwq->thread, work); - debug_work_deactivate(work); - cwq->current_work = work; - list_del_init(cwq->worklist.next); - spin_unlock_irq(&cwq->lock); - - BUG_ON(get_wq_data(work) != cwq); - work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_acquire(&lockdep_map); - f(work); - lock_map_release(&lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { - printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), - task_pid_nr(current)); - printk(KERN_ERR " last function: "); - print_symbol("%s\n", (unsigned long)f); - debug_show_held_locks(current); - dump_stack(); + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(¤t->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* CPU has come up inbetween, retry migration */ + cpu_relax(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + +static struct worker *alloc_worker(void) +{ + struct worker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) { + INIT_LIST_HEAD(&worker->entry); + INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; + } + return worker; +} + +/** + * create_worker - create a new workqueue worker + * @gcwq: gcwq the new worker will belong to + * @bind: whether to set affinity to @cpu or not + * + * Create a new worker which is bound to @gcwq. The returned worker + * can be started by calling start_worker() or destroyed using + * destroy_worker(). + * + * CONTEXT: + * Might sleep. Does GFP_KERNEL allocations. + * + * RETURNS: + * Pointer to the newly created worker. + */ +static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +{ + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker *worker = NULL; + int id = -1; + + spin_lock_irq(&gcwq->lock); + while (ida_get_new(&gcwq->worker_ida, &id)) { + spin_unlock_irq(&gcwq->lock); + if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + goto fail; + spin_lock_irq(&gcwq->lock); + } + spin_unlock_irq(&gcwq->lock); + + worker = alloc_worker(); + if (!worker) + goto fail; + + worker->gcwq = gcwq; + worker->id = id; + + if (!on_unbound_cpu) + worker->task = kthread_create(worker_thread, worker, + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); + if (IS_ERR(worker->task)) + goto fail; + + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. + */ + if (bind && !on_unbound_cpu) + kthread_bind(worker->task, gcwq->cpu); + else { + worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } + + return worker; +fail: + if (id >= 0) { + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); + spin_unlock_irq(&gcwq->lock); + } + kfree(worker); + return NULL; +} + +/** + * start_worker - start a newly created worker + * @worker: worker to start + * + * Make the gcwq aware of @worker and start it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void start_worker(struct worker *worker) +{ + worker->flags |= WORKER_STARTED; + worker->gcwq->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); +} + +/** + * destroy_worker - destroy a workqueue worker + * @worker: worker to be destroyed + * + * Destroy @worker and adjust @gcwq stats accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void destroy_worker(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + int id = worker->id; + + /* sanity check frenzy */ + BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); + + if (worker->flags & WORKER_STARTED) + gcwq->nr_workers--; + if (worker->flags & WORKER_IDLE) + gcwq->nr_idle--; + + list_del_init(&worker->entry); + worker->flags |= WORKER_DIE; + + spin_unlock_irq(&gcwq->lock); + + kthread_stop(worker->task); + kfree(worker); + + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); +} + +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); } + } - spin_lock_irq(&cwq->lock); - cwq->current_work = NULL; + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); } - spin_unlock_irq(&cwq->lock); + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); } -static int worker_thread(void *__cwq) +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_create_worker(struct global_cwq *gcwq) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { - struct cpu_workqueue_struct *cwq = __cwq; - DEFINE_WAIT(wait); + if (!need_to_create_worker(gcwq)) + return false; +restart: + spin_unlock_irq(&gcwq->lock); + + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; + } + + if (!need_to_create_worker(gcwq)) + break; - if (cwq->wq->freezeable) - set_freezable(); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); - for (;;) { - prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); - if (!freezing(current) && - !kthread_should_stop() && - list_empty(&cwq->worklist)) - schedule(); - finish_wait(&cwq->more_work, &wait); + if (!need_to_create_worker(gcwq)) + break; + } + + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; +} + +/** + * maybe_destroy_worker - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) +{ + bool ret = false; - try_to_freeze(); + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; - if (kthread_should_stop()) + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); break; + } - run_workqueue(cwq); + destroy_worker(worker); + ret = true; } - return 0; + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to. At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. + */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out paramter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) +{ + struct work_struct *work = list_first_entry(&cwq->delayed_works, + struct work_struct, entry); + struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + + move_linked_works(work, pos, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); + cwq->nr_active++; +} + +/** + * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight + * @cwq: cwq of interest + * @color: color of work which left the queue + * @delayed: for a delayed work + * + * A work either has completed or is removed from pending queue, + * decrement nr_in_flight of its cwq and handle workqueue flushing. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, + bool delayed) +{ + /* ignore uncolored works */ + if (color == WORK_NO_COLOR) + return; + + cwq->nr_in_flight[color]--; + + if (!delayed) { + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + } + + /* is flush in progress and are we at the flushing tip? */ + if (likely(cwq->flush_color != color)) + return; + + /* are there still in-flight works? */ + if (cwq->nr_in_flight[color]) + return; + + /* this cwq is done, clear flush_color */ + cwq->flush_color = -1; + + /* + * If this was the last cwq, wake up the first flusher. It + * will handle the rest. + */ + if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) + complete(&cwq->wq->first_flusher->done); +} + +/** + * process_one_work - process single work + * @worker: self + * @work: work to process + * + * Process @work. This function contains all the logics necessary to + * process a single work including synchronization against and + * interaction with other workers on the same cpu, queueing and + * flushing. As long as context requirement is met, any worker can + * call this function to process a work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. + */ +static void process_one_work(struct worker *worker, struct work_struct *work) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct global_cwq *gcwq = cwq->gcwq; + struct hlist_head *bwh = busy_worker_head(gcwq, work); + bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE; + work_func_t f = work->func; + int work_color; + struct worker *collision; +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the struct work_struct from + * inside the function that is called from it, this we need to + * take into account for lockdep too. To avoid bogus "held + * lock freed" warnings as well as problems when looking into + * work->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = work->lockdep_map; +#endif + /* + * A single work shouldn't be executed concurrently by + * multiple workers on a single cpu. Check whether anyone is + * already processing the work. If so, defer the work to the + * currently executing one. + */ + collision = __find_worker_executing_work(gcwq, bwh, work); + if (unlikely(collision)) { + move_linked_works(work, &collision->scheduled, NULL); + return; + } + + /* claim and process */ + debug_work_deactivate(work); + hlist_add_head(&worker->hentry, bwh); + worker->current_work = work; + worker->current_cwq = cwq; + work_color = get_work_color(work); + + /* record the current cpu number in the work data and dequeue */ + set_work_cpu(work, gcwq->cpu); + list_del_init(&work->entry); + + /* + * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI, + * wake up another worker; otherwise, clear HIGHPRI_PENDING. + */ + if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) { + struct work_struct *nwork = list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (!list_empty(&gcwq->worklist) && + get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI) + wake_up_worker(gcwq); + else + gcwq->flags &= ~GCWQ_HIGHPRI_PENDING; + } + + /* + * CPU intensive works don't participate in concurrency + * management. They're the scheduler's responsibility. + */ + if (unlikely(cpu_intensive)) + worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); + + spin_unlock_irq(&gcwq->lock); + + work_clear_pending(work); + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire(&lockdep_map); + trace_workqueue_execute_start(work); + f(work); + /* + * While we must be careful to not use "work" after this, the trace + * point will only record its address. + */ + trace_workqueue_execute_end(work); + lock_map_release(&lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); + + if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { + printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " + "%s/0x%08x/%d\n", + current->comm, preempt_count(), task_pid_nr(current)); + printk(KERN_ERR " last function: "); + print_symbol("%s\n", (unsigned long)f); + debug_show_held_locks(current); + dump_stack(); + } + + spin_lock_irq(&gcwq->lock); + + /* clear cpu intensive status */ + if (unlikely(cpu_intensive)) + worker_clr_flags(worker, WORKER_CPU_INTENSIVE); + + /* we're done with it, release */ + hlist_del_init(&worker->hentry); + worker->current_work = NULL; + worker->current_cwq = NULL; + cwq_dec_nr_in_flight(cwq, work_color, false); +} + +/** + * process_scheduled_works - process scheduled works + * @worker: self + * + * Process all scheduled works. Please note that the scheduled list + * may change while processing a work, so this function repeatedly + * fetches a work from the top and executes it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. + */ +static void process_scheduled_works(struct worker *worker) +{ + while (!list_empty(&worker->scheduled)) { + struct work_struct *work = list_first_entry(&worker->scheduled, + struct work_struct, entry); + process_one_work(worker, work); + } +} + +/** + * worker_thread - the worker thread function + * @__worker: self + * + * The gcwq worker thread function. There's a single dynamic pool of + * these per each cpu. These workers process all works regardless of + * their specific target workqueue. The only exception is works which + * belong to workqueues with a rescuer which will be explained in + * rescuer_thread(). + */ +static int worker_thread(void *__worker) +{ + struct worker *worker = __worker; + struct global_cwq *gcwq = worker->gcwq; + + /* tell the scheduler that this is a workqueue worker */ + worker->task->flags |= PF_WQ_WORKER; +woke_up: + spin_lock_irq(&gcwq->lock); + + /* DIE can be set only while we're idle, checking here is enough */ + if (worker->flags & WORKER_DIE) { + spin_unlock_irq(&gcwq->lock); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; + } + + worker_leave_idle(worker); +recheck: + /* no more worker necessary? */ + if (!need_more_worker(gcwq)) + goto sleep; + + /* do we need to manage? */ + if (unlikely(!may_start_working(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * ->scheduled list can only be filled while a worker is + * preparing to process a work or actually processing it. + * Make sure nobody diddled with it while I was sleeping. + */ + BUG_ON(!list_empty(&worker->scheduled)); + + /* + * When control reaches this point, we're guaranteed to have + * at least one idle worker or that someone else has already + * assumed the manager role. + */ + worker_clr_flags(worker, WORKER_PREP); + + do { + struct work_struct *work = + list_first_entry(&gcwq->worklist, + struct work_struct, entry); + + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { + /* optimization path, not strictly necessary */ + process_one_work(worker, work); + if (unlikely(!list_empty(&worker->scheduled))) + process_scheduled_works(worker); + } else { + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); + } + } while (keep_working(gcwq)); + + worker_set_flags(worker, WORKER_PREP, false); +sleep: + if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker)) + goto recheck; + + /* + * gcwq->lock is held and there's no work to process and no + * need to manage, sleep. Workers are woken up only while + * holding gcwq->lock or from local cpu, so setting the + * current state before releasing gcwq->lock is enough to + * prevent losing any event. + */ + worker_enter_idle(worker); + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&gcwq->lock); + schedule(); + goto woke_up; +} + +/** + * rescuer_thread - the rescuer thread function + * @__wq: the associated workqueue + * + * Workqueue rescuer thread function. There's one rescuer for each + * workqueue which has WQ_RESCUER set. + * + * Regular work processing on a gcwq may block trying to create a new + * worker which uses GFP_KERNEL allocation which has slight chance of + * developing into deadlock if some works currently on the same queue + * need to be processed to satisfy the GFP_KERNEL allocation. This is + * the problem rescuer solves. + * + * When such condition is possible, the gcwq summons rescuers of all + * workqueues which have works queued on the gcwq and let them process + * those works so that forward progress can be guaranteed. + * + * This should happen rarely. + */ +static int rescuer_thread(void *__wq) +{ + struct workqueue_struct *wq = __wq; + struct worker *rescuer = wq->rescuer; + struct list_head *scheduled = &rescuer->scheduled; + bool is_unbound = wq->flags & WQ_UNBOUND; + unsigned int cpu; + + set_user_nice(current, RESCUER_NICE_LEVEL); +repeat: + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) + return 0; + + /* + * See whether any cpu is asking for help. Unbounded + * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. + */ + for_each_mayday_cpu(cpu, wq->mayday_mask) { + unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; + struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + struct work_struct *work, *n; + + __set_current_state(TASK_RUNNING); + mayday_clear_cpu(cpu, wq->mayday_mask); + + /* migrate to the target cpu if possible */ + rescuer->gcwq = gcwq; + worker_maybe_bind_and_lock(rescuer); + + /* + * Slurp in all works issued via this workqueue and + * process'em. + */ + BUG_ON(!list_empty(&rescuer->scheduled)); + list_for_each_entry_safe(work, n, &gcwq->worklist, entry) + if (get_work_cwq(work) == cwq) + move_linked_works(work, scheduled, &n); + + process_scheduled_works(rescuer); + spin_unlock_irq(&gcwq->lock); + } + + schedule(); + goto repeat; } struct wq_barrier { @@ -469,44 +2037,137 @@ static void wq_barrier_func(struct work_struct *work) complete(&barr->done); } +/** + * insert_wq_barrier - insert a barrier work + * @cwq: cwq to insert barrier into + * @barr: wq_barrier to insert + * @target: target work to attach @barr to + * @worker: worker currently executing @target, NULL if @target is not executing + * + * @barr is linked to @target such that @barr is completed only after + * @target finishes execution. Please note that the ordering + * guarantee is observed only with respect to @target and on the local + * cpu. + * + * Currently, a queued barrier can't be canceled. This is because + * try_to_grab_pending() can't determine whether the work to be + * grabbed is at the head of the queue and thus can't clear LINKED + * flag of the previous work while there must be a valid next work + * after a work with LINKED flag set. + * + * Note that when @worker is non-NULL, @target may be modified + * underneath us, so we can't reliably determine cwq from @target. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, - struct wq_barrier *barr, struct list_head *head) + struct wq_barrier *barr, + struct work_struct *target, struct worker *worker) { + struct list_head *head; + unsigned int linked = 0; + /* - * debugobject calls are safe here even with cwq->lock locked + * debugobject calls are safe here even with gcwq->lock locked * as we know for sure that this will not trigger any of the * checks and call back into the fixup functions where we * might deadlock. */ INIT_WORK_ON_STACK(&barr->work, wq_barrier_func); - __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); - + __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + /* + * If @target is currently being executed, schedule the + * barrier to the worker; otherwise, put it after @target. + */ + if (worker) + head = worker->scheduled.next; + else { + unsigned long *bits = work_data_bits(target); + + head = target->entry.next; + /* there can already be other linked works, inherit and set */ + linked = *bits & WORK_STRUCT_LINKED; + __set_bit(WORK_STRUCT_LINKED_BIT, bits); + } + debug_work_activate(&barr->work); - insert_work(cwq, &barr->work, head); + insert_work(cwq, &barr->work, head, + work_color_to_flags(WORK_NO_COLOR) | linked); } -static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) +/** + * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing + * @wq: workqueue being flushed + * @flush_color: new flush color, < 0 for no-op + * @work_color: new work color, < 0 for no-op + * + * Prepare cwqs for workqueue flushing. + * + * If @flush_color is non-negative, flush_color on all cwqs should be + * -1. If no cwq has in-flight commands at the specified color, all + * cwq->flush_color's stay at -1 and %false is returned. If any cwq + * has in flight commands, its cwq->flush_color is set to + * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq + * wakeup logic is armed and %true is returned. + * + * The caller should have initialized @wq->first_flusher prior to + * calling this function with non-negative @flush_color. If + * @flush_color is negative, no flush color update is done and %false + * is returned. + * + * If @work_color is non-negative, all cwqs should have the same + * work_color which is previous to @work_color and all will be + * advanced to @work_color. + * + * CONTEXT: + * mutex_lock(wq->flush_mutex). + * + * RETURNS: + * %true if @flush_color >= 0 and there's something to flush. %false + * otherwise. + */ +static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq, + int flush_color, int work_color) { - int active = 0; - struct wq_barrier barr; - - WARN_ON(cwq->thread == current); + bool wait = false; + unsigned int cpu; - spin_lock_irq(&cwq->lock); - if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { - insert_wq_barrier(cwq, &barr, &cwq->worklist); - active = 1; + if (flush_color >= 0) { + BUG_ON(atomic_read(&wq->nr_cwqs_to_flush)); + atomic_set(&wq->nr_cwqs_to_flush, 1); } - spin_unlock_irq(&cwq->lock); - if (active) { - wait_for_completion(&barr.done); - destroy_work_on_stack(&barr.work); + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = cwq->gcwq; + + spin_lock_irq(&gcwq->lock); + + if (flush_color >= 0) { + BUG_ON(cwq->flush_color != -1); + + if (cwq->nr_in_flight[flush_color]) { + cwq->flush_color = flush_color; + atomic_inc(&wq->nr_cwqs_to_flush); + wait = true; + } + } + + if (work_color >= 0) { + BUG_ON(work_color != work_next_color(cwq->work_color)); + cwq->work_color = work_color; + } + + spin_unlock_irq(&gcwq->lock); } - return active; + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush)) + complete(&wq->first_flusher->done); + + return wait; } /** @@ -518,20 +2179,150 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) * * We sleep until all works which were queued on entry have been handled, * but we are not livelocked by new incoming ones. - * - * This function used to run the workqueues itself. Now we just wait for the - * helper threads to do it. */ void flush_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + struct wq_flusher this_flusher = { + .list = LIST_HEAD_INIT(this_flusher.list), + .flush_color = -1, + .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done), + }; + int next_color; - might_sleep(); lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - for_each_cpu(cpu, cpu_map) - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); + + mutex_lock(&wq->flush_mutex); + + /* + * Start-to-wait phase + */ + next_color = work_next_color(wq->work_color); + + if (next_color != wq->flush_color) { + /* + * Color space is not full. The current work_color + * becomes our flush_color and work_color is advanced + * by one. + */ + BUG_ON(!list_empty(&wq->flusher_overflow)); + this_flusher.flush_color = wq->work_color; + wq->work_color = next_color; + + if (!wq->first_flusher) { + /* no flush in progress, become the first flusher */ + BUG_ON(wq->flush_color != this_flusher.flush_color); + + wq->first_flusher = &this_flusher; + + if (!flush_workqueue_prep_cwqs(wq, wq->flush_color, + wq->work_color)) { + /* nothing to flush, done */ + wq->flush_color = next_color; + wq->first_flusher = NULL; + goto out_unlock; + } + } else { + /* wait in queue */ + BUG_ON(wq->flush_color == this_flusher.flush_color); + list_add_tail(&this_flusher.list, &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + } else { + /* + * Oops, color space is full, wait on overflow queue. + * The next flush completion will assign us + * flush_color and transfer to flusher_queue. + */ + list_add_tail(&this_flusher.list, &wq->flusher_overflow); + } + + mutex_unlock(&wq->flush_mutex); + + wait_for_completion(&this_flusher.done); + + /* + * Wake-up-and-cascade phase + * + * First flushers are responsible for cascading flushes and + * handling overflow. Non-first flushers can simply return. + */ + if (wq->first_flusher != &this_flusher) + return; + + mutex_lock(&wq->flush_mutex); + + /* we might have raced, check again with mutex held */ + if (wq->first_flusher != &this_flusher) + goto out_unlock; + + wq->first_flusher = NULL; + + BUG_ON(!list_empty(&this_flusher.list)); + BUG_ON(wq->flush_color != this_flusher.flush_color); + + while (true) { + struct wq_flusher *next, *tmp; + + /* complete all the flushers sharing the current flush color */ + list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) { + if (next->flush_color != wq->flush_color) + break; + list_del_init(&next->list); + complete(&next->done); + } + + BUG_ON(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); + + /* this flush_color is finished, advance by one */ + wq->flush_color = work_next_color(wq->flush_color); + + /* one color has been freed, handle overflow queue */ + if (!list_empty(&wq->flusher_overflow)) { + /* + * Assign the same color to all overflowed + * flushers, advance work_color and append to + * flusher_queue. This is the start-to-wait + * phase for these overflowed flushers. + */ + list_for_each_entry(tmp, &wq->flusher_overflow, list) + tmp->flush_color = wq->work_color; + + wq->work_color = work_next_color(wq->work_color); + + list_splice_tail_init(&wq->flusher_overflow, + &wq->flusher_queue); + flush_workqueue_prep_cwqs(wq, -1, wq->work_color); + } + + if (list_empty(&wq->flusher_queue)) { + BUG_ON(wq->flush_color != wq->work_color); + break; + } + + /* + * Need to flush more colors. Make the next flusher + * the new first flusher and arm cwqs. + */ + BUG_ON(wq->flush_color == wq->work_color); + BUG_ON(wq->flush_color != next->flush_color); + + list_del_init(&next->list); + wq->first_flusher = next; + + if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1)) + break; + + /* + * Meh... this color is already done, clear first + * flusher and repeat cascading. + */ + wq->first_flusher = NULL; + } + +out_unlock: + mutex_unlock(&wq->flush_mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -547,43 +2338,46 @@ EXPORT_SYMBOL_GPL(flush_workqueue); */ int flush_work(struct work_struct *work) { + struct worker *worker = NULL; + struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; - struct list_head *prev; struct wq_barrier barr; might_sleep(); - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return 0; - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); - - prev = NULL; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* * See the comment near try_to_grab_pending()->smp_rmb(). - * If it was re-queued under us we are not going to wait. + * If it was re-queued to a different gcwq under us, we + * are not going to wait. */ smp_rmb(); - if (unlikely(cwq != get_wq_data(work))) - goto out; - prev = &work->entry; + cwq = get_work_cwq(work); + if (unlikely(!cwq || gcwq != cwq->gcwq)) + goto already_gone; } else { - if (cwq->current_work != work) - goto out; - prev = &cwq->worklist; + worker = find_worker_executing_work(gcwq, work); + if (!worker) + goto already_gone; + cwq = worker->current_cwq; } - insert_wq_barrier(cwq, &barr, prev->next); -out: - spin_unlock_irq(&cwq->lock); - if (!prev) - return 0; + + insert_wq_barrier(cwq, &barr, work, worker); + spin_unlock_irq(&gcwq->lock); + + lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_release(&cwq->wq->lockdep_map); wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); return 1; +already_gone: + spin_unlock_irq(&gcwq->lock); + return 0; } EXPORT_SYMBOL_GPL(flush_work); @@ -593,54 +2387,56 @@ EXPORT_SYMBOL_GPL(flush_work); */ static int try_to_grab_pending(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; + struct global_cwq *gcwq; int ret = -1; - if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) return 0; /* * The queueing is in progress, or it is already queued. Try to * steal it from ->worklist without clearing WORK_STRUCT_PENDING. */ - - cwq = get_wq_data(work); - if (!cwq) + gcwq = get_work_gcwq(work); + if (!gcwq) return ret; - spin_lock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); if (!list_empty(&work->entry)) { /* - * This work is queued, but perhaps we locked the wrong cwq. + * This work is queued, but perhaps we locked the wrong gcwq. * In that case we must see the new value after rmb(), see * insert_work()->wmb(). */ smp_rmb(); - if (cwq == get_wq_data(work)) { + if (gcwq == get_work_gcwq(work)) { debug_work_deactivate(work); list_del_init(&work->entry); + cwq_dec_nr_in_flight(get_work_cwq(work), + get_work_color(work), + *work_data_bits(work) & WORK_STRUCT_DELAYED); ret = 1; } } - spin_unlock_irq(&cwq->lock); + spin_unlock_irq(&gcwq->lock); return ret; } -static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, - struct work_struct *work) +static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) { struct wq_barrier barr; - int running = 0; + struct worker *worker; - spin_lock_irq(&cwq->lock); - if (unlikely(cwq->current_work == work)) { - insert_wq_barrier(cwq, &barr, cwq->worklist.next); - running = 1; - } - spin_unlock_irq(&cwq->lock); + spin_lock_irq(&gcwq->lock); - if (unlikely(running)) { + worker = find_worker_executing_work(gcwq, work); + if (unlikely(worker)) + insert_wq_barrier(worker->current_cwq, &barr, work, worker); + + spin_unlock_irq(&gcwq->lock); + + if (unlikely(worker)) { wait_for_completion(&barr.done); destroy_work_on_stack(&barr.work); } @@ -648,9 +2444,6 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq, static void wait_on_work(struct work_struct *work) { - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - const struct cpumask *cpu_map; int cpu; might_sleep(); @@ -658,15 +2451,8 @@ static void wait_on_work(struct work_struct *work) lock_map_acquire(&work->lockdep_map); lock_map_release(&work->lockdep_map); - cwq = get_wq_data(work); - if (!cwq) - return; - - wq = cwq->wq; - cpu_map = wq_cpu_map(wq); - - for_each_cpu(cpu, cpu_map) - wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); + for_each_gcwq_cpu(cpu) + wait_on_cpu_work(get_gcwq(cpu), work); } static int __cancel_work_timer(struct work_struct *work, @@ -681,7 +2467,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - clear_wq_data(work); + clear_work_data(work); return ret; } @@ -727,8 +2513,6 @@ int cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -static struct workqueue_struct *keventd_wq __read_mostly; - /** * schedule_work - put work task in global workqueue * @work: job to be done @@ -742,7 +2526,7 @@ static struct workqueue_struct *keventd_wq __read_mostly; */ int schedule_work(struct work_struct *work) { - return queue_work(keventd_wq, work); + return queue_work(system_wq, work); } EXPORT_SYMBOL(schedule_work); @@ -755,7 +2539,7 @@ EXPORT_SYMBOL(schedule_work); */ int schedule_work_on(int cpu, struct work_struct *work) { - return queue_work_on(cpu, keventd_wq, work); + return queue_work_on(cpu, system_wq, work); } EXPORT_SYMBOL(schedule_work_on); @@ -770,7 +2554,7 @@ EXPORT_SYMBOL(schedule_work_on); int schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work(keventd_wq, dwork, delay); + return queue_delayed_work(system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work); @@ -783,9 +2567,8 @@ EXPORT_SYMBOL(schedule_delayed_work); void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { - struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); - __queue_work(cwq, &dwork->work); + __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq, + &dwork->work); put_cpu(); } flush_work(&dwork->work); @@ -804,7 +2587,7 @@ EXPORT_SYMBOL(flush_delayed_work); int schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { - return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); + return queue_delayed_work_on(cpu, system_wq, dwork, delay); } EXPORT_SYMBOL(schedule_delayed_work_on); @@ -820,8 +2603,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); int schedule_on_each_cpu(work_func_t func) { int cpu; - int orig = -1; - struct work_struct *works; + struct work_struct __percpu *works; works = alloc_percpu(struct work_struct); if (!works) @@ -829,23 +2611,12 @@ int schedule_on_each_cpu(work_func_t func) get_online_cpus(); - /* - * When running in keventd don't schedule a work item on - * itself. Can just call directly because the work queue is - * already bound. This also is faster. - */ - if (current_is_keventd()) - orig = raw_smp_processor_id(); - for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(works, cpu); INIT_WORK(work, func); - if (cpu != orig) - schedule_work_on(cpu, work); + schedule_work_on(cpu, work); } - if (orig >= 0) - func(per_cpu_ptr(works, orig)); for_each_online_cpu(cpu) flush_work(per_cpu_ptr(works, cpu)); @@ -881,7 +2652,7 @@ int schedule_on_each_cpu(work_func_t func) */ void flush_scheduled_work(void) { - flush_workqueue(keventd_wq); + flush_workqueue(system_wq); } EXPORT_SYMBOL(flush_scheduled_work); @@ -913,170 +2684,169 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); int keventd_up(void) { - return keventd_wq != NULL; + return system_wq != NULL; } -int current_is_keventd(void) +static int alloc_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq; - int cpu = raw_smp_processor_id(); /* preempt-safe: keventd is per-cpu */ - int ret = 0; - - BUG_ON(!keventd_wq); + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); - if (current == cwq->thread) - ret = 1; + if (percpu) + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + else { + void *ptr; - return ret; + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } + } + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 0 : -ENOMEM; } -static struct cpu_workqueue_struct * -init_cpu_workqueue(struct workqueue_struct *wq, int cpu) +static void free_cwqs(struct workqueue_struct *wq) { - struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - cwq->wq = wq; - spin_lock_init(&cwq->lock); - INIT_LIST_HEAD(&cwq->worklist); - init_waitqueue_head(&cwq->more_work); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif - return cwq; + if (percpu) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ + kfree(*(void **)(wq->cpu_wq.single + 1)); + } } -static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - struct workqueue_struct *wq = cwq->wq; - const char *fmt = is_wq_single_threaded(wq) ? "%s" : "%s/%d"; - struct task_struct *p; - - p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu); - /* - * Nobody can add the work_struct to this cwq, - * if (caller is __create_workqueue) - * nobody should see this wq - * else // caller is CPU_UP_PREPARE - * cpu is not on cpu_online_map - * so we can abort safely. - */ - if (IS_ERR(p)) - return PTR_ERR(p); - if (cwq->wq->rt) - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); - cwq->thread = p; + int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - trace_workqueue_creation(cwq->thread, cpu); + if (max_active < 1 || max_active > lim) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); - return 0; + return clamp_val(max_active, 1, lim); } -static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) { - struct task_struct *p = cwq->thread; + struct workqueue_struct *wq; + unsigned int cpu; - if (p != NULL) { - if (cpu >= 0) - kthread_bind(p, cpu); - wake_up_process(p); - } -} + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; -struct workqueue_struct *__create_workqueue_key(const char *name, - int singlethread, - int freezeable, - int rt, - struct lock_class_key *key, - const char *lock_name) -{ - struct workqueue_struct *wq; - struct cpu_workqueue_struct *cwq; - int err = 0, cpu; + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, name); wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) - return NULL; + goto err; - wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); - if (!wq->cpu_wq) { - kfree(wq); - return NULL; - } + wq->flags = flags; + wq->saved_max_active = max_active; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); - wq->singlethread = singlethread; - wq->freezeable = freezeable; - wq->rt = rt; INIT_LIST_HEAD(&wq->list); - if (singlethread) { - cwq = init_cpu_workqueue(wq, singlethread_cpu); - err = create_workqueue_thread(cwq, singlethread_cpu); - start_workqueue_thread(cwq, -1); - } else { - cpu_maps_update_begin(); - /* - * We must place this wq on list even if the code below fails. - * cpu_down(cpu) can remove cpu from cpu_populated_map before - * destroy_workqueue() takes the lock, in that case we leak - * cwq[cpu]->thread. - */ - spin_lock(&workqueue_lock); - list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); - /* - * We must initialize cwqs for each possible cpu even if we - * are going to call destroy_workqueue() finally. Otherwise - * cpu_up() can hit the uninitialized cwq once we drop the - * lock. - */ - for_each_possible_cpu(cpu) { - cwq = init_cpu_workqueue(wq, cpu); - if (err || !cpu_online(cpu)) - continue; - err = create_workqueue_thread(cwq, cpu); - start_workqueue_thread(cwq, cpu); - } - cpu_maps_update_done(); + if (alloc_cwqs(wq) < 0) + goto err; + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = get_gcwq(cpu); + + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); + cwq->gcwq = gcwq; + cwq->wq = wq; + cwq->flush_color = -1; + cwq->max_active = max_active; + INIT_LIST_HEAD(&cwq->delayed_works); } - if (err) { - destroy_workqueue(wq); - wq = NULL; + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); } - return wq; -} -EXPORT_SYMBOL_GPL(__create_workqueue_key); -static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) -{ /* - * Our caller is either destroy_workqueue() or CPU_POST_DEAD, - * cpu_add_remove_lock protects cwq->thread. + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. */ - if (cwq->thread == NULL) - return; + spin_lock(&workqueue_lock); - lock_map_acquire(&cwq->wq->lockdep_map); - lock_map_release(&cwq->wq->lockdep_map); + if (workqueue_freezing && wq->flags & WQ_FREEZEABLE) + for_each_cwq_cpu(cpu, wq) + get_cwq(cpu, wq)->max_active = 0; - flush_cpu_workqueue(cwq); - /* - * If the caller is CPU_POST_DEAD and cwq->worklist was not empty, - * a concurrent flush_workqueue() can insert a barrier after us. - * However, in that case run_workqueue() won't return and check - * kthread_should_stop() until it flushes all work_struct's. - * When ->worklist becomes empty it is safe to exit because no - * more work_structs can be queued on this cwq: flush_workqueue - * checks list_empty(), and a "normal" queue_work() can't use - * a dead CPU. - */ - trace_workqueue_destruction(cwq->thread); - kthread_stop(cwq->thread); - cwq->thread = NULL; + list_add(&wq->list, &workqueues); + + spin_unlock(&workqueue_lock); + + return wq; +err: + if (wq) { + free_cwqs(wq); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + kfree(wq); + } + return NULL; } +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); /** * destroy_workqueue - safely terminate a workqueue @@ -1086,72 +2856,520 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) */ void destroy_workqueue(struct workqueue_struct *wq) { - const struct cpumask *cpu_map = wq_cpu_map(wq); - int cpu; + unsigned int cpu; + + wq->flags |= WQ_DYING; + flush_workqueue(wq); - cpu_maps_update_begin(); + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. + */ spin_lock(&workqueue_lock); list_del(&wq->list); spin_unlock(&workqueue_lock); - for_each_cpu(cpu, cpu_map) - cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); - cpu_maps_update_done(); + /* sanity check */ + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); + } - free_percpu(wq->cpu_wq); + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + } + + free_cwqs(wq); kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_cwq_cpu(cpu, wq) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZEABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. + */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : WORK_CPU_NONE; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return false; + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve mix of short, long and very long running works making + * blocked draining impractical. + * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has went down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for CANCEL request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. + */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezeable cwqs. Don't declare + * completion while frozen. + */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } + } + + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once it finishes the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. + */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE is already matched. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. + */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct cpu_workqueue_struct *cwq; - struct workqueue_struct *wq; - int err = 0; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; action &= ~CPU_TASKS_FROZEN; switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d\n", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ case CPU_UP_PREPARE: - cpumask_set_cpu(cpu, cpu_populated_map); - } -undo: - list_for_each_entry(wq, &workqueues, list) { - cwq = per_cpu_ptr(wq->cpu_wq, cpu); - - switch (action) { - case CPU_UP_PREPARE: - err = create_workqueue_thread(cwq, cpu); - if (!err) - break; - printk(KERN_ERR "workqueue [%s] for %i failed\n", - wq->name, cpu); - action = CPU_UP_CANCELED; - err = -ENOMEM; - goto undo; - - case CPU_ONLINE: - start_workqueue_thread(cwq, cpu); - break; - - case CPU_UP_CANCELED: - start_workqueue_thread(cwq, -1); - case CPU_POST_DEAD: - cleanup_workqueue_thread(cwq); - break; + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; } } + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + switch (action) { - case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. + */ + gcwq->flags |= GCWQ_DISASSOCIATED; + break; + case CPU_POST_DEAD: - cpumask_clear_cpu(cpu, cpu_populated_map); + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; } - return notifier_from_errno(err); + spin_unlock_irqrestore(&gcwq->lock, flags); + + return notifier_from_errno(0); } #ifdef CONFIG_SMP @@ -1201,14 +3419,200 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) EXPORT_SYMBOL_GPL(work_on_cpu); #endif /* CONFIG_SMP */ -void __init init_workqueues(void) +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all + * freezeable workqueues will queue new works to their frozen_works + * list instead of gcwq->worklist. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void freeze_workqueues_begin(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (cwq && wq->flags & WQ_FREEZEABLE) + cwq->max_active = 0; + } + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezeable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezeable workqueues are still busy. %false if + * freezing is complete. + */ +bool freeze_workqueues_busy(void) +{ + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_gcwq_cpu(cpu) { + struct workqueue_struct *wq; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective gcwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void thaw_workqueues(void) { - alloc_cpumask_var(&cpu_populated_map, GFP_KERNEL); + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; - cpumask_copy(cpu_populated_map, cpu_online_mask); - singlethread_cpu = cpumask_first(cpu_possible_mask); - cpu_singlethread_map = cpumask_of(singlethread_cpu); - hotcpu_notifier(workqueue_cpu_callback, 0); - keventd_wq = create_workqueue("events"); - BUG_ON(!keventd_wq); + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZEABLE)) + continue; + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + wake_up_worker(gcwq); + + spin_unlock_irq(&gcwq->lock); + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + +static int __init init_workqueues(void) +{ + unsigned int cpu; + int i; + + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_init(&gcwq->lock); + INIT_LIST_HEAD(&gcwq->worklist); + gcwq->cpu = cpu; + gcwq->flags |= GCWQ_DISASSOCIATED; + + INIT_LIST_HEAD(&gcwq->idle_list); + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) + INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; + + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + + ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); + } + + /* create the initial worker */ + for_each_online_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + if (cpu != WORK_CPU_UNBOUND) + gcwq->flags &= ~GCWQ_DISASSOCIATED; + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq); + return 0; } +early_initcall(init_workqueues); diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 0000000..2d10fc9 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,9 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); |