Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/acct.c | 20
-rw-r--r-- | kernel/cgroup.c | 2
-rw-r--r-- | kernel/cgroup_freezer.c | 21
-rw-r--r-- | kernel/compat.c | 25
-rw-r--r-- | kernel/cred.c | 2
-rw-r--r-- | kernel/groups.c | 6
-rw-r--r-- | kernel/hrtimer.c | 67
-rw-r--r-- | kernel/irq/handle.c | 3
-rw-r--r-- | kernel/irq/manage.c | 89
-rw-r--r-- | kernel/irq/proc.c | 60
-rw-r--r-- | kernel/pm_qos_params.c | 218
-rw-r--r-- | kernel/posix-cpu-timers.c | 298
-rw-r--r-- | kernel/power/Makefile | 3
-rw-r--r-- | kernel/power/block_io.c | 103
-rw-r--r-- | kernel/power/power.h | 27
-rw-r--r-- | kernel/power/snapshot.c | 145
-rw-r--r-- | kernel/power/swap.c | 333
-rw-r--r-- | kernel/power/user.c | 37
-rw-r--r-- | kernel/sys.c | 31
-rw-r--r-- | kernel/sysctl.c | 28
-rw-r--r-- | kernel/time.c | 11
-rw-r--r-- | kernel/time/clocksource.c | 48
-rw-r--r-- | kernel/time/ntp.c | 2
-rw-r--r-- | kernel/time/timekeeping.c | 35
-rw-r--r-- | kernel/timer.c | 137
-rw-r--r-- | kernel/workqueue.c | 36
26 files changed, 964 insertions, 823 deletions
diff --git a/kernel/acct.c b/kernel/acct.c index e4c0e1f..385b884 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -216,7 +216,6 @@ static int acct_on(char *name) { struct file *file; struct vfsmount *mnt; - int error; struct pid_namespace *ns; struct bsd_acct_struct *acct = NULL; @@ -244,13 +243,6 @@ static int acct_on(char *name) } } - error = security_acct(file); - if (error) { - kfree(acct); - filp_close(file, NULL); - return error; - } - spin_lock(&acct_lock); if (ns->bacct == NULL) { ns->bacct = acct; @@ -281,7 +273,7 @@ static int acct_on(char *name) */ SYSCALL_DEFINE1(acct, const char __user *, name) { - int error; + int error = 0; if (!capable(CAP_SYS_PACCT)) return -EPERM; @@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (acct == NULL) return 0; - error = security_acct(NULL); - if (!error) { - spin_lock(&acct_lock); - acct_file_reopen(acct, NULL, NULL); - spin_unlock(&acct_lock); - } + spin_lock(&acct_lock); + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); } + return error; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e9ec642..2917750 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3615,7 +3615,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * @ss: the subsystem to load * * This function should be called in a modular subsystem's initcall. If the - * subsytem is built as a module, it will be assigned a new subsys_id and set + * subsystem is built as a module, it will be assigned a new subsys_id and set * up for use. If the subsystem is built-in anyway, work is delegated to the * simpler cgroup_init_subsys. */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e5c0244..ce71ed5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys; /* Locks taken and their ordering * ------------------------------ - * css_set_lock * cgroup_mutex (AKA cgroup_lock) - * task->alloc_lock (AKA task_lock) * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) * task->sighand->siglock * * cgroup code forces css_set_lock to be taken before task->alloc_lock @@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys; * freezer_create(), freezer_destroy(): * cgroup_mutex [ by cgroup core ] * - * can_attach(): - * cgroup_mutex + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) * - * cgroup_frozen(): + * cgroup_freezing_or_frozen(): * task->alloc_lock (to get task's cgroup) * * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * task->alloc_lock (to get task's cgroup) * freezer->lock * sighand->siglock (if the cgroup is freezing) * * freezer_read(): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) * * freezer_write() (freeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock + * sighand->siglock (fake signal delivery inside freeze_task()) * * freezer_write() (unfreeze): * cgroup_mutex * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (to prevent races with freeze_task()) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) * sighand->siglock */ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, diff 
--git a/kernel/compat.c b/kernel/compat.c index 7f40e92..5adab05 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, { int ret; cpumask_var_t mask; - unsigned long *k; - unsigned int min_length = cpumask_size(); - - if (nr_cpu_ids <= BITS_PER_COMPAT_LONG) - min_length = sizeof(compat_ulong_t); - if (len < min_length) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(compat_ulong_t)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; ret = sched_getaffinity(pid, mask); - if (ret < 0) - goto out; + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); - k = cpumask_bits(mask); - ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); - if (ret == 0) - ret = min_length; - -out: + if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) + ret = -EFAULT; + else + ret = retlen; + } free_cpumask_var(mask); + return ret; } diff --git a/kernel/cred.c b/kernel/cred.c index 8f3672a..2c24870 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -522,8 +522,6 @@ int commit_creds(struct cred *new) #endif BUG_ON(atomic_read(&new->usage) < 1); - security_commit_creds(new, old); - get_cred(new); /* we will require a ref for the subj creds too */ /* dumpability changes */ diff --git a/kernel/groups.c b/kernel/groups.c index 2b45b2e..53b1916 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp) */ int set_groups(struct cred *new, struct group_info *group_info) { - int retval; - - retval = security_task_setgroups(group_info); - if (retval) - return retval; - put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 0086628..b9b134b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1749,35 +1749,15 @@ void __init hrtimers_init(void) } /** - * schedule_hrtimeout_range - sleep until timeout + * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * - * Make the current task sleep until the given expiry time has - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * The @delta argument gives the kernel the freedom to schedule the - * actual wakeup to a time that is both power and performance friendly. - * The kernel give the normal best effort behavior for "@expires+@delta", - * but may decide to fire the timer earlier, but no earlier than @expires. - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to - * pass before the routine returns. - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. 
- * - * Returns 0 when the timer has expired otherwise -EINTR + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, - const enum hrtimer_mode mode) +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); + hrtimer_init_on_stack(&t.timer, clock, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, return !t.task ? 0 : -EINTR; } + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel give the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires. + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 76d5a67..27e5c69 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; - if (!(action->flags & IRQF_DISABLED)) - local_irq_enable_in_hardirq(); - do { trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 704e488..3164ba7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) return 0; } +int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc->affinity_hint = m; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + #ifndef CONFIG_AUTO_IRQ_AFFINITY /* * Generic version of the affinity autoselector. 
@@ -757,16 +773,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (new->flags & IRQF_ONESHOT) desc->status |= IRQ_ONESHOT; - /* - * Force MSI interrupts to run with interrupts - * disabled. The multi vector cards can cause stack - * overflows due to nested interrupts when enough of - * them are directed to a core and fire at the same - * time. - */ - if (desc->msi_desc) - new->flags |= IRQF_DISABLED; - if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; @@ -916,6 +922,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) desc->chip->disable(irq); } +#ifdef CONFIG_SMP + /* make sure affinity_hint is cleaned up */ + if (WARN_ON_ONCE(desc->affinity_hint)) + desc->affinity_hint = NULL; +#endif + raw_spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -1027,7 +1039,6 @@ EXPORT_SYMBOL(free_irq); * Flags: * * IRQF_SHARED Interrupt is shared - * IRQF_DISABLED Disable local interrupts while processing * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy * IRQF_TRIGGER_* Specify active edge(s) or level * @@ -1041,25 +1052,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, int retval; /* - * handle_IRQ_event() always ignores IRQF_DISABLED except for - * the _first_ irqaction (sigh). That can cause oopsing, but - * the behavior is classified as "will not fix" so we need to - * start nudging drivers away from using that idiom. - */ - if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) == - (IRQF_SHARED|IRQF_DISABLED)) { - pr_warning( - "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n", - irq, devname); - } - -#ifdef CONFIG_LOCKDEP - /* - * Lockdep wants atomic interrupt handlers: - */ - irqflags |= IRQF_DISABLED; -#endif - /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing @@ -1120,3 +1112,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_threaded_irq); + +/** + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. + * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It selects either a + * hardirq or threaded handling method depending on the + * context. + * + * On failure, it returns a negative value. On success, + * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + */ +int request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + int ret; + + if (!desc) + return -EINVAL; + + if (desc->status & IRQ_NESTED_THREAD) { + ret = request_threaded_irq(irq, NULL, handler, + flags, name, dev_id); + return !ret ? IRQC_IS_NESTED : ret; + } + + ret = request_irq(irq, handler, flags, name, dev_id); + return !ret ? 
IRQC_IS_HARDIRQ : ret; +} +EXPORT_SYMBOL_GPL(request_any_context_irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 7a6eb04..09a2ee5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -32,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) return 0; } +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long)m->private); + unsigned long flags; + cpumask_var_t mask; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + seq_cpumask(m, mask); + seq_putc(m, '\n'); + free_cpumask_var(mask); + + return 0; +} + #ifndef is_affinity_mask_valid #define is_affinity_mask_valid(val) 1 #endif @@ -84,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file) return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); +} + static const struct file_operations irq_affinity_proc_fops = { .open = irq_affinity_proc_open, .read = seq_read, @@ -92,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = { .write = irq_affinity_proc_write, }; +static const struct file_operations irq_affinity_hint_proc_fops = { + .open = irq_affinity_hint_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -147,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = { .release = single_release, .write = default_affinity_write, }; + +static int irq_node_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "%d\n", desc->node); + return 0; +} + +static int irq_node_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_node_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_node_proc_fops = { + .open = irq_node_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int irq_spurious_proc_show(struct seq_file *m, void *v) @@ -231,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) /* create /proc/irq/<irq>/smp_affinity */ proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); + + /* create /proc/irq/<irq>/affinity_hint */ + proc_create_data("affinity_hint", 0400, desc->dir, + &irq_affinity_hint_proc_fops, (void *)(long)irq); + + proc_create_data("node", 0444, desc->dir, + &irq_node_proc_fops, (void *)(long)irq); #endif proc_create_data("spurious", 0444, desc->dir, diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 3db49b9..f42d3f7 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -2,7 +2,7 @@ * This module exposes the interface to kernel space for specifying * QoS dependencies. It provides infrastructure for registration of: * - * Dependents on a QoS value : register requirements + * Dependents on a QoS value : register requests * Watchers of QoS value : get notified when target QoS value changes * * This QoS design is best effort based. Dependents register their QoS needs. 
@@ -14,19 +14,21 @@ * timeout: usec <-- currently not used. * throughput: kbs (kilo byte / sec) * - * There are lists of pm_qos_objects each one wrapping requirements, notifiers + * There are lists of pm_qos_objects each one wrapping requests, notifiers * - * User mode requirements on a QOS parameter register themselves to the + * User mode requests on a QOS parameter register themselves to the * subsystem by opening the device node /dev/... and writing there request to * the node. As long as the process holds a file handle open to the node the * client continues to be accounted for. Upon file release the usermode - * requirement is removed and a new qos target is computed. This way when the - * requirement that the application has is cleaned up when closes the file + * request is removed and a new qos target is computed. This way when the + * request that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * * Mark Gross <mgross@linux.intel.com> */ +/*#define DEBUG*/ + #include <linux/pm_qos_params.h> #include <linux/sched.h> #include <linux/spinlock.h> @@ -42,25 +44,25 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to requirements or notifiers lists + * locking rule: all changes to requests or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct requirement_list { +struct pm_qos_request_list { struct list_head list; union { s32 value; s32 usec; s32 kbps; }; - char *name; + int pm_qos_class; }; static s32 max_compare(s32 v1, s32 v2); static s32 min_compare(s32 v1, s32 v2); struct pm_qos_object { - struct requirement_list requirements; + struct pm_qos_request_list requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; @@ -72,7 +74,7 @@ struct pm_qos_object { static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, + .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, @@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, + .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, @@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = { static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requirements = - {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)}, + .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, @@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2) } -static void update_target(int target) +static void update_target(int pm_qos_class) { s32 extreme_value; - struct requirement_list *node; + struct pm_qos_request_list *node; unsigned long flags; int call_notifier = 0; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[target]->default_value; + extreme_value = 
pm_qos_array[pm_qos_class]->default_value; list_for_each_entry(node, - &pm_qos_array[target]->requirements.list, list) { - extreme_value = pm_qos_array[target]->comparitor( + &pm_qos_array[pm_qos_class]->requests.list, list) { + extreme_value = pm_qos_array[pm_qos_class]->comparitor( extreme_value, node->value); } - if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { + if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != + extreme_value) { call_notifier = 1; - atomic_set(&pm_qos_array[target]->target_value, extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", target, - atomic_read(&pm_qos_array[target]->target_value)); + atomic_set(&pm_qos_array[pm_qos_class]->target_value, + extreme_value); + pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, + atomic_read(&pm_qos_array[pm_qos_class]->target_value)); } spin_unlock_irqrestore(&pm_qos_lock, flags); if (call_notifier) - blocking_notifier_call_chain(pm_qos_array[target]->notifiers, - (unsigned long) extreme_value, NULL); + blocking_notifier_call_chain( + pm_qos_array[pm_qos_class]->notifiers, + (unsigned long) extreme_value, NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor) } /** - * pm_qos_requirement - returns current system wide qos expectation + * pm_qos_request - returns current system wide qos expectation * @pm_qos_class: identification of which qos value is requested * * This function returns the current target value in an atomic manner. */ -int pm_qos_requirement(int pm_qos_class) +int pm_qos_request(int pm_qos_class) { return atomic_read(&pm_qos_array[pm_qos_class]->target_value); } -EXPORT_SYMBOL_GPL(pm_qos_requirement); +EXPORT_SYMBOL_GPL(pm_qos_request); /** - * pm_qos_add_requirement - inserts new qos request into the list + * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters. + * for the pm_qos_class of parameters, and returns the pm_qos_request list + * element as a handle for use in updating and removal. Call needs to save + * this handle for later use. 
*/ -int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) +struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) { - struct requirement_list *dep; + struct pm_qos_request_list *dep; unsigned long flags; - dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); + dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); if (dep) { if (value == PM_QOS_DEFAULT_VALUE) dep->value = pm_qos_array[pm_qos_class]->default_value; else dep->value = value; - dep->name = kstrdup(name, GFP_KERNEL); - if (!dep->name) - goto cleanup; + dep->pm_qos_class = pm_qos_class; spin_lock_irqsave(&pm_qos_lock, flags); list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requirements.list); + &pm_qos_array[pm_qos_class]->requests.list); spin_unlock_irqrestore(&pm_qos_lock, flags); update_target(pm_qos_class); - - return 0; } -cleanup: - kfree(dep); - return -ENOMEM; + return dep; } -EXPORT_SYMBOL_GPL(pm_qos_add_requirement); +EXPORT_SYMBOL_GPL(pm_qos_add_request); /** - * pm_qos_update_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_update_request - modifies an existing qos request + * @pm_qos_req : handle to list element holding a pm_qos request to use * @value: defines the qos request * - * Updates an existing qos requirement for the pm_qos_class of parameters along + * Updates an existing qos request for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the list then no change is made. + * Attempts are made to make this code callable on hot code paths. */ -int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, + s32 new_value) { unsigned long flags; - struct requirement_list *node; int pending_update = 0; + s32 temp; - spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - if (new_value == PM_QOS_DEFAULT_VALUE) - node->value = - pm_qos_array[pm_qos_class]->default_value; - else - node->value = new_value; + if (pm_qos_req) { /*guard against callers passing in null */ + spin_lock_irqsave(&pm_qos_lock, flags); + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->value) { pending_update = 1; - break; + pm_qos_req->value = temp; } + spin_unlock_irqrestore(&pm_qos_lock, flags); + if (pending_update) + update_target(pm_qos_req->pm_qos_class); } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); - - return 0; } -EXPORT_SYMBOL_GPL(pm_qos_update_requirement); +EXPORT_SYMBOL_GPL(pm_qos_update_request); /** - * pm_qos_remove_requirement - modifies an existing qos request - * @pm_qos_class: identifies which list of qos request to us - * @name: identifies the request + * pm_qos_remove_request - modifies an existing qos request + * @pm_qos_req: handle to request list element * - * Will remove named qos request from pm_qos_class list of parameters and - * recompute the current target value for the pm_qos_class. + * Will remove pm qos request from the list of requests and + * recompute the current target value for the pm_qos_class. Call this + * on slow code paths. 
*/ -void pm_qos_remove_requirement(int pm_qos_class, char *name) +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { unsigned long flags; - struct requirement_list *node; - int pending_update = 0; + int qos_class; + if (pm_qos_req == NULL) + return; + /* silent return to keep pcm code cleaner */ + + qos_class = pm_qos_req->pm_qos_class; spin_lock_irqsave(&pm_qos_lock, flags); - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requirements.list, list) { - if (strcmp(node->name, name) == 0) { - kfree(node->name); - list_del(&node->list); - kfree(node); - pending_update = 1; - break; - } - } + list_del(&pm_qos_req->list); + kfree(pm_qos_req); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_class); + update_target(qos_class); } -EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); +EXPORT_SYMBOL_GPL(pm_qos_remove_request); /** * pm_qos_add_notifier - sets notification entry for changes to target value @@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * will register the notifier into a notification chain that gets called * upon changes to the pm_qos_class target value. */ - int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { int retval; @@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); -#define PID_NAME_LEN 32 - static int pm_qos_power_open(struct inode *inode, struct file *filp) { - int ret; long pm_qos_class; - char name[PID_NAME_LEN]; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *)pm_qos_class; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - ret = pm_qos_add_requirement(pm_qos_class, name, - PM_QOS_DEFAULT_VALUE); - if (ret >= 0) + filp->private_data = (void *) pm_qos_add_request(pm_qos_class, + PM_QOS_DEFAULT_VALUE); + + if (filp->private_data) return 0; } return -EPERM; @@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { - int pm_qos_class; - char name[PID_NAME_LEN]; + struct pm_qos_request_list *req; - pm_qos_class = (long)filp->private_data; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_remove_requirement(pm_qos_class, name); + req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_remove_request(req); return 0; } + static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - int pm_qos_class; - char name[PID_NAME_LEN]; - - pm_qos_class = (long)filp->private_data; - if (count != sizeof(s32)) + int x; + char ascii_value[11]; + struct pm_qos_request_list *pm_qos_req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else if (count == 11) { /* len('0x12345678/0') */ + if (copy_from_user(ascii_value, buf, 11)) + return -EFAULT; + x = sscanf(ascii_value, "%x", &value); + if (x != 1) + return -EINVAL; + pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); + } else return -EINVAL; - if (copy_from_user(&value, buf, sizeof(s32))) - return -EFAULT; - snprintf(name, PID_NAME_LEN, "process_%d", current->pid); - pm_qos_update_requirement(pm_qos_class, name, value); - return sizeof(s32); + pm_qos_req = (struct pm_qos_request_list *)filp->private_data; + pm_qos_update_request(pm_qos_req, 
value); + + return count; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bc7704b..00bb252 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -11,19 +11,18 @@ #include <trace/events/timer.h> /* - * Called after updating RLIMIT_CPU to set timer expiration if necessary. + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. */ void update_rlimit_cpu(unsigned long rlim_new) { cputime_t cputime = secs_to_cputime(rlim_new); - struct signal_struct *const sig = current->signal; - if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || - cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { - spin_lock_irq(¤t->sighand->siglock); - set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); - spin_unlock_irq(¤t->sighand->siglock); - } + spin_lock_irq(¤t->sighand->siglock); + set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(¤t->sighand->siglock); } static int check_clock(const clockid_t which_clock) @@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) cputime_gt(expires, new_exp); } -static inline int expires_le(cputime_t expires, cputime_t new_exp) -{ - return !cputime_eq(expires, cputime_zero) && - cputime_le(expires, new_exp); -} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held - * for reading, and interrupts disabled. + * for reading, interrupts disabled and p->sighand->siglock taken. */ -static void arm_timer(struct k_itimer *timer, union cpu_time_count now) +static void arm_timer(struct k_itimer *timer) { struct task_struct *p = timer->it.cpu.task; struct list_head *head, *listpos; + struct task_cputime *cputime_expires; struct cpu_timer_list *const nt = &timer->it.cpu; struct cpu_timer_list *next; - unsigned long i; - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? - p->cpu_timers : p->signal->cpu_timers); + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } head += CPUCLOCK_WHICH(timer->it_clock); - BUG_ON(!irqs_disabled()); - spin_lock(&p->sighand->siglock); - listpos = head; - if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - list_for_each_entry(next, head, entry) { - if (next->expires.sched > nt->expires.sched) - break; - listpos = &next->entry; - } - } else { - list_for_each_entry(next, head, entry) { - if (cputime_gt(next->expires.cpu, nt->expires.cpu)) - break; - listpos = &next->entry; - } + list_for_each_entry(next, head, entry) { + if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + break; + listpos = &next->entry; } list_add(&nt->entry, listpos); if (listpos == head) { + union cpu_time_count *exp = &nt->expires; + /* - * We are the new earliest-expiring timer. - * If we are a thread timer, there can always - * be a process timer telling us to stop earlier. + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. 
*/ - if (CPUCLOCK_PERTHREAD(timer->it_clock)) { - union cpu_time_count *exp = &nt->expires; - - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_PROF: - if (expires_gt(p->cputime_expires.prof_exp, - exp->cpu)) - p->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_VIRT: - if (expires_gt(p->cputime_expires.virt_exp, - exp->cpu)) - p->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > exp->sched) - p->cputime_expires.sched_exp = - exp->sched; - break; - } - } else { - struct signal_struct *const sig = p->signal; - union cpu_time_count *exp = &timer->it.cpu.expires; - - /* - * For a process timer, set the cached expiration time. - */ - switch (CPUCLOCK_WHICH(timer->it_clock)) { - default: - BUG(); - case CPUCLOCK_VIRT: - if (expires_le(sig->it[CPUCLOCK_VIRT].expires, - exp->cpu)) - break; - sig->cputime_expires.virt_exp = exp->cpu; - break; - case CPUCLOCK_PROF: - if (expires_le(sig->it[CPUCLOCK_PROF].expires, - exp->cpu)) - break; - i = sig->rlim[RLIMIT_CPU].rlim_cur; - if (i != RLIM_INFINITY && - i <= cputime_to_secs(exp->cpu)) - break; - sig->cputime_expires.prof_exp = exp->cpu; - break; - case CPUCLOCK_SCHED: - sig->cputime_expires.sched_exp = exp->sched; - break; - } + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, exp->cpu)) + cputime_expires->prof_exp = exp->cpu; + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, exp->cpu)) + cputime_expires->virt_exp = exp->cpu; + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp->sched) + cputime_expires->sched_exp = exp->sched; + break; } } - - spin_unlock(&p->sighand->siglock); } /* @@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ static void cpu_timer_fire(struct k_itimer *timer) { - if (unlikely(timer->sigq == NULL)) { + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * User don't want any signal. + */ + timer->it.cpu.expires.sched = 0; + } else if (unlikely(timer->sigq == NULL)) { /* * This a special case for clock_nanosleep, * not a normal timer from sys_timer_create. @@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, struct itimerspec *new, struct itimerspec *old) { struct task_struct *p = timer->it.cpu.task; - union cpu_time_count old_expires, new_expires, val; + union cpu_time_count old_expires, new_expires, old_incr, val; int ret; if (unlikely(p == NULL)) { @@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, BUG_ON(!irqs_disabled()); ret = 0; + old_incr = timer->it.cpu.incr; spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { @@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, ret = TIMER_RETRY; } else list_del_init(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); /* * We need to sample the current value to convert the new @@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). 
*/ + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); goto out; } @@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, */ timer->it.cpu.expires = new_expires; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && cpu_time_before(timer->it_clock, val, new_expires)) { - arm_timer(timer, val); + arm_timer(timer); } + spin_unlock(&p->sighand->siglock); read_unlock(&tasklist_lock); /* @@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, timer->it_overrun = -1; if (new_expires.sched != 0 && - (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE && !cpu_time_before(timer->it_clock, val, new_expires)) { /* * The designated time already passed, so we notify @@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, out: if (old) { sample_to_timespec(timer->it_clock, - timer->it.cpu.incr, &old->it_interval); + old_incr, &old->it_interval); } return ret; } @@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) read_unlock(&tasklist_lock); } - if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { - if (timer->it.cpu.incr.sched == 0 && - cpu_time_before(timer->it_clock, - timer->it.cpu.expires, now)) { - /* - * Do-nothing timer expired and has no reload, - * so it's as if it was never set. - */ - timer->it.cpu.expires.sched = 0; - itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; - return; - } - /* - * Account for any expirations and reloads that should - * have happened. - */ - bump_cpu_timer(timer, now); - } - if (unlikely(clear_dead)) { /* * We've noticed that the thread is dead, but @@ -1066,16 +1002,9 @@ static void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - if (!cputimer->running) - return; - spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); - - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; } static u32 onecputick; @@ -1112,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, } } +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1129,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk, unsigned long soft; /* - * Don't sample the current process CPU clocks if there are no timers. - */ - if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && - sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && - list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && - list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(sig); - return; - } - - /* * Collect the current process totals. 
*/ thread_group_cputimer(tsk, &cputime); @@ -1230,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk, } } - if (!cputime_eq(prof_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) - sig->cputime_expires.prof_exp = prof_expires; - if (!cputime_eq(virt_expires, cputime_zero) && - (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) || - cputime_gt(sig->cputime_expires.virt_exp, virt_expires))) - sig->cputime_expires.virt_exp = virt_expires; - if (sched_expires != 0 && - (sig->cputime_expires.sched_exp == 0 || - sig->cputime_expires.sched_exp > sched_expires)) - sig->cputime_expires.sched_exp = sched_expires; + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); } /* @@ -1270,6 +1196,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) goto out; } read_lock(&tasklist_lock); /* arm_timer needs it. */ + spin_lock(&p->sighand->siglock); } else { read_lock(&tasklist_lock); if (unlikely(p->signal == NULL)) { @@ -1290,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) clear_dead_task(timer, now); goto out_unlock; } + spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); /* Leave the tasklist_lock locked for the call below. */ @@ -1298,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - arm_timer(timer, now); + BUG_ON(!irqs_disabled()); + arm_timer(timer); + spin_unlock(&p->sighand->siglock); out_unlock: read_unlock(&tasklist_lock); @@ -1310,23 +1240,6 @@ out: } /** - * task_cputime_zero - Check a task_cputime struct for all zero fields. - * - * @cputime: The struct to compare. - * - * Checks @cputime to see if all fields are zero. Returns true if all fields - * are zero, false if any field is nonzero. - */ -static inline int task_cputime_zero(const struct task_cputime *cputime) -{ - if (cputime_eq(cputime->utime, cputime_zero) && - cputime_eq(cputime->stime, cputime_zero) && - cputime->sum_exec_runtime == 0) - return 1; - return 0; -} - -/** * task_cputime_expired - Compare two task_cputime entities. * * @sample: The task_cputime structure to be checked for expiration. @@ -1382,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) } sig = tsk->signal; - if (!task_cputime_zero(&sig->cputime_expires)) { + if (sig->cputimer.running) { struct task_cputime group_sample; thread_group_cputimer(tsk, &group_sample); @@ -1390,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk) return 1; } - return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; + return 0; } /* @@ -1419,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk) * put them on the firing list. */ check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); /* * We must release these locks before taking any timer's lock. @@ -1456,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk) } /* - * Set one of the process-wide special case CPU timers. + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. 
* The tsk->sighand->siglock must be held by the caller. - * The *newval argument is relative and we update it to be absolute, *oldval - * is absolute and we update it to be relative. */ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, cputime_t *newval, cputime_t *oldval) { union cpu_time_count now; - struct list_head *head; BUG_ON(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ @@ -1483,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (cputime_eq(*newval, cputime_zero)) return; *newval = cputime_add(*newval, now.cpu); - - /* - * If the RLIMIT_CPU timer will expire before the - * ITIMER_PROF timer, we have nothing else to do. - */ - if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur - < cputime_to_secs(*newval)) - return; } /* - * Check whether there are any process timers already set to fire - * before this one. If so, we don't have anything more to do. + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. */ - head = &tsk->signal->cpu_timers[clock_idx]; - if (list_empty(head) || - cputime_ge(list_first_entry(head, - struct cpu_timer_list, entry)->expires.cpu, - *newval)) { - switch (clock_idx) { - case CPUCLOCK_PROF: + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) tsk->signal->cputime_expires.prof_exp = *newval; - break; - case CPUCLOCK_VIRT: + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) tsk->signal->cputime_expires.virt_exp = *newval; - break; - } + break; } } diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 4319181..524e058 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o -obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c new file mode 100644 index 0000000..97024fd --- /dev/null +++ b/kernel/power/block_io.c @@ -0,0 +1,103 @@ +/* + * This file provides functions for block I/O operations on swap/file. + * + * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> + * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> + * + * This file is released under the GPLv2. + */ + +#include <linux/bio.h> +#include <linux/kernel.h> +#include <linux/pagemap.h> +#include <linux/swap.h> + +#include "power.h" + +/** + * submit - submit BIO request. + * @rw: READ or WRITE. + * @off physical offset of page. + * @page: page we're reading or writing. + * @bio_chain: list of pending biod (for async reading) + * + * Straight from the textbook - allocate and initialize the bio. + * If we're reading, make sure the page is marked as dirty. + * Then submit it and, if @bio_chain == NULL, wait. 
+ */ +static int submit(int rw, struct block_device *bdev, sector_t sector, + struct page *page, struct bio **bio_chain) +{ + const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + struct bio *bio; + + bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = end_swap_bio_read; + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + printk(KERN_ERR "PM: Adding page to bio failed at %llu\n", + (unsigned long long)sector); + bio_put(bio); + return -EFAULT; + } + + lock_page(page); + bio_get(bio); + + if (bio_chain == NULL) { + submit_bio(bio_rw, bio); + wait_on_page_locked(page); + if (rw == READ) + bio_set_pages_dirty(bio); + bio_put(bio); + } else { + if (rw == READ) + get_page(page); /* These pages are freed later */ + bio->bi_private = *bio_chain; + *bio_chain = bio; + submit_bio(bio_rw, bio); + } + return 0; +} + +int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) +{ + return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9), + virt_to_page(addr), bio_chain); +} + +int hib_wait_on_bio_chain(struct bio **bio_chain) +{ + struct bio *bio; + struct bio *next_bio; + int ret = 0; + + if (bio_chain == NULL) + return 0; + + bio = *bio_chain; + if (bio == NULL) + return 0; + while (bio) { + struct page *page; + + next_bio = bio->bi_private; + page = bio->bi_io_vec[0].bv_page; + wait_on_page_locked(page); + if (!PageUptodate(page) || PageError(page)) + ret = -EIO; + put_page(page); + bio_put(bio); + bio = next_bio; + } + *bio_chain = NULL; + return ret; +} diff --git a/kernel/power/power.h b/kernel/power/power.h index 46c5a26..006270f 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void); */ struct snapshot_handle { - loff_t offset; /* number of the last byte ready for reading - * or writing in the sequence - */ unsigned int cur; /* number of the block of PAGE_SIZE bytes the * next operation will refer to (ie. 
current) */ - unsigned int cur_offset; /* offset with respect to the current - * block (for the next operation) - */ - unsigned int prev; /* number of the block of PAGE_SIZE bytes that - * was the current one previously - */ void *buffer; /* address of the block to read from * or write to */ - unsigned int buf_offset; /* location to read from or write to, - * given as a displacement from 'buffer' - */ int sync_read; /* Set to one to notify the caller of * snapshot_write_next() that it may * need to call wait_on_bio_chain() @@ -125,12 +113,12 @@ struct snapshot_handle { * snapshot_read_next()/snapshot_write_next() is allowed to * read/write data after the function returns */ -#define data_of(handle) ((handle).buffer + (handle).buf_offset) +#define data_of(handle) ((handle).buffer) extern unsigned int snapshot_additional_pages(struct zone *zone); extern unsigned long snapshot_get_image_size(void); -extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); -extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); extern void snapshot_write_finalize(struct snapshot_handle *handle); extern int snapshot_image_loaded(struct snapshot_handle *handle); @@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p); extern int swsusp_write(unsigned int flags); extern void swsusp_close(fmode_t); +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); + struct timeval; /* kernel/power/swsusp.c */ extern void swsusp_show_speed(struct timeval *, struct timeval *, diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index be861c2..25ce010 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to read from the snapshot. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to read up to the returned number of bytes from the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the read would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the end of data stream condition, * and a negative number is returned on error. In such cases the @@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) * any more. 
*/ -int snapshot_read_next(struct snapshot_handle *handle, size_t count) +int snapshot_read_next(struct snapshot_handle *handle) { if (handle->cur > nr_meta_pages + nr_copy_pages) return 0; @@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) if (!buffer) return -ENOMEM; } - if (!handle->offset) { + if (!handle->cur) { int error; error = init_header((struct swsusp_info *)buffer); @@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) handle->buffer = buffer; memory_bm_position_reset(&orig_bm); memory_bm_position_reset(©_bm); - } - if (handle->prev < handle->cur) { - if (handle->cur <= nr_meta_pages) { - memset(buffer, 0, PAGE_SIZE); - pack_pfns(buffer, &orig_bm); - } else { - struct page *page; + } else if (handle->cur <= nr_meta_pages) { + memset(buffer, 0, PAGE_SIZE); + pack_pfns(buffer, &orig_bm); + } else { + struct page *page; - page = pfn_to_page(memory_bm_next_pfn(©_bm)); - if (PageHighMem(page)) { - /* Highmem pages are copied to the buffer, - * because we can't return with a kmapped - * highmem page (we may not be called again). - */ - void *kaddr; + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). + */ + void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); - memcpy(buffer, kaddr, PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - handle->buffer = buffer; - } else { - handle->buffer = page_address(page); - } + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buffer, kaddr, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; - } else { - handle->cur_offset += count; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * snapshot_handle structure. The structure gets updated and a pointer * to it should be passed to this function every next time. * - * The @count parameter should contain the number of bytes the caller - * wants to write to the image. It must not be zero. - * * On success the function returns a positive number. Then, the caller * is allowed to write up to the returned number of bytes to the memory - * location computed by the data_of() macro. The number returned - * may be smaller than @count, but this only happens if the write would - * cross a page boundary otherwise. + * location computed by the data_of() macro. * * The function returns 0 to indicate the "end of file" condition, * and a negative number is returned on error. In such cases the @@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) * any more. 
*/ -int snapshot_write_next(struct snapshot_handle *handle, size_t count) +int snapshot_write_next(struct snapshot_handle *handle) { static struct chain_allocator ca; int error = 0; /* Check if we have already loaded the entire image */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) return 0; - if (handle->offset == 0) { + handle->sync_read = 1; + + if (!handle->cur) { if (!buffer) /* This makes the buffer be freed by swsusp_free() */ buffer = get_image_page(GFP_ATOMIC, PG_ANY); @@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return -ENOMEM; handle->buffer = buffer; - } - handle->sync_read = 1; - if (handle->prev < handle->cur) { - if (handle->prev == 0) { - error = load_header(buffer); - if (error) - return error; + } else if (handle->cur == 1) { + error = load_header(buffer); + if (error) + return error; - error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); - if (error) - return error; + error = memory_bm_create(©_bm, GFP_ATOMIC, PG_ANY); + if (error) + return error; + + } else if (handle->cur <= nr_meta_pages + 1) { + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; - } else if (handle->prev <= nr_meta_pages) { - error = unpack_orig_pfns(buffer, ©_bm); + if (handle->cur == nr_meta_pages + 1) { + error = prepare_image(&orig_bm, ©_bm); if (error) return error; - if (handle->prev == nr_meta_pages) { - error = prepare_image(&orig_bm, ©_bm); - if (error) - return error; - - chain_init(&ca, GFP_ATOMIC, PG_SAFE); - memory_bm_position_reset(&orig_bm); - restore_pblist = NULL; - handle->buffer = get_buffer(&orig_bm, &ca); - handle->sync_read = 0; - if (IS_ERR(handle->buffer)) - return PTR_ERR(handle->buffer); - } - } else { - copy_last_highmem_page(); + chain_init(&ca, GFP_ATOMIC, PG_SAFE); + memory_bm_position_reset(&orig_bm); + restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); + handle->sync_read = 0; if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); - if (handle->buffer != buffer) - handle->sync_read = 0; } - handle->prev = handle->cur; - } - handle->buf_offset = handle->cur_offset; - if (handle->cur_offset + count >= PAGE_SIZE) { - count = PAGE_SIZE - handle->cur_offset; - handle->cur_offset = 0; - handle->cur++; } else { - handle->cur_offset += count; + copy_last_highmem_page(); + handle->buffer = get_buffer(&orig_bm, &ca); + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); + if (handle->buffer != buffer) + handle->sync_read = 0; } - handle->offset += count; - return count; + handle->cur++; + return PAGE_SIZE; } /** @@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); /* Free only if we have loaded the image entirely */ - if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { + if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); free_highmem_data(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 66824d7..b0bb217 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -29,6 +29,40 @@ #define SWSUSP_SIG "S1SUSPEND" +/* + * The swap map is a data structure used for keeping track of each page + * written to a swap partition. It consists of many swap_map_page + * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are stored on the swap and linked together with the + * help of the .next_swap member. 
+ * + * The swap map is created during suspend. The swap map pages are + * allocated and populated one at a time, so we only need one memory + * page to set up the entire structure. + * + * During resume we also only need to use one swap_map_page structure + * at a time. + */ + +#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) + +struct swap_map_page { + sector_t entries[MAP_PAGE_ENTRIES]; + sector_t next_swap; +}; + +/** + * The swap_map_handle structure is used for handling swap in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + sector_t cur_swap; + sector_t first_sector; + unsigned int k; +}; + struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; sector_t image; @@ -145,110 +179,24 @@ int swsusp_swap_in_use(void) */ static unsigned short root_swap = 0xffff; -static struct block_device *resume_bdev; - -/** - * submit - submit BIO request. - * @rw: READ or WRITE. - * @off physical offset of page. - * @page: page we're reading or writing. - * @bio_chain: list of pending biod (for async reading) - * - * Straight from the textbook - allocate and initialize the bio. - * If we're reading, make sure the page is marked as dirty. - * Then submit it and, if @bio_chain == NULL, wait. - */ -static int submit(int rw, pgoff_t page_off, struct page *page, - struct bio **bio_chain) -{ - const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); - struct bio *bio; - - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); - bio->bi_sector = page_off * (PAGE_SIZE >> 9); - bio->bi_bdev = resume_bdev; - bio->bi_end_io = end_swap_bio_read; - - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", - page_off); - bio_put(bio); - return -EFAULT; - } - - lock_page(page); - bio_get(bio); - - if (bio_chain == NULL) { - submit_bio(bio_rw, bio); - wait_on_page_locked(page); - if (rw == READ) - bio_set_pages_dirty(bio); - bio_put(bio); - } else { - if (rw == READ) - get_page(page); /* These pages are freed later */ - bio->bi_private = *bio_chain; - *bio_chain = bio; - submit_bio(bio_rw, bio); - } - return 0; -} - -static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(READ, page_off, virt_to_page(addr), bio_chain); -} - -static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) -{ - return submit(WRITE, page_off, virt_to_page(addr), bio_chain); -} - -static int wait_on_bio_chain(struct bio **bio_chain) -{ - struct bio *bio; - struct bio *next_bio; - int ret = 0; - - if (bio_chain == NULL) - return 0; - - bio = *bio_chain; - if (bio == NULL) - return 0; - while (bio) { - struct page *page; - - next_bio = bio->bi_private; - page = bio->bi_io_vec[0].bv_page; - wait_on_page_locked(page); - if (!PageUptodate(page) || PageError(page)) - ret = -EIO; - put_page(page); - bio_put(bio); - bio = next_bio; - } - *bio_chain = NULL; - return ret; -} +struct block_device *hib_resume_bdev; /* * Saving part */ -static int mark_swapfiles(sector_t start, unsigned int flags) +static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; - bio_read_page(swsusp_resume_block, swsusp_header, NULL); + hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig,SWSUSP_SIG, 10); - swsusp_header->image = start; + 
swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { printk(KERN_ERR "PM: Swap header not found!\n"); @@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags) /** * swsusp_swap_check - check if the resume device is a swap device * and get its index (if so) + * + * This is called before saving image */ - -static int swsusp_swap_check(void) /* This is called before saving image */ +static int swsusp_swap_check(void) { int res; res = swap_type_of(swsusp_resume_device, swsusp_resume_block, - &resume_bdev); + &hib_resume_bdev); if (res < 0) return res; root_swap = res; - res = blkdev_get(resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE); if (res) return res; - res = set_blocksize(resume_bdev, PAGE_SIZE); + res = set_blocksize(hib_resume_bdev, PAGE_SIZE); if (res < 0) - blkdev_put(resume_bdev, FMODE_WRITE); + blkdev_put(hib_resume_bdev, FMODE_WRITE); return res; } @@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) } else { src = buf; } - return bio_write_page(offset, src, bio_chain); + return hib_bio_write_page(offset, src, bio_chain); } -/* - * The swap map is a data structure used for keeping track of each page - * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. - * These structures are stored on the swap and linked together with the - * help of the .next_swap member. - * - * The swap map is created during suspend. The swap map pages are - * allocated and populated one at a time, so we only need one memory - * page to set up the entire structure. - * - * During resume we also only need to use one swap_map_page structure - * at a time. 
- */ - -#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) - -struct swap_map_page { - sector_t entries[MAP_PAGE_ENTRIES]; - sector_t next_swap; -}; - -/** - * The swap_map_handle structure is used for handling swap in - * a file-alike way - */ - -struct swap_map_handle { - struct swap_map_page *cur; - sector_t cur_swap; - unsigned int k; -}; - static void release_swap_writer(struct swap_map_handle *handle) { if (handle->cur) @@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle) static int get_swap_writer(struct swap_map_handle *handle) { + int ret; + + ret = swsusp_swap_check(); + if (ret) { + if (ret != -ENOSPC) + printk(KERN_ERR "PM: Cannot find swap device, try " + "swapon -a.\n"); + return ret; + } handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); - if (!handle->cur) - return -ENOMEM; + if (!handle->cur) { + ret = -ENOMEM; + goto err_close; + } handle->cur_swap = alloc_swapdev_block(root_swap); if (!handle->cur_swap) { - release_swap_writer(handle); - return -ENOSPC; + ret = -ENOSPC; + goto err_rel; } handle->k = 0; + handle->first_sector = handle->cur_swap; return 0; +err_rel: + release_swap_writer(handle); +err_close: + swsusp_close(FMODE_WRITE); + return ret; } static int swap_write_page(struct swap_map_handle *handle, void *buf, @@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); if (error) goto out; offset = alloc_swapdev_block(root_swap); @@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle) return -EINVAL; } +static int swap_writer_finish(struct swap_map_handle *handle, + unsigned int flags, int error) +{ + if (!error) { + flush_swap_writer(handle); + printk(KERN_INFO "PM: S"); + error = mark_swapfiles(handle, flags); + printk("|\n"); + } + + if (error) + free_all_swap_pages(root_swap); + release_swap_writer(handle); + swsusp_close(FMODE_WRITE); + + return error; +} + /** * save_image - save the suspend image data */ @@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); while (1) { - ret = snapshot_read_next(snapshot, PAGE_SIZE); + ret = snapshot_read_next(snapshot); if (ret <= 0) break; ret = swap_write_page(handle, data_of(*snapshot), &bio); @@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle, printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!ret) ret = err2; @@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags) struct swap_map_handle handle; struct snapshot_handle snapshot; struct swsusp_info *header; + unsigned long pages; int error; - error = swsusp_swap_check(); + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); if (error) { - printk(KERN_ERR "PM: Cannot find swap device, try " - "swapon -a.\n"); + printk(KERN_ERR "PM: Cannot get swap writer\n"); return error; } + if (!enough_swap(pages)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_read_next(&snapshot, PAGE_SIZE); + error = snapshot_read_next(&snapshot); if (error < PAGE_SIZE) { if (error >= 0) error = -EFAULT; - goto out; + goto out_finish; } header = (struct swsusp_info 
*)data_of(snapshot); - if (!enough_swap(header->pages)) { - printk(KERN_ERR "PM: Not enough free swap\n"); - error = -ENOSPC; - goto out; - } - error = get_swap_writer(&handle); - if (!error) { - sector_t start = handle.cur_swap; - - error = swap_write_page(&handle, header, NULL); - if (!error) - error = save_image(&handle, &snapshot, - header->pages - 1); - - if (!error) { - flush_swap_writer(&handle); - printk(KERN_INFO "PM: S"); - error = mark_swapfiles(start, flags); - printk("|\n"); - } - } - if (error) - free_all_swap_pages(root_swap); - - release_swap_writer(&handle); - out: - swsusp_close(FMODE_WRITE); + error = swap_write_page(&handle, header, NULL); + if (!error) + error = save_image(&handle, &snapshot, pages - 1); +out_finish: + error = swap_writer_finish(&handle, flags, error); return error; } @@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle) handle->cur = NULL; } -static int get_swap_reader(struct swap_map_handle *handle, sector_t start) +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) { int error; - if (!start) + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); if (!handle->cur) return -ENOMEM; - error = bio_read_page(start, handle->cur, NULL); + error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); if (error) { release_swap_reader(handle); return error; @@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, offset = handle->cur->entries[handle->k]; if (!offset) return -EFAULT; - error = bio_read_page(offset, buf, bio_chain); + error = hib_bio_read_page(offset, buf, bio_chain); if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = wait_on_bio_chain(bio_chain); + error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; offset = handle->cur->next_swap; if (!offset) release_swap_reader(handle); else if (!error) - error = bio_read_page(offset, handle->cur, NULL); + error = hib_bio_read_page(offset, handle->cur, NULL); } return error; } +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + /** * load_image - load the image using the swap map handle * @handle and the snapshot handle @snapshot @@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot, PAGE_SIZE); + error = snapshot_write_next(snapshot); if (error <= 0) break; error = swap_read_page(handle, data_of(*snapshot), &bio); if (error) break; if (snapshot->sync_read) - error = wait_on_bio_chain(&bio); + error = hib_wait_on_bio_chain(&bio); if (error) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); nr_pages++; } - err2 = wait_on_bio_chain(&bio); + err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); if (!error) error = err2; @@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p) struct snapshot_handle snapshot; struct swsusp_info *header; - *flags_p = swsusp_header->flags; - memset(&snapshot, 0, sizeof(struct snapshot_handle)); - error = snapshot_write_next(&snapshot, PAGE_SIZE); + error = snapshot_write_next(&snapshot); if (error < PAGE_SIZE) return error < 0 ? 
error : -EFAULT; header = (struct swsusp_info *)data_of(snapshot); - error = get_swap_reader(&handle, swsusp_header->image); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; if (!error) error = swap_read_page(&handle, header, NULL); if (!error) error = load_image(&handle, &snapshot, header->pages - 1); - release_swap_reader(&handle); - + swap_reader_finish(&handle); +end: if (!error) pr_debug("PM: Image successfully loaded\n"); else @@ -686,11 +631,11 @@ int swsusp_check(void) { int error; - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); - if (!IS_ERR(resume_bdev)) { - set_blocksize(resume_bdev, PAGE_SIZE); + hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); memset(swsusp_header, 0, PAGE_SIZE); - error = bio_read_page(swsusp_resume_block, + error = hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL); if (error) goto put; @@ -698,7 +643,7 @@ int swsusp_check(void) if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); /* Reset swap signature now */ - error = bio_write_page(swsusp_resume_block, + error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { error = -EINVAL; @@ -706,11 +651,11 @@ int swsusp_check(void) put: if (error) - blkdev_put(resume_bdev, FMODE_READ); + blkdev_put(hib_resume_bdev, FMODE_READ); else pr_debug("PM: Signature found, resuming\n"); } else { - error = PTR_ERR(resume_bdev); + error = PTR_ERR(hib_resume_bdev); } if (error) @@ -725,12 +670,12 @@ put: void swsusp_close(fmode_t mode) { - if (IS_ERR(resume_bdev)) { + if (IS_ERR(hib_resume_bdev)) { pr_debug("PM: Image device not initialised\n"); return; } - blkdev_put(resume_bdev, mode); + blkdev_put(hib_resume_bdev, mode); } static int swsusp_header_init(void) diff --git a/kernel/power/user.c b/kernel/power/user.c index a8c9621..e819e17 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); @@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, res = -ENODATA; goto Unlock; } - res = snapshot_read_next(&data->handle, count); - if (res > 0) { - if (copy_to_user(buf, data_of(data->handle), res)) - res = -EFAULT; - else - *offp = data->handle.offset; + if (!pg_offp) { /* on page boundary? 
*/ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + Unlock: mutex_unlock(&pm_mutex); @@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, { struct snapshot_data *data; ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; mutex_lock(&pm_mutex); data = filp->private_data; - res = snapshot_write_next(&data->handle, count); - if (res > 0) { - if (copy_from_user(data_of(data->handle), buf, res)) - res = -EFAULT; - else - *offp = data->handle.offset; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; } + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: mutex_unlock(&pm_mutex); return res; diff --git a/kernel/sys.c b/kernel/sys.c index 7cb426a..0d36d88 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -492,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (rgid != (gid_t) -1) { if (old->gid == rgid || @@ -543,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = gid; @@ -610,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE); - if (retval) - goto error; - retval = -EPERM; if (ruid != (uid_t) -1) { new->uid = ruid; @@ -675,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) return -ENOMEM; old = current_cred(); - retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); - if (retval) - goto error; - retval = -EPERM; if (capable(CAP_SETUID)) { new->suid = new->uid = uid; @@ -719,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) if (!new) return -ENOMEM; - retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES); - if (retval) - goto error; old = current_cred(); retval = -EPERM; @@ -788,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) return -ENOMEM; old = current_cred(); - retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES); - if (retval) - goto error; - retval = -EPERM; if (!capable(CAP_SETGID)) { if (rgid != (gid_t) -1 && rgid != old->gid && @@ -851,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) old = current_cred(); old_fsuid = old->fsuid; - if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0) - goto error; - if (uid == old->uid || uid == old->euid || uid == old->suid || uid == old->fsuid || capable(CAP_SETUID)) { @@ -864,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) } } -error: abort_creds(new); return old_fsuid; @@ -888,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) old = current_cred(); old_fsgid = old->fsgid; - if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS)) - goto error; - if (gid == old->gid || gid == old->egid || gid == old->sgid || gid == old->fsgid || capable(CAP_SETGID)) { @@ -900,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) } } -error: abort_creds(new); return 
old_fsgid; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bcfb79e..b125830 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,6 +163,27 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_MAGIC_SYSRQ +static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + static struct ctl_table root_table[]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { @@ -567,7 +588,7 @@ static struct ctl_table kern_table[] = { .data = &__sysrq_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysrq_sysctl_handler, }, #endif #ifdef CONFIG_PROC_SYSCTL @@ -621,7 +642,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "userprocess_debug", - .data = &sysctl_userprocess_debug, + .data = &show_unhandled_signals, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -1431,7 +1452,8 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ + defined(CONFIG_S390) { .procname = "exception-trace", .data = &show_unhandled_signals, diff --git a/kernel/time.c b/kernel/time.c index 656dccf..50612fa 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -132,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, */ static inline void warp_clock(void) { - write_seqlock_irq(&xtime_lock); - wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; - xtime.tv_sec += sys_tz.tz_minuteswest * 60; - update_xtime_cache(0); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); + struct timespec delta, adjust; + delta.tv_sec = sys_tz.tz_minuteswest * 60; + delta.tv_nsec = 0; + adjust = timespec_add_safe(current_kernel_time(), delta); + do_settimeofday(&adjust); } /* diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1f5dde6..f08e99c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs) list_add(&cs->list, entry); } + +/* + * Maximum time we expect to go between ticks. This includes idle + * tickless time. It provides the trade off between selecting a + * mult/shift pair that is very precise but can only handle a short + * period of time, vs. a mult/shift pair that can handle long periods + * of time but isn't as precise. + * + * This is a subsystem constant, and actual hardware limitations + * may override it (ie: clocksources that wrap every 3 seconds). + */ +#define MAX_UPDATE_LENGTH 5 /* Seconds */ + +/** + * __clocksource_register_scale - Used to install new clocksources + * @t: clocksource to be registered + * @scale: Scale factor multiplied against freq to get clocksource hz + * @freq: clocksource frequency (cycles per second) divided by scale + * + * Returns -EBUSY if registration fails, zero otherwise. + * + * This *SHOULD NOT* be called directly! Please use the + * clocksource_register_hz() or clocksource_register_khz helper functions. 
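 *
 * As a sketch of how those helpers are expected to relate to this
 * function (illustrative only; check the real inline definitions):
 *
 *	static inline int clocksource_register_hz(struct clocksource *cs, u32 hz)
 *	{
 *		return __clocksource_register_scale(cs, 1, hz);
 *	}
 *
 *	static inline int clocksource_register_khz(struct clocksource *cs, u32 khz)
 *	{
 *		return __clocksource_register_scale(cs, 1000, khz);
 *	}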
+ */ +int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) +{ + + /* + * Ideally we want to use some of the limits used in + * clocksource_max_deferment, to provide a more informed + * MAX_UPDATE_LENGTH. But for now this just gets the + * register interface working properly. + */ + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC/scale, + MAX_UPDATE_LENGTH*scale); + cs->max_idle_ns = clocksource_max_deferment(cs); + + mutex_lock(&clocksource_mutex); + clocksource_enqueue(cs); + clocksource_select(); + clocksource_enqueue_watchdog(cs); + mutex_unlock(&clocksource_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(__clocksource_register_scale); + + /** * clocksource_register - Used to install new clocksources * @t: clocksource to be registered diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7c0f180..c631168 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -69,7 +69,7 @@ static s64 time_freq; /* time at last adjustment (secs): */ static long time_reftime; -long time_adjust; +static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 39f6177..caf8d4d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -165,13 +165,6 @@ struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -static struct timespec xtime_cache __attribute__ ((aligned (16))); -void update_xtime_cache(u64 nsec) -{ - xtime_cache = xtime; - timespec_add_ns(&xtime_cache, nsec); -} - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { @@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv) xtime = *tv; - update_xtime_cache(0); - timekeeper.ntp_error = 0; ntp_clear(); @@ -559,7 +550,6 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - update_xtime_cache(0); total_sleep_time.tv_sec = 0; total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); @@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev) wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); total_sleep_time = timespec_add_safe(total_sleep_time, ts); } - update_xtime_cache(0); /* re-base the last cycle value */ timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; @@ -788,7 +777,6 @@ void update_wall_time(void) { struct clocksource *clock; cycle_t offset; - u64 nsecs; int shift = 0, maxshift; /* Make sure we're fully resumed: */ @@ -847,7 +835,9 @@ void update_wall_time(void) timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; } - /* store full nanoseconds into xtime after rounding it up and + + /* + * Store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. 
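 *
 * Worked example with illustrative numbers only: with timekeeper.shift = 8
 * and timekeeper.xtime_nsec = 0x30080 in shifted units, tv_nsec becomes
 * (0x30080 >> 8) + 1 = 0x301 ns; the small over-count (0x301 << 8 minus
 * 0x30080 = 0x80 shifted units) is fed back into timekeeper.ntp_error
 * below, so nothing is lost to the rounding.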
*/ xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; @@ -855,8 +845,15 @@ void update_wall_time(void) timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; - nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); - update_xtime_cache(nsecs); + /* + * Finally, make sure that after the rounding + * xtime.tv_nsec isn't larger then NSEC_PER_SEC + */ + if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { + xtime.tv_nsec -= NSEC_PER_SEC; + xtime.tv_sec++; + second_overflow(); + } /* check to see if there is a new clocksource to use */ update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); @@ -896,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime_cache.tv_sec; + return xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime_cache; + return xtime; } struct timespec current_kernel_time(void) @@ -913,7 +910,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -928,7 +925,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime_cache; + now = xtime; mono = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/timer.c b/kernel/timer.c index aeb6a54..9199f3c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -319,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j) } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); +/** + * set_timer_slack - set the allowed slack for a timer + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead. 
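 *
 * A minimal usage sketch (illustrative only; my_timer is a hypothetical
 * struct timer_list and my_timer_fn its callback):
 *
 *	setup_timer(&my_timer, my_timer_fn, 0);
 *	set_timer_slack(&my_timer, HZ / 10);	(allow ~100 ms of rounding)
 *	mod_timer(&my_timer, jiffies + 5 * HZ);	(fires 5 to ~5.1 s from now)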
+ */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + static inline void set_running_timer(struct tvec_base *base, struct timer_list *timer) @@ -550,6 +568,7 @@ static void __init_timer(struct timer_list *timer, { timer->entry.next = NULL; timer->base = __raw_get_cpu_var(tvec_bases); + timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; timer->start_pid = -1; @@ -715,6 +734,41 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires) } EXPORT_SYMBOL(mod_timer_pending); +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + expires_limit = expires + timer->slack; + + if (timer->slack < 0) /* auto slack: use 0.4% */ + expires_limit = expires + (expires - jiffies)/256; + + mask = expires ^ expires_limit; + + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1 << bit) - 1; + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified @@ -745,6 +799,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires) if (timer_pending(timer) && timer->expires == expires) return 1; + expires = apply_slack(timer, expires); + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); } EXPORT_SYMBOL(mod_timer); @@ -955,6 +1011,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index) return index; } +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). + */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (preempt_count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. 
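 *
 * Illustration of the kind of callback this check catches (hypothetical
 * code, not from this patch; my_lock is likewise made up):
 *
 *	static void bad_timer_fn(unsigned long data)
 *	{
 *		spin_lock(&my_lock);
 *		return;		(missing spin_unlock() leaves the preempt
 *				 count elevated and triggers the warning)
 *	}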
+ */ + preempt_count() = preempt_count; + } +} + #define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) /** @@ -998,45 +1095,7 @@ static inline void __run_timers(struct tvec_base *base) detach_timer(timer, 1); spin_unlock_irq(&base->lock); - { - int preempt_count = preempt_count(); - -#ifdef CONFIG_LOCKDEP - /* - * It is permissible to free the timer from - * inside the function that is called from - * it, this we need to take into account for - * lockdep too. To avoid bogus "held lock - * freed" warnings as well as problems when - * looking into timer->lockdep_map, make a - * copy and use that here. - */ - struct lockdep_map lockdep_map = - timer->lockdep_map; -#endif - /* - * Couple the lock chain with the lock chain at - * del_timer_sync() by acquiring the lock_map - * around the fn() call here and in - * del_timer_sync(). - */ - lock_map_acquire(&lockdep_map); - - trace_timer_expire_entry(timer); - fn(data); - trace_timer_expire_exit(timer); - - lock_map_release(&lockdep_map); - - if (preempt_count != preempt_count()) { - printk(KERN_ERR "huh, entered %p " - "with preempt_count %08x, exited" - " with %08x?\n", - fn, preempt_count, - preempt_count()); - BUG(); - } - } + call_timer_fn(timer, fn, data); spin_lock_irq(&base->lock); } } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5bfb213..77dabbf 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work, atomic_long_set(&work->data, new); } +/* + * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued. + */ +static inline void clear_wq_data(struct work_struct *work) +{ + unsigned long flags = *work_data_bits(work) & + (1UL << WORK_STRUCT_STATIC); + atomic_long_set(&work->data, flags); +} + static inline struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) { @@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work, wait_on_work(work); } while (unlikely(ret < 0)); - work_clear_pending(work); + clear_wq_data(work); return ret; } @@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func) return 0; } +/** + * flush_scheduled_work - ensure that any scheduled work has run to completion. + * + * Forces execution of the kernel-global workqueue and blocks until its + * completion. + * + * Think twice before calling this function! It's very easy to get into + * trouble if you don't take great care. Either of the following situations + * will lead to deadlock: + * + * One of the work items currently on the workqueue needs to acquire + * a lock held by your code or its caller. + * + * Your code is running in the context of a work routine. + * + * They will be detected by lockdep when they occur, but the first might not + * occur very often. It depends on what work items are on the workqueue and + * what locks they need, which you have no control over. + * + * In most situations flushing the entire workqueue is overkill; you merely + * need to know that a particular work item isn't queued and isn't running. + * In such cases you should use cancel_delayed_work_sync() or + * cancel_work_sync() instead. + */ void flush_scheduled_work(void) { flush_workqueue(keventd_wq); |
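
As a hedged illustration of the guidance in the flush_scheduled_work() comment above (all names below are hypothetical, not part of the patch), a driver that only needs to synchronize against its own work item can cancel that item directly instead of flushing the whole kernel-global workqueue:

	#include <linux/workqueue.h>

	static void my_work_fn(struct work_struct *work)
	{
		/* deferred processing goes here */
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static void my_submit(void)
	{
		schedule_work(&my_work);	/* queue on the global workqueue */
	}

	static void my_teardown(void)
	{
		/*
		 * Waits for my_work_fn() if it is running and removes
		 * my_work if it is still queued; unlike
		 * flush_scheduled_work(), it does not wait on unrelated
		 * work items, so it cannot deadlock on locks they take.
		 */
		cancel_work_sync(&my_work);
	}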