Diffstat (limited to 'kernel')
 kernel/Kconfig.preempt    |  15
 kernel/exit.c             |   2
 kernel/module.c           |  15
 kernel/power/Kconfig      |   2
 kernel/power/snapshot.c   |  41
 kernel/relay.c            |   5
 kernel/sched.c            |  58
 kernel/sched_debug.c      |   1
 kernel/sched_fair.c       | 277
 kernel/time/ntp.c         |  23
 kernel/time/tick-sched.c  |   2
 kernel/time/timekeeping.c |   6
 12 files changed, 297 insertions(+), 150 deletions(-)
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70..9fdba03 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT
endchoice
+config PREEMPT_RCU
+ bool "Preemptible RCU"
+ depends on PREEMPT
+ default n
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible, if
+ this option is selected then read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+ Say N if you are unsure.
+
config RCU_TRACE
bool "Enable tracing for RCU - currently stats in debugfs"
+ depends on PREEMPT_RCU
select DEBUG_FS
default y
help
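
The PREEMPT_RCU help text above warns about code that assumes an RCU read-side critical section stays on one CPU. A minimal illustrative sketch, not taken from this patch (the per-CPU variable and function are hypothetical), of the pattern that becomes unsafe once readers are preemptible:

    #include <linux/rcupdate.h>
    #include <linux/percpu.h>
    #include <linux/smp.h>

    static DEFINE_PER_CPU(int, example_counter);    /* hypothetical per-CPU data */

    static int read_example_counter(void)
    {
            int val;

            rcu_read_lock();
            /*
             * Classic RCU implied preempt_disable() here, so the task could
             * not migrate and smp_processor_id() stayed valid for the whole
             * section.  With CONFIG_PREEMPT_RCU the reader may be preempted
             * and moved to another CPU, so per-CPU accesses like this need
             * an explicit preempt_disable()/preempt_enable() pair.
             */
            val = per_cpu(example_counter, smp_processor_id());
            rcu_read_unlock();

            return val;
    }
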
diff --git a/kernel/exit.c b/kernel/exit.c
index cd20bf0..53872bf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1378,7 +1378,7 @@ unlock_sig:
if (!retval && infop)
retval = put_user(0, &infop->si_errno);
if (!retval && infop)
- retval = put_user(why, &infop->si_code);
+ retval = put_user((short)why, &infop->si_code);
if (!retval && infop)
retval = put_user(exit_code, &infop->si_status);
if (!retval && infop)
diff --git a/kernel/module.c b/kernel/module.c
index be4807f..5d437bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2178,10 +2178,20 @@ sys_init_module(void __user *umod,
wake_up(&module_wq);
return ret;
}
+ if (ret > 0) {
+ printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+ "it should follow 0/-E convention\n"
+ KERN_WARNING "%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
+ dump_stack();
+ }
- /* Now it's a first class citizen! */
- mutex_lock(&module_mutex);
+ /* Now it's a first class citizen! Wake up anyone waiting for it. */
mod->state = MODULE_STATE_LIVE;
+ wake_up(&module_wq);
+
+ mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
unwind_remove_table(mod->unwind_info, 1);
@@ -2190,7 +2200,6 @@ sys_init_module(void __user *umod,
mod->init_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
- wake_up(&module_wq);
return 0;
}
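
The new warning in sys_init_module() fires when a module's init function returns a positive value; init routines are expected to return 0 on success or a negative errno. A minimal sketch of that 0/-E convention (module name and probe helper are invented for illustration):

    #include <linux/init.h>
    #include <linux/module.h>
    #include <linux/errno.h>

    static int __init example_init(void)
    {
            if (!some_hardware_present())   /* hypothetical probe helper */
                    return -ENODEV;         /* failure: negative errno    */
            return 0;                       /* success: exactly zero      */
    }

    static void __exit example_exit(void)
    {
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");
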
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 7983317..6233f3b 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
notification of APM "events" (e.g. battery status change).
In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/pm.txt> and the
+ and more information, read <file:Documentation/power/pm.txt> and the
Battery Powered Linux mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020c..5f91a07 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
* of @bm->cur_zone_bm are updated.
*/
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
void **addr, unsigned int *bit_nr)
{
struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = zone_bm->next;
- BUG_ON(!zone_bm);
+ if (!zone_bm)
+ return -EFAULT;
}
bm->cur.zone_bm = zone_bm;
}
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
pfn -= bb->start_pfn;
*bit_nr = pfn % BM_BITS_PER_CHUNK;
*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+ return 0;
}
static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
set_bit(bit, addr);
}
+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+ int error;
+
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ if (!error)
+ set_bit(bit, addr);
+ return error;
+}
+
static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
clear_bit(bit, addr);
}
@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
+ int error;
- memory_bm_find_bit(bm, pfn, &addr, &bit);
+ error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+ BUG_ON(error);
return test_bit(bit, addr);
}
@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
region->end_pfn << PAGE_SHIFT);
for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
- if (pfn_valid(pfn))
- memory_bm_set_bit(bm, pfn);
+ if (pfn_valid(pfn)) {
+ /*
+ * It is safe to ignore the result of
+ * mem_bm_set_bit_check() here, since we won't
+ * touch the PFNs for which the error is
+ * returned anyway.
+ */
+ mem_bm_set_bit_check(bm, pfn);
+ }
}
}
diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d..4c035a8 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1066,7 +1066,7 @@ static int subbuf_splice_actor(struct file *in,
unsigned int flags,
int *nonpad_ret)
{
- unsigned int pidx, poff, total_len, subbuf_pages, ret;
+ unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
struct rchan_buf *rbuf = in->private_data;
unsigned int subbuf_size = rbuf->chan->subbuf_size;
uint64_t pos = (uint64_t) *ppos;
@@ -1097,8 +1097,9 @@ static int subbuf_splice_actor(struct file *in,
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
+ nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
- for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+ for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
unsigned int cur_pos = read_start + total_len;
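
The splice actor fills spd.pages[], an array of PIPE_BUFFERS entries, so the loop bound above is now clamped with min_t() instead of iterating over every page of the sub-buffer. A standalone sketch of the same clamp pattern (array size and names invented):

    #include <linux/kernel.h>       /* min_t() */
    #include <linux/mm_types.h>     /* struct page */

    #define DST_SLOTS 16            /* stand-in for PIPE_BUFFERS */

    static unsigned int fill_page_slots(struct page **src, unsigned int src_pages,
                                        struct page **dst)
    {
            unsigned int i, n = min_t(unsigned int, src_pages, DST_SLOTS);

            /* never index dst[] past its DST_SLOTS entries */
            for (i = 0; i < n; i++)
                    dst[i] = src[i];

            return n;
    }
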
diff --git a/kernel/sched.c b/kernel/sched.c
index 52b9867..3f7c5eb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct cfs_rq {
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr;
+ struct sched_entity *curr, *next;
unsigned long nr_spread_over;
@@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
u64 tmp;
if (unlikely(!lw->inv_weight))
- lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+ lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);
tmp = (u64)delta_exec * weight;
/*
@@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
+ lw->inv_weight = 0;
}
static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
+ lw->inv_weight = 0;
}
/*
@@ -1394,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
s64 delta;
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (&p->se == cfs_rq_of(&p->se)->next)
+ return 1;
+
if (p->sched_class != &fair_sched_class)
return 0;
@@ -1853,10 +1861,11 @@ out_activate:
schedstat_inc(p, se.nr_wakeups_remote);
update_rq_clock(rq);
activate_task(rq, p, 1);
- check_preempt_curr(rq, p);
success = 1;
out_running:
+ check_preempt_curr(rq, p);
+
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
@@ -1890,6 +1899,8 @@ static void __sched_fork(struct task_struct *p)
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
+ p->se.last_wakeup = 0;
+ p->se.avg_overlap = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
@@ -4268,11 +4279,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
oldprio = p->prio;
on_rq = p->se.on_rq;
running = task_current(rq, p);
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
- }
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
@@ -4281,10 +4291,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
p->prio = prio;
+ if (running)
+ p->sched_class->set_curr_task(rq);
if (on_rq) {
- if (running)
- p->sched_class->set_curr_task(rq);
-
enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4581,19 +4590,17 @@ recheck:
update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
- if (on_rq) {
+ if (on_rq)
deactivate_task(rq, p, 0);
- if (running)
- p->sched_class->put_prev_task(rq, p);
- }
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
+ if (running)
+ p->sched_class->set_curr_task(rq);
if (on_rq) {
- if (running)
- p->sched_class->set_curr_task(rq);
-
activate_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5881,7 +5888,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
spin_unlock_irq(&rq->lock);
break;
- case CPU_DOWN_PREPARE:
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
/* Update our root-domain */
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
@@ -7617,11 +7625,10 @@ void sched_move_task(struct task_struct *tsk)
running = task_current(rq, tsk);
on_rq = tsk->se.on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, tsk, 0);
- if (unlikely(running))
- tsk->sched_class->put_prev_task(rq, tsk);
- }
+ if (unlikely(running))
+ tsk->sched_class->put_prev_task(rq, tsk);
set_task_rq(tsk, task_cpu(tsk));
@@ -7630,11 +7637,10 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_class->moved_group(tsk);
#endif
- if (on_rq) {
- if (unlikely(running))
- tsk->sched_class->set_curr_task(rq);
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ if (on_rq)
enqueue_task(rq, tsk, 0);
- }
task_rq_unlock(rq, &flags);
}
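
calc_delta_mine() computes delta_exec * weight / lw->weight using a cached fixed-point inverse (inv_weight is roughly 2^32 / weight), which is why the update_load_add()/update_load_sub() hunks above now zero inv_weight whenever the weight changes. A userspace sketch of the arithmetic, with constants mirroring the 64-bit kernel case and no overflow guard:

    #include <stdint.h>
    #include <stdio.h>

    #define WMULT_CONST     (1ULL << 32)
    #define WMULT_SHIFT     32

    struct load_weight {
            unsigned long weight;
            uint64_t inv_weight;            /* cached ~2^32 / weight, 0 = stale */
    };

    static uint64_t calc_delta(uint64_t delta_exec, unsigned long weight,
                               struct load_weight *lw)
    {
            if (!lw->inv_weight)
                    lw->inv_weight = (WMULT_CONST - lw->weight / 2) /
                                     (lw->weight + 1);

            /* delta_exec * weight / lw->weight, done as multiply + shift */
            return (delta_exec * weight * lw->inv_weight) >> WMULT_SHIFT;
    }

    int main(void)
    {
            struct load_weight cfs_load = { .weight = 3072, .inv_weight = 0 };

            /* a nice-0 entity (weight 1024) on a 3072-weight runqueue gets
             * roughly a third of a 6 ms period, i.e. about 2 ms */
            printf("%llu ns\n", (unsigned long long)
                   calc_delta(6000000ULL, 1024, &cfs_load));
            return 0;
    }
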
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24c..ef358ba 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.exec_start);
PN(se.vruntime);
PN(se.sum_exec_runtime);
+ PN(se.avg_overlap);
nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e2a5305..b85cac4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
/*
* SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
* This option delays the preemption effects of decoupled workloads
* and reduces their over-scheduling. Synchronous workloads will still
* have immediate wakeup/sleep latencies.
*/
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Maintain a cache of leftmost tree entries (it is frequently
* used):
*/
- if (leftmost)
+ if (leftmost) {
cfs_rq->rb_leftmost = &se->run_node;
+ /*
+ * maintain cfs_rq->min_vruntime to be a monotonic increasing
+ * value tracking the leftmost vruntime in the tree.
+ */
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+ }
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (cfs_rq->rb_leftmost == &se->run_node)
- cfs_rq->rb_leftmost = rb_next(&se->run_node);
+ if (cfs_rq->rb_leftmost == &se->run_node) {
+ struct rb_node *next_node;
+ struct sched_entity *next;
+
+ next_node = rb_next(&se->run_node);
+ cfs_rq->rb_leftmost = next_node;
+
+ if (next_node) {
+ next = rb_entry(next_node,
+ struct sched_entity, run_node);
+ cfs_rq->min_vruntime =
+ max_vruntime(cfs_rq->min_vruntime,
+ next->vruntime);
+ }
+ }
+
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
@@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
*/
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u64 slice = __sched_period(cfs_rq->nr_running);
-
- slice *= se->load.weight;
- do_div(slice, cfs_rq->load.weight);
-
- return slice;
+ return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+ se->load.weight, &cfs_rq->load);
}
/*
@@ -303,7 +322,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
unsigned long delta_exec)
{
unsigned long delta_exec_weighted;
- u64 vruntime;
schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
@@ -315,19 +333,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
&curr->load);
}
curr->vruntime += delta_exec_weighted;
-
- /*
- * maintain cfs_rq->min_vruntime to be a monotonic increasing
- * value tracking the leftmost vruntime in the tree.
- */
- if (first_fair(cfs_rq)) {
- vruntime = min_vruntime(curr->vruntime,
- __pick_next_entity(cfs_rq)->vruntime);
- } else
- vruntime = curr->vruntime;
-
- cfs_rq->min_vruntime =
- max_vruntime(cfs_rq->min_vruntime, vruntime);
}
static void update_curr(struct cfs_rq *cfs_rq)
@@ -493,7 +498,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime;
- vruntime = cfs_rq->min_vruntime;
+ if (first_fair(cfs_rq)) {
+ vruntime = min_vruntime(cfs_rq->min_vruntime,
+ __pick_next_entity(cfs_rq)->vruntime);
+ } else
+ vruntime = cfs_rq->min_vruntime;
if (sched_feat(TREE_AVG)) {
struct sched_entity *last = __pick_last_entity(cfs_rq);
@@ -515,8 +524,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
if (!initial) {
/* sleeps upto a single latency don't count. */
- if (sched_feat(NEW_FAIR_SLEEPERS))
- vruntime -= sysctl_sched_latency;
+ if (sched_feat(NEW_FAIR_SLEEPERS)) {
+ vruntime -= calc_delta_fair(sysctl_sched_latency,
+ &cfs_rq->load);
+ }
/* ensure we never gain time by being placed backwards. */
vruntime = max_vruntime(se->vruntime, vruntime);
@@ -545,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
account_entity_enqueue(cfs_rq, se);
}
+static void update_avg(u64 *avg, u64 sample)
+{
+ s64 diff = sample - *avg;
+ *avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ if (!se->last_wakeup)
+ return;
+
+ update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+ se->last_wakeup = 0;
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -555,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
update_stats_dequeue(cfs_rq, se);
if (sleep) {
+ update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
@@ -616,12 +643,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->prev_sum_exec_runtime = se->sum_exec_runtime;
}
+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ s64 diff, gran;
+
+ if (!cfs_rq->next)
+ return se;
+
+ diff = cfs_rq->next->vruntime - se->vruntime;
+ if (diff < 0)
+ return se;
+
+ gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+ if (diff > gran)
+ return se;
+
+ return cfs_rq->next;
+}
+
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = NULL;
if (first_fair(cfs_rq)) {
se = __pick_next_entity(cfs_rq);
+ se = pick_next(cfs_rq, se);
set_next_entity(cfs_rq, se);
}
@@ -949,96 +996,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
#endif
#ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+ struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+ int idx, unsigned long load, unsigned long this_load,
+ unsigned int imbalance)
{
- int cpu, this_cpu;
- struct rq *rq;
- struct sched_domain *sd, *this_sd = NULL;
- int new_cpu;
+ struct task_struct *curr = this_rq->curr;
+ unsigned long tl = this_load;
+ unsigned long tl_per_task;
+
+ if (!(this_sd->flags & SD_WAKE_AFFINE))
+ return 0;
+
+ /*
+ * If the currently running task will sleep within
+ * a reasonable amount of time then attract this newly
+ * woken task:
+ */
+ if (sync && curr->sched_class == &fair_sched_class) {
+ if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ return 1;
+ }
+
+ schedstat_inc(p, se.nr_wakeups_affine_attempts);
+ tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
+ */
+ if (sync)
+ tl -= current->se.load.weight;
+
+ if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+ 100*(tl + p->se.load.weight) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
+ schedstat_inc(p, se.nr_wakeups_affine);
- cpu = task_cpu(p);
- rq = task_rq(p);
- this_cpu = smp_processor_id();
- new_cpu = cpu;
+ return 1;
+ }
+ return 0;
+}
- if (cpu == this_cpu)
- goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+ struct sched_domain *sd, *this_sd = NULL;
+ int prev_cpu, this_cpu, new_cpu;
+ unsigned long load, this_load;
+ struct rq *rq, *this_rq;
+ unsigned int imbalance;
+ int idx;
+
+ prev_cpu = task_cpu(p);
+ rq = task_rq(p);
+ this_cpu = smp_processor_id();
+ this_rq = cpu_rq(this_cpu);
+ new_cpu = prev_cpu;
+ /*
+ * 'this_sd' is the first domain that both
+ * this_cpu and prev_cpu are present in:
+ */
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpu_isset(prev_cpu, sd->span)) {
this_sd = sd;
break;
}
}
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
- goto out_set_cpu;
+ goto out;
/*
* Check for affine wakeup and passive balancing possibilities.
*/
- if (this_sd) {
- int idx = this_sd->wake_idx;
- unsigned int imbalance;
- unsigned long load, this_load;
-
- imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
- load = source_load(cpu, idx);
- this_load = target_load(this_cpu, idx);
-
- new_cpu = this_cpu; /* Wake to this CPU if we can */
-
- if (this_sd->flags & SD_WAKE_AFFINE) {
- unsigned long tl = this_load;
- unsigned long tl_per_task;
-
- /*
- * Attract cache-cold tasks on sync wakeups:
- */
- if (sync && !task_hot(p, rq->clock, this_sd))
- goto out_set_cpu;
-
- schedstat_inc(p, se.nr_wakeups_affine_attempts);
- tl_per_task = cpu_avg_load_per_task(this_cpu);
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync)
- tl -= current->se.load.weight;
-
- if ((tl <= load &&
- tl + target_load(cpu, idx) <= tl_per_task) ||
- 100*(tl + p->se.load.weight) <= imbalance*load) {
- /*
- * This domain has SD_WAKE_AFFINE and
- * p is cache cold in this domain, and
- * there is no bad imbalance.
- */
- schedstat_inc(this_sd, ttwu_move_affine);
- schedstat_inc(p, se.nr_wakeups_affine);
- goto out_set_cpu;
- }
- }
+ if (!this_sd)
+ goto out;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- if (this_sd->flags & SD_WAKE_BALANCE) {
- if (imbalance*this_load <= 100*load) {
- schedstat_inc(this_sd, ttwu_move_balance);
- schedstat_inc(p, se.nr_wakeups_passive);
- goto out_set_cpu;
- }
+ idx = this_sd->wake_idx;
+
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+ load = source_load(prev_cpu, idx);
+ this_load = target_load(this_cpu, idx);
+
+ if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+ load, this_load, imbalance))
+ return this_cpu;
+
+ if (prev_cpu == this_cpu)
+ goto out;
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
+ schedstat_inc(p, se.nr_wakeups_passive);
+ return this_cpu;
}
}
- new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
return wake_idle(new_cpu, p);
}
#endif /* CONFIG_SMP */
@@ -1060,6 +1132,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
resched_task(curr);
return;
}
+
+ se->last_wakeup = se->sum_exec_runtime;
+ if (unlikely(se == pse))
+ return;
+
+ cfs_rq_of(pse)->next = pse;
+
/*
* Batch tasks do not preempt (their preemption is driven by
* the tick):
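
update_avg() introduced above is an exponentially weighted moving average with weight 1/8 per sample; the scheduler feeds it sum_exec_runtime - last_wakeup, so avg_overlap estimates how long a task keeps running after waking another, and wake_affine() compares that against sysctl_sched_migration_cost. A userspace sketch of the filter (sample values invented):

    #include <stdint.h>
    #include <stdio.h>

    /* avg += (sample - avg) / 8, relying on an arithmetic right shift for
     * the negative case, as the kernel code does */
    static void update_avg(uint64_t *avg, uint64_t sample)
    {
            int64_t diff = (int64_t)(sample - *avg);

            *avg += diff >> 3;
    }

    int main(void)
    {
            uint64_t avg_overlap = 0;
            uint64_t samples[] = { 800000, 900000, 100000, 850000 };    /* ns */
            unsigned int i;

            for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                    update_avg(&avg_overlap, samples[i]);
                    printf("sample %8llu ns -> avg_overlap %8llu ns\n",
                           (unsigned long long)samples[i],
                           (unsigned long long)avg_overlap);
            }
            return 0;
    }
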
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b591..5fd9b94 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
long time_freq; /* frequency offset (scaled ppm)*/
static long time_reftime; /* time at last adjustment (s) */
long time_adjust;
+static long ntp_tick_adj;
static void ntp_update_frequency(void)
{
u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
<< TICK_LENGTH_SHIFT;
- second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+ second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);
tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
freq_adj = shift_right(freq_adj, time_constant * 2 +
(SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+ u64 utemp64;
temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
if (time_offset < 0) {
- temp64 = -temp64;
- do_div(temp64, mtemp);
- freq_adj -= temp64;
+ utemp64 = -temp64;
+ do_div(utemp64, mtemp);
+ freq_adj -= utemp64;
} else {
- do_div(temp64, mtemp);
- freq_adj += temp64;
+ utemp64 = temp64;
+ do_div(utemp64, mtemp);
+ freq_adj += utemp64;
}
}
freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
notify_cmos_timer();
return(result);
}
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+ ntp_tick_adj = simple_strtol(str, NULL, 0);
+ return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
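
Two details in the ntp.c hunks are worth spelling out: do_div() divides an unsigned 64-bit value in place by a 32-bit divisor, which is why the negative time_offset path now negates into the unsigned temporary utemp64 before dividing, and the new __setup() hook makes the tick adjustment settable on the kernel command line as ntp_tick_adj=<value>. A sketch of the sign-handling pattern around do_div() (the helper name is invented):

    #include <linux/types.h>
    #include <asm/div64.h>          /* do_div() */

    static s64 signed_div_by_interval(s64 offset, u32 interval)
    {
            u64 tmp;

            if (offset < 0) {
                    tmp = -offset;          /* do_div() needs an unsigned dividend */
                    do_div(tmp, interval);
                    return -(s64)tmp;
            }

            tmp = offset;
            do_div(tmp, interval);
            return (s64)tmp;
    }
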
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2968298..686da82 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -640,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
if (ts->sched_timer.base)
hrtimer_cancel(&ts->sched_timer);
- ts->tick_stopped = 0;
+
ts->nohz_mode = NOHZ_MODE_INACTIVE;
}
#endif /* HIGH_RES_TIMERS */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb0..671af61 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,8 +187,7 @@ static void change_clocksource(void)
clock->error = 0;
clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock,
- (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
tick_clock_notify();
@@ -245,8 +244,7 @@ void __init timekeeping_init(void)
ntp_clear();
clock = clocksource_get_next();
- clocksource_calculate_interval(clock,
- (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
clock->cycle_last = clocksource_read(clock);
xtime.tv_sec = sec;