From 41acab8851a0408c1d5ad6c21a07456f88b54d40 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 10 Mar 2010 23:37:45 -0300 Subject: sched: Implement group scheduler statistics in one struct Put all statistic fields of sched_entity in one struct, sched_statistics, and embed it into sched_entity. This change allows to memset the sched_statistics to 0 when needed (for instance when forking), avoiding bugs of non initialized fields. Signed-off-by: Lucas De Marchi Signed-off-by: Peter Zijlstra LKML-Reference: <1268275065-18542-1-git-send-email-lucas.de.marchi@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 54 ++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1753f..8cc863d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1127,36 +1127,8 @@ struct load_weight { unsigned long weight, inv_weight; }; -/* - * CFS stats for a schedulable entity (task, task-group etc) - * - * Current field usage histogram: - * - * 4 se->block_start - * 4 se->run_node - * 4 se->sleep_start - * 6 se->load.weight - */ -struct sched_entity { - struct load_weight load; /* for load-balancing */ - struct rb_node run_node; - struct list_head group_node; - unsigned int on_rq; - - u64 exec_start; - u64 sum_exec_runtime; - u64 vruntime; - u64 prev_sum_exec_runtime; - - u64 last_wakeup; - u64 avg_overlap; - - u64 nr_migrations; - - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS +struct sched_statistics { u64 wait_start; u64 wait_max; u64 wait_count; @@ -1188,6 +1160,30 @@ struct sched_entity { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; +}; +#endif + +struct sched_entity { + struct load_weight load; /* for load-balancing */ + struct rb_node run_node; + struct list_head group_node; + unsigned int on_rq; + + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; + + u64 last_wakeup; + u64 avg_overlap; + + u64 nr_migrations; + + u64 start_runtime; + u64 avg_wakeup; + +#ifdef CONFIG_SCHEDSTATS + struct sched_statistics statistics; #endif #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.1 From 39c0cbe2150cbd848a25ba6cdb271d1ad46818ad Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:17:13 +0100 Subject: sched: Rate-limit nohz Entering nohz code on every micro-idle is costing ~10% throughput for netperf TCP_RR when scheduling cross-cpu. Rate limiting entry fixes this, but raises ticks a bit. On my Q6600, an idle box goes from ~85 interrupts/sec to 128. The higher the context switch rate, the more nohz entry costs. With this patch and some cycle recovery patches in my tree, max cross cpu context switch rate is improved by ~16%, a large portion of which of which is this ratelimiting. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301003.6785.28.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8cc863d..13efe7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,11 +271,17 @@ extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); extern int get_nohz_load_balancer(void); +extern int nohz_ratelimit(int cpu); #else static inline int select_nohz_load_balancer(int cpu) { return 0; } + +static inline int nohz_ratelimit(int cpu) +{ + return 0; +} #endif /* -- cgit v1.1 From b42e0c41a422a212ddea0666d5a3a0e3c35206db Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:38 +0100 Subject: sched: Remove avg_wakeup Testing the load which led to this heuristic (nfs4 kbuild) shows that it has outlived it's usefullness. With intervening load balancing changes, I cannot see any difference with/without, so recover there fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301062.6785.29.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 13efe7d..70c560f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1185,9 +1185,6 @@ struct sched_entity { u64 nr_migrations; - u64 start_runtime; - u64 avg_wakeup; - #ifdef CONFIG_SCHEDSTATS struct sched_statistics statistics; #endif -- cgit v1.1 From e12f31d3e5d36328c7fbd0fce40a95e70b59152c Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 11 Mar 2010 17:15:51 +0100 Subject: sched: Remove avg_overlap Both avg_overlap and avg_wakeup had an inherent problem in that their accuracy was detrimentally affected by cross-cpu wakeups, this because we are missing the necessary call to update_curr(). This can't be fixed without increasing overhead in our already too fat fastpath. Additionally, with recent load balancing changes making us prefer to place tasks in an idle cache domain (which is good for compute bound loads), communicating tasks suffer when a sync wakeup, which would enable affine placement, is turned into a non-sync wakeup by SYNC_LESS. With one task on the runqueue, wake_affine() rejects the affine wakeup request, leaving the unfortunate where placed, taking frequent cache misses. Remove it, and recover some fastpath cycles. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1268301121.6785.30.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 70c560f..8604884 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1180,9 +1180,6 @@ struct sched_entity { u64 vruntime; u64 prev_sum_exec_runtime; - u64 last_wakeup; - u64 avg_overlap; - u64 nr_migrations; #ifdef CONFIG_SCHEDSTATS -- cgit v1.1 From 6a1bdc1b577ebcb65f6603c57f8347309bc4ab13 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 15 Mar 2010 10:10:23 +0100 Subject: sched: _cpu_down(): Don't play with current->cpus_allowed _cpu_down() changes the current task's affinity and then recovers it at the end. The problems are well known: we can't restore old_allowed if it was bound to the now-dead-cpu, and we can race with the userspace which can change cpu-affinity during unplug. _cpu_down() should not play with current->cpus_allowed at all. Instead, take_cpu_down() can migrate the caller of _cpu_down() after __cpu_disable() removes the dying cpu from cpu_online_mask. Signed-off-by: Oleg Nesterov Acked-by: Rafael J. Wysocki Signed-off-by: Peter Zijlstra LKML-Reference: <20100315091023.GA9148@redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 43c9451..8bea407 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1843,6 +1843,7 @@ extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU +extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); #else static inline void idle_task_exit(void) {} -- cgit v1.1 From 0017d735092844118bef006696a750a0e4ef6ebd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 18:34:10 +0100 Subject: sched: Fix TASK_WAKING vs fork deadlock Oleg noticed a few races with the TASK_WAKING usage on fork. - since TASK_WAKING is basically a spinlock, it should be IRQ safe - since we set TASK_WAKING (*) without holding rq->lock it could be there still is a rq->lock holder, thereby not actually providing full serialization. (*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING. Cure the second issue by not setting TASK_WAKING in sched_fork(), but only temporarily in wake_up_new_task() while calling select_task_rq(). Cure the first by holding rq->lock around the select_task_rq() call, this will disable IRQs, this however requires that we push down the rq->lock release into select_task_rq_fair()'s cgroup stuff. Because select_task_rq_fair() still needs to drop the rq->lock we cannot fully get rid of TASK_WAKING. Reported-by: Oleg Nesterov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index 8bea407..fb6c188 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1046,7 +1046,8 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + int (*select_task_rq)(struct rq *rq, struct task_struct *p, + int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); -- cgit v1.1 From 371fd7e7a56a5c136d31aa980011bd2f131c3ef5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 24 Mar 2010 16:38:48 +0100 Subject: sched: Add enqueue/dequeue flags In order to reduce the dependency on TASK_WAKING rework the enqueue interface to support a proper flags field. Replace the int wakeup, bool head arguments with an int flags argument and create the following flags: ENQUEUE_WAKEUP - the enqueue is a wakeup of a sleeping task, ENQUEUE_WAKING - the enqueue has relative vruntime due to having sched_class::task_waking() called, ENQUEUE_HEAD - the waking task should be places on the head of the priority queue (where appropriate). For symmetry also convert sched_class::dequeue() to a flags scheme. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index fb6c188..e3e900f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1032,12 +1032,17 @@ struct sched_domain; #define WF_SYNC 0x01 /* waker goes to sleep after wakup */ #define WF_FORK 0x02 /* child wakeup after fork */ +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_WAKING 2 +#define ENQUEUE_HEAD 4 + +#define DEQUEUE_SLEEP 1 + struct sched_class { const struct sched_class *next; - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup, - bool head); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep); + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); void (*yield_task) (struct rq *rq); void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); -- cgit v1.1 From 669c55e9f99b90e46eaa0f98a67ec53d46dc969a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Apr 2010 14:59:29 +0200 Subject: sched: Pre-compute cpumask_weight(sched_domain_span(sd)) Dave reported that his large SPARC machines spend lots of time in hweight64(), try and optimize some of those needless cpumask_weight() invocations (esp. with the large offstack cpumasks these are very expensive indeed). Reported-by: David Miller Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/sched.h') diff --git a/include/linux/sched.h b/include/linux/sched.h index e3e900f..dfea405 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -960,6 +960,7 @@ struct sched_domain { char *name; #endif + unsigned int span_weight; /* * Span of all CPUs in this domain. * -- cgit v1.1